def createName():
    mynames = ([(name, 'male') for name in names.words('male.txt')] +
               [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(mynames)
    firstname = str(mynames[0][0]).replace(' ', '')
    return firstname

def __init__(self, logger=None, loglevel=logging.INFO):
    if logger is None:
        self.logger = createLog(logname="subtitle", level=loglevel)
    else:
        self.logger = logger
    self.logger.info("\n-----------------")
    self.logger.info("Subtitle begin to init")
    self.logger.info("\n-----------------")
    self.files = []
    self.lexicon = set()
    self.stem_lexicon = set()
    self.noUsed = set(["-", "", "'", "“", "—", "”"])
    # self.suffix = set(["'s", "'d", "'ve"])
    self.newWords = None
    self.wordSet = None
    self.stem_newWords = None
    self.checkup = False
    self.punctuation = r".?\[\]!,\":%;()|^=+\/\\_`\*;.:><"
    # self.addPunctuation([',', '!', ';', '.', ':', '>', '<'])
    self.raw = ""
    self.lexicon_path = None
    self.nameSet = set(names.words('male.txt') + names.words('female.txt'))

def semanticClassify(self, s):
    """
    Semantically classify a segment: only verbs and nouns receive a sense
    label, so POS tagging must be done first.
    Input:  [('i', 'PRON'), ('love', 'VERB'), ('you', 'PRON')]
    Output: [('i', 'PRON', ' '), ('love', 'VERB', 'love.n.01'), ('you', 'PRON', ' ')]
    """
    # Build the lookup lists once, not once per token
    male_name = [w.lower() for w in names.words('male.txt')]
    female_name = [w.lower() for w in names.words('female.txt')]
    month = ['january', 'february', 'march', 'april', 'may', 'june',
             'july', 'august', 'september', 'october', 'november', 'december']
    classified_seg = []
    for seg in s:
        if seg[1] == 'NP':
            if seg[0] in male_name:
                classified_seg.append((seg[0], seg[1], 'male_name'))
            elif seg[0] in female_name:
                classified_seg.append((seg[0], seg[1], 'female_name'))
            elif seg[0] in month:
                classified_seg.append((seg[0], seg[1], 'month'))
            else:
                classified_seg.append((seg[0], seg[1], ' '))
        elif seg[1] in ('VERB', 'NOUN'):
            classified = wn.synsets(seg[0])
            if classified:
                classified_seg.append((seg[0], seg[1], classified[0].name()))
            else:
                classified_seg.append((seg[0], seg[1], ' '))
        else:
            classified_seg.append((seg[0], seg[1], ' '))
    return self.encodeutf8(classified_seg)

def demo():
    def gender_features(word):
        return {'last_letter': word[-1], 'penultimate_letter': word[-2]}
    from nltk.classify import accuracy
    from nltk.corpus import names
    import random
    names = ([(name, 'male') for name in names.words('male.txt')] +
             [(name, 'female') for name in names.words('female.txt')])
    random.seed(60221023)
    random.shuffle(names)
    featuresets = [(gender_features(n), g) for (n, g) in names]
    train_set, test_set = featuresets[500:], featuresets[:500]
    print '--- nltk.classify.svm demo ---'
    print 'Number of training examples:', len(train_set)
    classifier = SvmClassifier.train(train_set)
    print 'Total SVM dimensions:', len(classifier._svmfeatureindex)
    print 'Label mapping:', classifier._labelmapping
    print '--- Processing an example instance ---'
    print 'Reference instance:', names[0]
    print 'NLTK-format features:\n ' + str(test_set[0])
    print 'SVMlight-format features:\n ' + str(
        map_instance_to_svm(test_set[0], classifier._labelmapping,
                            classifier._svmfeatureindex))
    distr = classifier.prob_classify(test_set[0][0])
    print 'Instance classification and confidence:', distr.max(), distr.prob(distr.max())
    print '--- Measuring classifier performance ---'
    print 'Overall accuracy:', accuracy(classifier, test_set)

def main():
    from nltk.corpus import names
    names = ([(name, 'male') for name in names.words('male.txt')] +
             [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(names)
    train_names = names[1500:]
    devtest_names = names[500:1500]
    test_names = names[:500]
    train_set = [(gender_features(n), g) for (n, g) in train_names]
    devtest_set = [(gender_features(n), g) for (n, g) in devtest_names]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print classifier.classify(gender_features('Neo'))
    print classifier.classify(gender_features('Trinity'))
    print 'attila:', classifier.classify(gender_features('Attila'))
    print classifier.classify(gender_features('Bori'))
    print classifier.classify(gender_features('Gabi'))
    print 'andy:', classifier.classify(gender_features('Andy'))
    print 'dom:', classifier.classify(gender_features('Dom'))
    print 'monica:', classifier.classify(gender_features('Monica'))
    print 'donnie:', classifier.classify(gender_features('Donald'))
    print "accuracy:", nltk.classify.accuracy(classifier, devtest_set)
    # show_most_informative_features prints directly and returns None,
    # so it should not itself be wrapped in a print statement
    classifier.show_most_informative_features(5)
    errors = []
    for (name, tag) in devtest_names:
        guess = classifier.classify(gender_features(name))
        if guess != tag:
            errors.append((tag, guess, name))
    for (tag, guess, name) in sorted(errors):
        print 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)

def __calculateAgreement(self):
    if len(self.np) == 1:
        if self.np[0, 0] in names.words('male.txt'):
            self.gender = 'male'
        elif self.np[0, 0] in names.words('female.txt'):
            self.gender = 'female'
    if ({'NNS', 'NNPS'}.intersection({b for (a, b) in self.np.pos()})
            or {',', 'and'}.intersection(self.np.leaves())):
        self.number = {'plural'}
    else:
        self.number = {'singular'}
    if 'PRP' in self.np[0].label():
        if self.np[0, 0].lower() in {'they', 'them', 'themselves', 'their'}:
            self.number = {'plural'}
        elif self.np[0, 0].lower() in {'him', 'he', 'himself'}:
            self.gender = 'male'
            self.number = {'singular'}
        elif self.np[0, 0].lower() in {'her', 'herself', 'she'}:
            self.number = {'singular'}
            self.gender = 'female'
        elif self.np[0, 0].lower() in {'it', 'itself'}:
            self.number = {'singular'}
        elif self.np[0, 0].lower() in {'us', 'we', 'our', 'ourselves'}:
            self.number = {'plural'}
            self.person = 'first'
        # 'i' must be lowercase here: the token is compared after .lower(),
        # so the original 'I' could never match
        elif self.np[0, 0].lower() in {'i', 'me', 'my', 'myself'}:
            self.number = {'singular'}
            self.person = 'first'
        elif self.np[0, 0].lower() in {'yourself'}:
            self.number = {'singular'}
            self.person = 'second'
        elif self.np[0, 0].lower() in {'you', 'your'}:
            self.number = {'singular', 'plural'}
            self.person = 'second'
        elif self.np[0, 0].lower() in {'yourselves'}:
            self.number = {'plural'}
            self.person = 'second'

def new_naive_bayes_classifier():
    # Create feature set consisting of male and female names for training
    global CLASSIFIER_CACHE
    if CLASSIFIER_CACHE:
        return CLASSIFIER_CACHE
    else:
        male_word_seq = _new_training_set(
            'male', names.words('male.txt'), MALE_PRONOUN_SEQ)
        female_word_seq = _new_training_set(
            'female', names.words('female.txt'), FEMALE_PRONOUN_SEQ)
        neutral_pronoun_seq = _new_training_set(
            'neutral', NEUTRAL_PRONOUN_SEQ)
        excess_seq = _new_training_set(
            'excess',
            ABBREVIATION_SEQ,
            PREPOSITION_SEQ,
            string.punctuation,
            ('looking', 'is'),
        )
        featureset_seq = (
            (_gender_features(word), gender)
            for word, gender in chain(
                male_word_seq,
                female_word_seq,
                neutral_pronoun_seq,
                excess_seq,
            ))
        CLASSIFIER_CACHE = nltk.NaiveBayesClassifier.train(featureset_seq)
        return CLASSIFIER_CACHE

def initGenderClassifier():
    """Initialize gender classifier"""
    from nltk.corpus import names
    names = ([(name, 'male') for name in names.words('male.txt')] +
             [(name, 'female') for name in names.words('female.txt')])
    featuresets = [(gender_features(n), g) for (n, g) in names]
    return nltk.NaiveBayesClassifier.train(featuresets)

def gender(word):
    """Determine the gender of the given word by comparing it to the
    NLTK name corpora.

    Args:
        word (str):  Word (usually a name).

    Returns:
        str:  Male or Female

    .. note:: This method is very naive and not very useful,
        so it will be deprecated in the future.
    """
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])
    shuffle(labeled_names)
    featuresets = [(Classifier.gender_features(n), gender)
                   for (n, gender) in labeled_names]
    train_set = featuresets[500:]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    return classifier.classify(Classifier.gender_features(word))

def make_classifier():
    from nltk.corpus import names
    training_names = [(name, 'male') for name in names.words('male.txt')] + \
                     [(name, 'female') for name in names.words('female.txt')]
    feature_sets = [(name_features(name), gender)
                    for (name, gender) in training_names]
    classifier = nltk.NaiveBayesClassifier.train(feature_sets)
    return classifier

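# A minimal usage sketch for make_classifier() above. name_features() is
# referenced but not defined in this snippet, so a hypothetical last-letter
# extractor is assumed here purely for illustration.
def name_features(name):
    return {'last_letter': name[-1].lower()}

clf = make_classifier()
print(clf.classify(name_features('Neo')))      # e.g. 'male'
print(clf.classify(name_features('Trinity')))  # e.g. 'female'
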
def get_variations(s):
    base = s.split()
    variations = []
    name_set = set(names.words())  # cache once; rescanning the corpus view is slow
    for n in base:
        if n in name_set:
            variations.append(n)
    if s in name_set:
        variations.append(s)
    return variations

def create_featuresets(self):
    '''
    Create featuresets of (name, gender) based on the names corpora
    '''
    train_names = ([(name, 'male') for name in names.words('male.txt')] +
                   [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(train_names)
    return [(self.gender_features(n), g) for (n, g) in train_names]

def feature_nameList(word):
    if word in names.words('male.txt'):
        return 1
    elif word in names.words('female.txt'):
        return 1
    # A bare GeoText object is always truthy, so test its actual matches
    elif GeoText(word).cities or GeoText(word).countries:
        return 1
    else:
        return 0

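# Quick hedged check for feature_nameList(); GeoText is the third-party
# geotext package, and the exact outputs depend on its gazetteer and on
# the NLTK name lists.
print(feature_nameList('John'))    # expected 1: 'John' is in male.txt
print(feature_nameList('London'))  # expected 1 via the GeoText city match
print(feature_nameList('xyzzy'))   # expected 0
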
def identify_gender3():
    import random
    from nltk.corpus import names
    names = ([(name, 'male') for name in names.words('male.txt')] +
             [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(names)
    featuresets = [(gender_features3(n), g) for n, g in names]
    return classify(nltk.NaiveBayesClassifier, featuresets, 500)

def nltkTest():
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])
    import random
    random.shuffle(labeled_names)
    featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
    train_set, test_set = featuresets[500:], featuresets[:500]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    val = classifier.classify(gender_features('Neo'))
    print val

def __init__(self):
    super(Gender, self).__init__()
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(labeled_names)
    featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
    train_set = featuresets
    # print train_set[0]
    # Training data for the NaiveBayesClassifier
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)

def main():
    name = ([(n, 'male') for n in names.words('male.txt')] +
            [(n, 'female') for n in names.words('female.txt')])
    # random.shuffle shuffles in place and returns None,
    # so its result must not be assigned
    random.shuffle(name)
    featuresets = [(genderFeature(n.lower()), g) for (n, g) in name]
    train_set, test_set = featuresets[500:], featuresets[:500]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    string = raw_input("Enter a name: ")
    print classifier.classify(genderFeature(string))
    print nltk.classify.accuracy(classifier, test_set)

def classify(self, name):
    feats = self._nameFeatures(name)
    # print(name, feats)
    if name in names.words('male.txt'):
        return 'M'
    if name in names.words('female.txt'):
        return 'F'
    return self.classifier.classify(feats)

def feature_nameList(word):
    for name_m in names.words('male.txt'):
        if word[0].decode('unicode-escape') == name_m.decode('unicode-escape'):
            return 1
    for name_m in names.words('female.txt'):
        if word[0].decode('unicode-escape') == name_m.decode('unicode-escape'):
            return 1
    geo = GeoText(word[0])
    if len(geo.cities) > 0 or len(geo.countries) > 0:
        return 1
    else:
        return 0

def partial_names_demo(trainer, features=names_demo_features):
    from nltk.corpus import names
    import random

    # Corpus views are read-only sequences; copy them to lists so that
    # random.shuffle can reorder them in place
    male_names = list(names.words('male.txt'))
    female_names = list(names.words('female.txt'))

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples for training
    positive = map(features, male_names[:2000])

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = map(features, male_names[2000:2500] + female_names[:500])

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] \
         + [(name, False) for name in female_names[500:750]]
    random.shuffle(test)

    # Train up a classifier.
    print 'Training classifier...'
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print 'Testing classifier...'
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print 'Accuracy: %6.4f' % acc

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print 'Avg. log likelihood: %6.4f' % (sum(ll) / len(test))
        print
        print 'Unseen Names      P(Male)  P(Female)\n' + '-' * 40
        for ((name, is_male), pdist) in zip(test, pdists)[:5]:
            if is_male == True:
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print fmt % (name, pdist.prob(True), pdist.prob(False))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier

def __determine_gender__(article, sentence, token, start_index, end_index, entity_type):
    """WORKS"""
    if entity_type == "PER":
        if token in PRONOUN_LIST:
            if token in ["he", "his"]:
                return "male"
            elif token in ["she", "her"]:
                return "female"
        # Check "Mrs." before "Mr.": 'Mrs.X'.startswith("Mr.") is also True,
        # so the original order misclassified "Mrs." tokens as male
        elif token.startswith("Mrs.") or token.split("_")[0] in names.words("female.txt"):
            return "female"
        elif token.startswith("Mr.") or token.split("_")[0] in names.words("male.txt"):
            return "male"
    return "unknown"

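# Hedged usage sketch for __determine_gender__(): only token and entity_type
# drive the logic, so placeholders suffice for the other arguments.
# PRONOUN_LIST is a module-level global in the original source; a minimal
# stub is assumed here for a standalone run.
PRONOUN_LIST = ["he", "his", "she", "her", "it"]  # hypothetical stub

print(__determine_gender__(None, None, 'Mr._Smith', 0, 0, 'PER'))   # 'male'
print(__determine_gender__(None, None, 'Mrs._Smith', 0, 0, 'PER'))  # 'female'
print(__determine_gender__(None, None, 'Alice', 0, 0, 'PER'))       # 'female'
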
def getFeatures(self):
    maleNames = (name for name in names.words('male.txt'))
    femaleNames = (name for name in names.words('female.txt'))
    featureset = list()
    for name in maleNames:
        features = self._nameFeatures(name)
        featureset.append((features, 'M'))
    for name in femaleNames:
        features = self._nameFeatures(name)
        featureset.append((features, 'F'))
    return featureset

def __init__(self, corpus, outfile, tokens_dir, parses_dir, depparses_dir, train=False):
    self.relations = list()
    self.train = train
    self.corpus = corpus
    self.outfile = outfile
    self.tokenized_sents, self.tok_sents_pos = self.process_tokens_dir(tokens_dir)
    self.parses = self.process_parses_dir(parses_dir)
    self.depparses = self.process_dparses_dir(depparses_dir)
    self.clusterdict = self.make_cluster_dict('50mpaths2')
    self.pronouns = ["I", "me", "my", "mine", "myself",
                     "you", "your", "yours", "yourself",
                     "he", "him", "his", "himself",
                     "she", "her", "hers", "herself",
                     "it", "its", "itself",
                     "we", "us", "our", "ours", "ourselves",
                     "you", "your", "yours", "yourselves",
                     "they", "them", "their", "theirs", "themselves"]
    self.locations = set([c.lower() for c in gazetteers.words('countries.txt')] +
                         [s.lower() for s in gazetteers.words('usstates.txt')])
    self.names = set([name.lower() for name in names.words('male.txt')] +
                     [name.lower() for name in names.words('female.txt')])
    self.feat_fns = [self.words,                 # good
                     self.word_types,            # good
                     self.pronoun,               # good
                     self.name,                  # good
                     # self.place,               # look to get a better list
                     self.num_words_between,     # good
                     self.words_between_words,   # good
                     self.prev_word,             # good
                     # self.post_word,           # really bad feature
                     # self.prev_word_pos,       # bad
                     self.post_word_pos,         # good
                     self.first_word_after_w1,   # good
                     self.words_between_POSs,    # good
                     # self.last_word_before_w2,
                     self.w1clust,               # good
                     self.w2clust,               # good
                     self.tree_path,
                     # self.w1pref,              # bad
                     # self.w1suf,
                     # self.w2pref,
                     # self.w2suf,
                     # self.w1bow,
                     # self.w2bow,
                     self.et1dw1,
                     self.et2dw2,
                     self.h1dw1,
                     self.h2dw2]

def add_sample(self, sample):
    if not isinstance(sample, str):
        raise TypeError
    # Calling add_sample should replace the existing sample.
    # To avoid appending new values onto existing lists:
    self.sample = sample
    self.misspelled_words = []
    self.tokenized_sample = []
    self.tagged_sample = {}
    sample = sample.replace('\n', " ")
    sample = sample.rstrip(" ")
    for char in punctuation.replace("'", ""):
        sample = sample.replace(char, "")
    tokens = word_tokenize(sample)
    for word in tokens:
        if word.lower() in words.words():
            self.tokenized_sample.append(word)
        elif word.capitalize() in names.words():
            continue
        elif "'" in word:
            self.tokenized_sample.append(word)
        elif LEMMATIZER.lemmatize(word.lower()) not in words.words():
            if STEMMER.stem(word.lower()) not in words.words():
                self.misspelled_words.append(word)
        else:
            self.tokenized_sample.append(word)
    self.tagged_sample = pos_tag(tokens)

def posTagging(self, s):
    """
    POS-tag one segment.
    input:  ['i', 'love', 'you']
    output: [('i', 'PRON'), ('love', 'VERB'), ('you', 'PRON')]
    """
    brown_tagged_sents = brown.tagged_sents(
        tagset='universal', categories='news')
    default_tagger = nltk.DefaultTagger('NN')
    month = [u'january', u'february', u'march', u'april', u'may', u'june',
             u'july', u'august', u'september', u'october', u'november',
             u'december']
    np_words = [w.lower() for w in names.words()] + month
    np_tags = dict((word, 'NP') for word in np_words)
    np_tagger = nltk.UnigramTagger(model=np_tags, backoff=default_tagger)
    brown_unigram_tagger = nltk.UnigramTagger(
        brown_tagged_sents, backoff=np_tagger)
    brown_bigram_tagger = nltk.BigramTagger(
        brown_tagged_sents, backoff=brown_unigram_tagger)
    brown_trigram_tagger = nltk.TrigramTagger(
        brown_tagged_sents, backoff=brown_bigram_tagger)
    patterns = [(r'\bi\b', 'PRON')]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=brown_trigram_tagger)
    result = regexp_tagger.tag(s)
    return self.encodeutf8(result)

def get_hosts(year):
    '''Hosts is a list of one or more strings. Do NOT change the name of this
    function or what it returns.'''
    file_name = 'gg%s.json' % year
    with open(file_name, 'r') as data:
        db = json.load(data)
    events = get_pn_vec_from_range_for_hosts(db)  # was db2013, an undefined name
    lowered_names = set(x.lower() for x in names.words())  # build once, not per item
    hosts = []
    host = 0
    for item in events[0].most_common(100):
        skip = False
        # print item
        if host > 1:
            break
        for i in item[0]:
            if i in IGNORE_WORDS:
                skip = True
                break
        if skip:
            continue
        if item[0][0] in lowered_names or item[0][1] in lowered_names:
            hosts.append(' '.join(word for word in item[0]))
            host = host + 1
    return hosts

def pre_filter(iter):
    nameswords = set([word.lower() for word in names.words()])

    def replace(s):
        return ' '.join(['he' if x in nameswords else x for x in s.split()])

    for i, line in enumerate(iter):
        if (i % 100000 == 0) and str(mp.current_process().name.strip()) == "PoolWorker-1":
            out_error("Processed " + str(i * mp.cpu_count()) + " lines.", False)
        yield [replace(c) for c in line]

def naive_bayes_gender_classifier():
    from nltk.corpus import names
    names = ([(name, "male") for name in names.words("male.txt")] +
             [(name, "female") for name in names.words("female.txt")])
    random.shuffle(names)
    # featuresets = [(_gender_features(n), g) for (n, g) in names]
    # train_set, test_set = featuresets[500:], featuresets[:500]
    # advisable to stream the sets in for large data sets
    train_set = apply_features(_gender_features, names[500:])
    test_set = apply_features(_gender_features, names[:500])
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print "Neo is ", classifier.classify(_gender_features("Neo"))
    print "Trinity is", classifier.classify(_gender_features("Trinity"))
    # calculate the accuracy of the classifier
    print nltk.classify.accuracy(classifier, test_set)
    classifier.show_most_informative_features(5)

def initClassifier(self):
    # Set the nltk_data path so that we load the logisland embedded corpus
    nltk.data.path.insert(0, nltk_data_path)

    # Loading gender data
    print "Loading gender data..."
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(labeled_names)
    print "Loaded " + str(len(labeled_names)) + " samples"

    # Train classifier with data
    print "Training gender classifier..."
    featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
    self.classifier = nltk.classify.NaiveBayesClassifier.train(featuresets)
    print "Gender classifier trained"

def clean_test(docs):
    cleaned = []
    all_names = set(x.lower() for x in names.words())  # all names in the corpus
    lemmatizer = WordNetLemmatizer()
    for doc in docs:
        cleaned.append(' '.join(
            lemmatizer.lemmatize(word.lower())
            for word in doc.split()
            if letters_only(word) and word.lower() not in all_names))
    return cleaned

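# Minimal sketch exercising clean_test(); letters_only() is assumed to be a
# plain str.isalpha() wrapper, as in the other cleaning helpers collected here.
def letters_only(astr):
    return astr.isalpha()

docs = ["Alice loves NLP", "the cats were running"]
print(clean_test(docs))  # names and non-alphabetic tokens drop out; nouns are lemmatized
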
def get_gender_predictions(name_input):
    """ Get Prediction """
    try:
        train_data = ([(name, 'male') for name in names.words('male.txt')] +
                      [(name, 'female') for name in names.words('female.txt')])
        # shuffle in place; random.shuffle returns None
        random.shuffle(train_data)
        featuresets = [(gender_features(n), gender)
                       for (n, gender) in train_data]
        total_record = len(train_data)
        # 99% of the data for training, the remaining 1% for testing
        # (the original slicing trained on only the last 1%)
        split = int(total_record * .99)
        train_set, test_set = featuresets[:split], featuresets[split:]
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        prediction = classifier.classify(gender_features(name_input))
    except Exception:
        prediction = "NA"
    return prediction

def tag_speakers_list(self, speakers):
    matched_speakers = 0
    for value in speakers:
        regex = re.escape(value)
        results = re.findall(regex, self.abstract)
        for result in results:
            self.tags.append((result, "speaker"))
            matched_speakers += 1
    # todo improve this
    if matched_speakers == 0:
        tagged = self.pos
        l = len(tagged)
        name_set = set(names.words())  # cache; repeated corpus scans are slow
        for index, word in enumerate(tagged):
            if index < (l - 1):
                # todo make this better and use regex
                if word[1] == u'NP' and tagged[index + 1][1] == u'NP':
                    self.tags.append(
                        ((word[0] + " " + tagged[index + 1][0]), "speaker"))
                elif word[1] == u'NP' and tagged[index + 1][1] == "unseen":
                    if tagged[index + 1][0] in name_set:
                        self.tags.append(
                            ((word[0] + " " + tagged[index + 1][0]), "speaker"))
                elif word[1] == "unseen" and tagged[index + 1][1] == u'NP':
                    if word[0] in name_set:
                        self.tags.append(
                            ((word[0] + " " + tagged[index + 1][0]), "speaker"))
                elif word[1] == "unseen" and tagged[index + 1][1] == "unseen":
                    if word[0] in name_set and tagged[index + 1][0] in name_set:
                        self.tags.append(
                            ((word[0] + " " + tagged[index + 1][0]), "speaker"))

def __init__(self): """initiates with the data and all available NB classifier in sklearn""" self.data = [(w.strip(), "M") for w in names.words("male.txt") ] + [(w.strip(), "F") for w in names.words("female.txt")] self.clf1 = BernoulliNB() self.clf2 = CategoricalNB() self.clf3 = ComplementNB() self.clf4 = GaussianNB() self.clf5 = MultinomialNB() self.clf6 = DecisionTreeClassifier() self.clf7 = ExtraTreeClassifier() self.clf8 = SVC() self.clfList = [ self.clf1, self.clf2, self.clf3, self.clf4, self.clf5, self.clf6, self.clf7, self.clf8 ] self.FeatureFuncList = [ self.F01, self.F02, self.F03, self.F04, self.F05, self.F06 ]
def get_people(tweets):
    men = dict()
    women = dict()
    # Load the name lists once instead of once per chunk
    male_names = set(names.words('male.txt'))
    female_names = set(names.words('female.txt'))
    for tweet in tweets:
        tagged_words = nltk.pos_tag(nltk.word_tokenize(tweet))
        for chunk in nltk.ne_chunk(tagged_words):
            if type(chunk) == nltk.tree.Tree:
                # Adele needs a last name, but other than her it's fine
                # to look for first and last
                if chunk.label() == 'PERSON' and len(chunk) == 2:
                    name = ' '.join([c[0] for c in chunk])
                    first = name.split(' ', 1)[0]
                    if first in male_names:
                        men[name] = men.get(name, 0) + 1
                    if first in female_names:
                        women[name] = women.get(name, 0) + 1
    return men, women

def cleaned_text(email):
    all_names = set(x.lower() for x in names.words())
    lemmatizer = WordNetLemmatizer()
    cleaned = []
    for messages in email:
        cleaned.append(' '.join(
            lemmatizer.lemmatize(word.lower())
            for word in messages.split()
            if letters_only(word) and word.lower() not in all_names))
    return cleaned

def gender_identify(self, word, isPrint):
    # featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
    # train_set, test_set = featuresets[500:], featuresets[:500]
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(labeled_names)
    train_set = apply_features(self.gender_features, labeled_names[500:])
    test_set = apply_features(self.gender_features, labeled_names[:500])
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    if isPrint:
        print("gender recognise accuracy is " +
              str(nltk.classify.accuracy(classifier, test_set)))
    return classifier.classify(self.gender_features(word))

def clean_text(self, docs):
    all_names = set(names.words())
    lemmatizer = WordNetLemmatizer()
    cleaned_docs = []
    for doc in docs:
        cleaned_docs.append(' '.join([
            lemmatizer.lemmatize(word.lower())
            for word in doc.split()
            if self.letters_only(word) and word not in all_names
        ]))
    return cleaned_docs

def gender_Classfier():
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(labeled_names)
    featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
    # test_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
    train_set = featuresets[:]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    # print(nltk.classify.accuracy(classifier, test_set))
    # print(classifier.classify(gender_features('Neo')))
    return classifier

def __init__(self):
    self.train_path = "../data/train"
    self.dev_path = "../data/dev"
    self.beta = 0
    self.max_iter = 0
    # self.classifier = None
    self.dict_classifiers = {}
    self.locations = set(gazetteers.words())
    self.names = set(names.words())
    self.pos = None
    self.previous_labels = None

def glossary_filter(g):
    if g.entry_list == []:
        return False
    if g.freq == 0 and (len(g.lemmas[0]) < 4 or
                        (g.pos_list == ['noun'] and g.lemmas[0] in names.words())):
        return False
    for pos in pos_ignore_list:
        if pos in g.pos_list:
            return False
    for pos in g.tokens_pos:
        if 'NNP' in pos:
            return False
    return True

def clean_text(docs):
    all_names = set(names.words())
    cl_doc = []
    lemm = WordNetLemmatizer()
    for doc in docs:
        cl_doc.append(" ".join([
            lemm.lemmatize(word.lower())
            for word in doc.split()
            if letter_only(word) and word not in all_names
        ]))
    return cl_doc

def clean_text(docs):
    all_names = set(names.words())
    lemmatizer = WordNetLemmatizer()
    cleaned_docs = []
    for doc in docs:
        cleaned_docs.append(' '.join([
            lemmatizer.lemmatize(word.lower())
            for word in doc.split()
            if word.isalpha() and word not in all_names
        ]))
    return cleaned_docs

def preprocessing(data):
    all_names = set(names.words())
    lemmatizer = WordNetLemmatizer()
    data_cleaned = []
    for doc in data:
        doc_cleaned = ' '.join(
            lemmatizer.lemmatize(word)
            for word in doc.split()
            if is_letter(word) and word not in all_names)
        data_cleaned.append(doc_cleaned)
    print('preprocessing complete')
    return data_cleaned

def common_unigrams():
    '''
    Gets the unique words across several corpora as a set.

    :returns: The unique words.
    '''
    sw = set(lower_all(brown.words()))
    sw = sw.union(set(lower_all(names.words())))
    sw = sw.union(set(lower_all(words.words())))
    sw = sw.union(set(lower_all(reuters.words())))
    return sw

def clean_text(docs):
    cleaned_docs: list = []
    all_names = set(names.words())
    lemmatizer = WordNetLemmatizer()
    for doc in docs:
        cleaned_docs.append(" ".join([
            lemmatizer.lemmatize(word.lower())
            for word in doc.split()
            if letters_only(word) and word not in all_names
        ]))
    return cleaned_docs

class EmailAddress(object):
    word_list = words.words(fileids=['en'])
    name_list = names.words(fileids=['female.txt'])
    # Note: urls and emails are class-level lists, shared by all instances
    urls = []
    emails = []

    def __init__(self, address_count):
        for n in range((address_count // 3) + 1):
            self.urls.append(self.gen_url())
        for n in range(address_count):
            self.emails.append("@".join(
                [self.gen_username(), random.choice(self.urls)]))

    def __str__(self):
        return "\n".join(self.emails)

    def __iter__(self):
        for email in self.emails:
            yield email

    def sample_emails(self, count):
        return random.sample(self.emails, count)

    def weighted_choice(self, choices):
        """simple weighted selection from 'dict[k] = v' where v is int"""
        weight_total = sum(choices.values())
        rand_val = random.uniform(0, weight_total)
        test_val = 0
        for k in choices:
            if test_val + choices[k] >= rand_val:
                return k
            test_val += choices[k]

    def gen_url(self):
        """generates a single, random URL with dns.tld structure"""
        return ".".join([self.gen_dns(), self.gen_tld()])

    def gen_dns(self):
        """generates a single, random word to create a fake dns entry"""
        return random.choice(self.word_list).lower()

    def gen_tld(self):
        # original tlds = ['com', 'org', 'net', 'int', 'edu', 'gov', 'mil']
        tlds = {'com': 5, 'net': 2, 'org': 1}
        return self.weighted_choice(tlds)

    def gen_username(self):
        return random.choice(self.name_list).lower()

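# Brief usage sketch for EmailAddress: generate ten fake addresses, then
# draw a sample of three (assumes the nltk words/names corpora are installed).
gen = EmailAddress(10)
print(gen.sample_emails(3))
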
def clean_text(docs):
    Lemmatizer = WordNetLemmatizer()
    all_words = set(names.words())
    cleaned_data = []
    for doc in docs:
        cleaned_data.append(' '.join([
            Lemmatizer.lemmatize(word.lower())
            for word in doc.split()
            if letter_only(word) and word not in all_words
        ]))
    return cleaned_data

def frequency(self, text):
    sent = self.tokenize(text)
    string = ""
    # Build the exclusion set and the stemmer/lemmatizer once, not per token
    excluded = set(stopwords.words('english') + names.words() + Li + CC + Li1)
    las = LancasterStemmer()
    lemma = nltk.wordnet.WordNetLemmatizer()
    for i in sent:
        if i not in excluded:
            temp = las.stem(i)
            # NB: as in the original, the lemmatized form is computed but
            # discarded; the stemmed form is what gets emitted
            lemma.lemmatize(temp)
            string += str(temp + " ")
    return string

def _predict_gender_init():
    try:
        male_pth = os.path.abspath('male_nms.txt')
        female_pth = os.path.abspath('female_nms.txt')
    except OSError:
        male_pth = os.path.abspath('male.txt')
        female_pth = os.path.abspath('female.txt')
    labeled_names = ([(name.lower(), 'male') for name in names.words(male_pth)] +
                     [(name.lower(), 'female') for name in names.words(female_pth)])
    random.shuffle(labeled_names)
    # we use the feature extractor to process the names data
    train_set = [(gender_features_2(n), gender) for (n, gender) in labeled_names]
    # the training set is used to train a new naive Bayes classifier
    return nltk.NaiveBayesClassifier.train(train_set)

def main():
    # Build the labeled name list
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(labeled_names)

    # Name split 1 (superseded by split 2 below)
    featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
    train_set, test_set = featuresets[500:], featuresets[:500]

    # Name split 2
    train_names = labeled_names[1500:]
    devtest_names = labeled_names[500:1500]
    test_names = labeled_names[:500]
    train_set = [(gender_features(n), gender) for (n, gender) in train_names]
    devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
    test_set = [(gender_features(n), gender) for (n, gender) in test_names]

    # Train the classifier
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    # Print the results
    print(gender_features2("John"))
    print(classifier.classify(gender_features2('Neo')))
    print(classifier.classify(gender_features2('Trinity')))
    print(nltk.classify.accuracy(classifier, devtest_set))
    # show_most_informative_features prints on its own and returns None
    classifier.show_most_informative_features(5)

    # Print the wrong guesses
    errors = []
    for (name, tag) in devtest_names:
        guess = classifier.classify(gender_features(name))
        if guess != tag:
            errors.append((tag, guess, name))
    for (tag, guess, name) in sorted(errors):
        print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

def email_test(self, email):
    ham = './data/ham/ham.txt'
    spam = './data/spam/spam.txt'
    with open(ham, 'r') as infile:
        ham_sample = infile.read()
    print(ham_sample)
    print('-----------------------')
    with open(spam, 'r') as infile:
        spam_sample = infile.read()
    print(spam_sample)
    cv = CountVectorizer(stop_words="english", max_features=500)
    emails, labels = [], []
    file_path = './data/ham/'
    for filename in glob.glob(os.path.join(file_path, '*.txt')):
        with open(filename, 'r', encoding='ISO-8859-1') as infile:
            emails.append(infile.read())
            labels.append(0)
    file_path = './data/spam/'
    for filename in glob.glob(os.path.join(file_path, '*.txt')):
        with open(filename, 'r', encoding='ISO-8859-1') as infile:
            emails.append(infile.read())
            labels.append(1)
    self.all_names = set(names.words())
    self.lemmatizer = WordNetLemmatizer()
    cleaned_emails = self.clean_text(emails)
    term_docs = cv.fit_transform(cleaned_emails)
    print(term_docs[0])
    feature_mapping = cv.vocabulary_  # the fitted vocabulary (note the underscore)
    feature_names = cv.get_feature_names()
    print(feature_names[:5])
    label_index = self.get_label_index(labels)
    prior = self.get_prior(label_index)
    smoothing = 1
    likelihood = self.get_likelihood(term_docs, label_index, smoothing)
    cleaned_test = self.clean_text(email)
    term_docs_test = cv.transform(cleaned_test)
    posterior = self.get_posterior(term_docs_test, prior, likelihood)
    print(posterior)

def _remove_names(self):
    """Remove names present in NLTK's names corpus."""
    name_set = set(names.words())
    no_names = {key: count for (key, count) in self._tokens.items()
                if key not in name_set}
    # logging
    num_removed = len(self._tokens) - len(no_names)
    _logger.info('{} name tokens removed'.format(num_removed))
    self._tokens = collections.Counter(no_names)

def main():
    cleaned: list = []
    cv = CountVectorizer(stop_words="english", max_features=500)
    groups = fetch_20newsgroups()
    all_names = np.unique(names.words())  # set(names.words())
    lemmatizer = WordNetLemmatizer()
    for post in groups.data:
        cleaned.append(" ".join([
            lemmatizer.lemmatize(word.lower())
            for word in post.split()
            if letters_only(word) and word not in all_names
        ]))
    transformed = cv.fit_transform(cleaned)
    print(cv.get_feature_names())

def generate_blacklist_roles():
    firstnames = SFrame.read_csv(f"{DATA_PATH}/firstnames.csv", verbose=False)["Name"]
    surenames = SFrame.read_csv(f"{DATA_PATH}/surenames.csv", verbose=False)["name"]
    surenames = surenames.apply(lambda n: n.title())
    sf = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz", delimiter="\t",
                         column_type_hints={"characters": list}, na_values=["\\N"])
    sf = sf.filter_by(["actor", "actress"], "category")["tconst", "ordering", "characters", "nconst"]
    sf = sf.join(imdb_data.title[imdb_data.title["titleType"] == "movie"])
    sf = sf.stack("characters", "character")
    sf["character"] = sf["character"].apply(lambda c: c.title())
    sf.export_csv(f"{TEMP_PATH}/roles3.csv")

    whitelist = sf.groupby(key_column_names=['character', "nconst"],
                           operations={'count': agg.COUNT()})
    whitelist = whitelist[whitelist["count"] > 1]['character']
    sf = sf.filter_by(whitelist, "character", True)
    sf = sf.groupby(key_column_names=['character'],
                    operations={'ordering': agg.AVG("ordering"),
                                'count': agg.COUNT()})
    sf["name"] = sf["character"].apply(lambda c: c.split(" ")[-1].strip())
    sf = sf.filter_by(names.words(), "name", exclude=True)
    sf = sf.filter_by(surenames, "name", exclude=True)
    sf = sf.filter_by(firstnames, "name", exclude=True)
    sf = sf.sort("count", False)
    sf = sf[sf['ordering'] > 3]
    w = {x.replace("_", " ").title() for x in wordnet.words()} - set(names.words())
    sf["set"] = sf["character"].apply(lambda x: x.split(" "))
    sf["set"] = sf["set"].apply(lambda x: w & set(x))
    sf = sf[sf['count'] > 11].append(
        sf[(sf['count'] > 1) & (sf['count'] < 10) & (sf["set"] != [])])
    sf[["character"]].export_csv(f"{OUTPUT_PATH}/blacklist_roles.csv")

def lecture():
    groups = fetch_20newsgroups()
    # print(groups.keys())
    # print(groups['target_names'])
    # print('Here is the group target:', groups.target)
    # print(np.unique(groups.target))
    # print(groups.data[0])
    # print(groups.target[0])
    # print(groups.target_names[groups.target[0]])
    cv = CountVectorizer(stop_words="english", max_features=500)
    bag_of_words = cv.fit_transform(groups.data)
    print(bag_of_words)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    print(words_freq)
    for word, count in words_freq:
        print(word + ":", count)
    words = []
    freqs = []
    for word, count in words_freq:
        words.append(word)
        freqs.append(count)

    # Plot frequency
    plt.bar(np.arange(10), freqs[:10], align='center')
    plt.xticks(np.arange(10), words[:10])
    plt.ylabel('Frequency')
    plt.title("Top 10 Words")
    plt.show()

    # Test if token is a word
    def letters_only(astr):
        return astr.isalpha()

    # Remove names from words and perform word lemmatization
    cleaned = []
    all_names = set(x.lower() for x in names.words())
    lemmatizer = WordNetLemmatizer()
    for post in groups.data[:250]:
        cleaned.extend(lemmatizer.lemmatize(word.lower())
                       for word in post.split()
                       if letters_only(word) and word.lower() not in all_names)
    cleaned_bag_of_words = cv.fit_transform(cleaned)
    print(cv.get_feature_names())
    transformed = cv.fit_transform(cleaned)
    nmf = NMF(n_components=100, random_state=43).fit(transformed)
    for topic_idx, topic in enumerate(nmf.components_):
        label = '{}: '.format(topic_idx)
        print(label, " ".join([cv.get_feature_names()[i]
                               for i in topic.argsort()[:-9:-1]]))

def gender_classifier(first_name):
    nltk.download('names')
    # separate male and female names
    male_names = [n for n in names.words('male.txt')]
    female_names = [n for n in names.words('female.txt')]
    # create a list of tuples with the name and the gender
    labeled_names = ([(name.lower(), 'male') for name in male_names] +
                     [(name.lower(), 'female') for name in female_names])
    # randomly shuffle the names
    np.random.seed(44)
    random.shuffle(labeled_names)
    X, y = list(zip(*labeled_names))
    # create the feature sets
    gender_sets = [(gender_identification(f_name), gender)
                   for (f_name, gender) in labeled_names]
    np.random.seed(44)
    # split the list 70/30 into train and test parts
    split = int(len(gender_sets) * .7)
    train_set, test_set = gender_sets[:split], gender_sets[split:]
    # build a classifier
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    # print('Accuracy on unseen data: ', nltk.classify.accuracy(classifier, test_set))
    pred_accuracy = nltk.classify.accuracy(classifier, test_set)
    # try the classifier on a new name
    pred = classifier.classify(gender_identification(first_name))
    # print('\nThe model estimates that ', first_name, 'is a', pred, 'name')
    return pred

def gender_match(tree, pos, pro):
    """
    Takes a proposed antecedent and pronoun and checks whether they match
    in gender. Only checks for mismatches between singular proper name
    antecedents and singular pronouns.
    """
    # Use sets, not generator expressions: a generator would be exhausted by
    # the first membership test and silently fail on later iterations
    male_names = set(name.lower() for name in names.words('male.txt'))
    female_names = set(name.lower() for name in names.words('female.txt'))
    male_pronouns = ["he", "him", "himself"]
    female_pronouns = ["she", "her", "herself"]
    neuter_pronouns = ["it", "itself"]

    for c in tree[pos]:
        if isinstance(c, nltk.Tree) and c.label() in nominal_labels:
            # If the proposed antecedent is a recognized male name,
            # but the pronoun being resolved is either female or
            # neuter, they don't match
            if c.leaves()[0].lower() in male_names:
                if pro in female_pronouns:
                    return False
                elif pro in neuter_pronouns:
                    return False
            # If the proposed antecedent is a recognized female name,
            # but the pronoun being resolved is either male or
            # neuter, they don't match
            elif c.leaves()[0].lower() in female_names:
                if pro in male_pronouns:
                    return False
                elif pro in neuter_pronouns:
                    return False
            # If the proposed antecedent is a numeral, but the
            # pronoun being resolved is not neuter, they don't match
            elif c.leaves()[0].isdigit():
                if pro in male_pronouns:
                    return False
                elif pro in female_pronouns:
                    return False
    return True

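# Hedged usage sketch for gender_match(): nominal_labels is a module-level
# global in the original source; a plausible value is assumed here.
nominal_labels = ['NN', 'NNS', 'NNP', 'NNPS', 'NP']

tree = nltk.Tree.fromstring("(S (NP (NNP John)) (VP (VBD fell)))")
print(gender_match(tree, (0,), 'she'))  # False: 'John' is a recognized male name
print(gender_match(tree, (0,), 'he'))   # True: no gender mismatch detected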