def createName():
	"""Return one randomly chosen first name from the NLTK ``names`` corpus.

	The name is drawn uniformly from the male word list followed by the
	female word list, and any spaces inside it are removed.

	Returns:
		str: the selected name with spaces stripped.
	"""
	# The gender labels were never read and only one entry is needed, so
	# pick directly instead of labelling and shuffling the whole corpus.
	candidates = list(names.words('male.txt')) + list(names.words('female.txt'))
	return str(random.choice(candidates)).replace(' ', '')
Example #2
0
File: svm.py Project: trunghlt/nltk
def demo():
    def gender_features(word):
        return {"last_letter": word[-1], "penultimate_letter": word[-2]}

    from nltk.classify import accuracy
    from nltk.corpus import names

    import random

    names = [(name, "male") for name in names.words("male.txt")] + [
        (name, "female") for name in names.words("female.txt")
    ]
    import random

    random.seed(60221023)
    random.shuffle(names)

    featuresets = [(gender_features(n), g) for (n, g) in names]
    train_set, test_set = featuresets[500:], featuresets[:500]

    print "--- nltk.classify.svm demo ---"
    print "Number of training examples:", len(train_set)
    classifier = SvmClassifier.train(train_set)
    print "Total SVM dimensions:", len(classifier._svmfeatureindex)
    print "Label mapping:", classifier._labelmapping
    print "--- Processing an example instance ---"
    print "Reference instance:", names[0]
    print "NLTK-format features:\n    " + str(test_set[0])
    print "SVMlight-format features:\n    " + str(
        map_instance_to_svm(test_set[0], classifier._labelmapping, classifier._svmfeatureindex)
    )
    distr = classifier.prob_classify(test_set[0][0])
    print "Instance classification and confidence:", distr.max(), distr.prob(distr.max())
    print "--- Measuring classifier performance ---"
    print "Overall accuracy:", accuracy(classifier, test_set)
Example #3
0
  def __init__(self,logger=None,loglevel=logging.INFO):
    """Set up logging and the empty working state of the subtitle object.

    Args:
      logger: logger to use; when None a log named "subtitle" is created
        through ``createLog`` at ``loglevel``.
      loglevel: level handed to ``createLog`` when no logger is supplied.
    """
    if logger is not None:
      self.logger = logger
    else:
      self.logger = createLog(logname="subtitle", level=loglevel)

    banner = "\n-----------------"
    for message in (banner, "Subtitle begin to init", banner):
      self.logger.info(message)

    # Input bookkeeping.
    self.files = []
    self.raw = ""
    self.lexicon_path = None

    # Vocabulary containers, filled in later.
    self.lexicon = set()
    self.stem_lexicon = set()
    self.newWords = None
    self.wordSet = None
    self.stem_newWords = None
    self.checkup = False

    # Tokens to ignore and the punctuation character class.
    self.noUsed = set(["-","","'","“","—","”"])
    self.punctuation = r".?\[\]!,\":%;()|^=+\/\\_`\*;.:><"

    # Every first name from the male and female corpus files.
    self.nameSet = set(names.words('male.txt')+names.words('female.txt'))
    def semanticClassify(self, s):
        """
        Attach a semantic label to every token of a POS-tagged segment.

        Tokens tagged 'NP' are labelled 'male_name', 'female_name' or
        'month' when they appear in the matching lookup. Tokens tagged
        'VERB' or 'NOUN' get the name of their first WordNet synset.
        Everything else is labelled ' '. The segment must already be
        POS-tagged.

        Input: [('i', 'PRON'), ('love', 'VERB'), ('you', 'PRON')]
        Output: [('i', 'PRON', ' '), ('love', 'VERB', 'love.n.01'), ('you', 'PRON', ' ')]
        """
        if not s:
            return self.encodeutf8([])

        # Build the lookups once per call instead of once per token; sets
        # also make each membership test constant-time.
        male_names = set(w.lower() for w in names.words('male.txt'))
        female_names = set(w.lower() for w in names.words('female.txt'))
        months = set(['january', 'february', 'march', 'april', 'may', 'june',
                      'july', 'august', 'september', 'october', 'november',
                      'december'])

        classified_seg = []
        for seg in s:
            token, tag = seg[0], seg[1]
            label = ' '
            if tag == 'NP':
                # Precedence matters: male names win over female names,
                # which win over months.
                if token in male_names:
                    label = 'male_name'
                elif token in female_names:
                    label = 'female_name'
                elif token in months:
                    label = 'month'
            elif tag == 'VERB' or tag == 'NOUN':
                synsets = wn.synsets(token)
                if len(synsets) > 0:
                    label = synsets[0].name()
            classified_seg.append((token, tag, label))
        return self.encodeutf8(classified_seg)
Example #5
0
def demo():

    def gender_features(word):
        return {'last_letter': word[-1], 'penultimate_letter': word[-2]}

    from nltk.classify import accuracy
    from nltk.corpus import names
    
    
    import random
    names = ([(name, 'male') for name in names.words('male.txt')] +
             [(name, 'female') for name in names.words('female.txt')])
    import random
    random.seed(60221023)
    random.shuffle(names)

    featuresets = [(gender_features(n), g) for (n,g) in names]
    train_set, test_set = featuresets[500:], featuresets[:500]

    print '--- nltk.classify.svm demo ---'
    print 'Number of training examples:', len(train_set)
    classifier = SvmClassifier.train(train_set)
    print 'Total SVM dimensions:', len(classifier._svmfeatureindex)
    print 'Label mapping:', classifier._labelmapping
    print '--- Processing an example instance ---'
    print 'Reference instance:', names[0]
    print 'NLTK-format features:\n    ' + str(test_set[0])
    print 'SVMlight-format features:\n    ' + str(map_instance_to_svm(test_set[0], classifier._labelmapping, classifier._svmfeatureindex))
    distr = classifier.prob_classify(test_set[0][0])
    print 'Instance classification and confidence:', distr.max(), distr.prob(distr.max())
    print '--- Measuring classifier performance ---'
    print 'Overall accuracy:', accuracy(classifier, test_set)
def main():
    from nltk.corpus import names
    names = ([(name, 'male') for name in names.words('male.txt')] +
        [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(names)
    train_names = names[1500:]
    devtest_names = names[500:1500]
    test_names = names[:500]

    train_set = [(gender_features(n), g) for (n,g) in train_names]
    devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]

    classifier = nltk.NaiveBayesClassifier.train(train_set)

    print classifier.classify(gender_features('Neo'))
    print classifier.classify(gender_features('Trinity'))
    print 'attila:', classifier.classify(gender_features('Attila'))
    print classifier.classify(gender_features('Bori'))
    print classifier.classify(gender_features('Gabi'))
    print 'andy:', classifier.classify(gender_features('Andy'))
    print 'dom:', classifier.classify(gender_features('Dom'))
    print 'monica:', classifier.classify(gender_features('Monica'))
    print 'donnie:', classifier.classify(gender_features('Donald'))

    print "accuracy:", nltk.classify.accuracy(classifier, devtest_set)
    print classifier.show_most_informative_features(5)

    errors = []
    for (name, tag) in devtest_names:
        guess = classifier.classify(gender_features(name))
        if guess != tag:
            errors.append((tag, guess, name))
    for (tag, guess, name) in sorted(errors): # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
        print 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)
Example #7
0
    def __calculateAgreement(self):
        """Set gender, number and person agreement attributes from ``self.np``.

        * A one-child phrase whose first leaf is in the male or female name
          list gets ``self.gender`` set accordingly.
        * ``self.number`` is {'plural'} when the phrase contains an NNS/NNPS
          tag or a ',' / 'and' leaf, otherwise {'singular'}.
        * When the first child's label contains 'PRP', the pronoun itself
          overrides number and, where applicable, sets gender and person.
        """
        if len(self.np) == 1:
            if self.np[0,0] in names.words('male.txt'): self.gender = 'male'
            elif self.np[0,0] in names.words('female.txt'): self.gender = 'female'

        if {'NNS', 'NNPS'}.intersection({b for (a, b) in self.np.pos()}) or {',','and'}.intersection(self.np.leaves()):
            self.number = {'plural'}
        else:
            self.number = {'singular'}
        if 'PRP' in self.np[0].label():
            # Compare case-insensitively: every entry below is lower case.
            pronoun = self.np[0,0].lower()
            if pronoun in {'they', 'them', 'themselves', 'their'}: self.number = {'plural'}
            elif pronoun in {'him', 'he', 'himself'}:
                self.gender = 'male'
                self.number = {'singular'}
            elif pronoun in {'her', 'herself' , 'she'}:
                self.number = {'singular'}
                self.gender = 'female'
            elif pronoun in {'it', 'itself'}: self.number = {'singular'}
            elif pronoun in {'us', 'we', 'our', 'ourselves'}:
                self.number = {'plural'}
                self.person = 'first'
            # 'i' must be lower case here; the former 'I' entry could never
            # match a lower-cased token.
            elif pronoun in {'i', 'me', 'my', 'myself'}:
                self.number = {'singular'}
                self.person = 'first'
            elif pronoun in {'yourself'}:
                self.number = {'singular'}
                self.person = 'second'
            elif pronoun in {'you', 'your'}:
                # "you" is ambiguous between singular and plural.
                self.number = {'singular', 'plural'}
                self.person = 'second'
            elif pronoun in {'yourselves'}:
                self.number = {'plural'}
                self.person = 'second'
Example #8
0
def new_naive_bayes_classifier():
    """Return the shared gender classifier, training it on first use.

    The training data combines four labelled groups: male names and
    pronouns, female names and pronouns, neutral pronouns, and an 'excess'
    group of abbreviations, prepositions, punctuation and two filler words.
    The trained classifier is stored in the module-level
    ``CLASSIFIER_CACHE`` and reused by later calls.
    """
    global CLASSIFIER_CACHE
    if not CLASSIFIER_CACHE:
        labelled_words = chain(
            _new_training_set(
                'male', names.words('male.txt'), MALE_PRONOUN_SEQ),
            _new_training_set(
                'female', names.words('female.txt'), FEMALE_PRONOUN_SEQ),
            _new_training_set(
                'neutral', NEUTRAL_PRONOUN_SEQ),
            _new_training_set(
                'excess', ABBREVIATION_SEQ, PREPOSITION_SEQ,
                string.punctuation, ('looking', 'is'),
            ),
        )
        # Features are produced lazily while the trainer consumes them.
        CLASSIFIER_CACHE = nltk.NaiveBayesClassifier.train(
            (_gender_features(word), gender)
            for word, gender in labelled_words
        )
    return CLASSIFIER_CACHE
Example #9
0
def initGenderClassifier():
    """Train and return a Naive Bayes gender classifier on the full names corpus."""
    from nltk.corpus import names as name_corpus

    # Male entries first, then female, each labelled by its source file.
    labelled = []
    for gender in ('male', 'female'):
        labelled.extend((name, gender)
                        for name in name_corpus.words(gender + '.txt'))

    featuresets = []
    for name, gender in labelled:
        featuresets.append((gender_features(name), gender))
    return nltk.NaiveBayesClassifier.train(featuresets)
Example #10
0
    def gender(word):
        """Guess the gender of a word with a freshly trained Naive Bayes model.

        Args:
            word (str):  Word. (usually a name)

        Returns:
            str:  'male' or 'female'

        .. note::

            A new classifier is trained on every call, after a random
            shuffle that leaves the first 500 names out of training, so
            the result for a given word may differ between calls.

        """
        male_entries = [(name, 'male') for name in names.words('male.txt')]
        female_entries = [(name, 'female')
                          for name in names.words('female.txt')]
        labeled_names = male_entries + female_entries
        shuffle(labeled_names)

        featuresets = []
        for name, label in labeled_names:
            featuresets.append((Classifier.gender_features(name), label))

        # The first 500 shuffled entries are not used for training.
        classifier = nltk.NaiveBayesClassifier.train(featuresets[500:])
        return classifier.classify(Classifier.gender_features(word))
Example #11
0
def make_classifier():
    """Return a Naive Bayes classifier trained on every name in the corpus.

    Each name is labelled 'male' or 'female' after the corpus file it comes
    from and converted to features with ``name_features``.
    """
    from nltk.corpus import names

    labelled = []
    labelled.extend((name, 'male') for name in names.words('male.txt'))
    labelled.extend((name, 'female') for name in names.words('female.txt'))

    feature_sets = []
    for name, gender in labelled:
        feature_sets.append((name_features(name), gender))
    return nltk.NaiveBayesClassifier.train(feature_sets)
Example #12
0
def get_variations(s):
    """Return the parts of ``s`` that appear in the NLTK names corpus.

    Args:
        s: a string; it is split on whitespace into tokens.

    Returns:
        list: for each token in order, the token itself if it is a known
        name, followed by the whole string ``s`` if ``s`` is a known name.
    """
    # Load the corpus once; it used to be reloaded twice per token.
    known_names = set(names.words())
    whole_is_name = s in known_names

    variations = []
    for token in s.split():
        if token in known_names:
            variations.append(token)
        # NOTE(review): the full string is appended once per token, so a
        # single-word name appears twice. Kept as-is; confirm whether the
        # duplicate is intended.
        if whole_is_name:
            variations.append(s)
    return variations
Example #13
0
	def create_featuresets(self):
		'''
		Build shuffled (features, gender) pairs from the names corpora.

		Every name from the male and female files is labelled, the combined
		list is shuffled in place, and each name is converted with
		self.gender_features.
		'''
		labelled = [(name, 'male') for name in names.words('male.txt')]
		labelled.extend((name, 'female') for name in names.words('female.txt'))
		random.shuffle(labelled)

		featuresets = []
		for name, gender in labelled:
			featuresets.append((self.gender_features(name), gender))
		return featuresets
Example #14
0
def feature_nameList(word):
    """Return 1 if ``word`` is a known first name or place name, else 0.

    Args:
        word: the token to look up.

    Returns:
        int: 1 when the word is in the male or female names list, or when
        GeoText finds a city or country in it; otherwise 0.
    """
    if word in names.words('male.txt') or word in names.words('female.txt'):
        return 1
    # Test what GeoText actually extracted: the GeoText object itself is
    # truthy whether or not anything matched, which made every word score 1.
    places = GeoText(word)
    if places.cities or places.countries:
        return 1
    return 0
Example #15
0
def identify_gender3():
    """Evaluate ``gender_features3`` with a Naive Bayes classifier.

    Labels and shuffles the names corpus, converts it to feature sets and
    hands them to ``classify`` together with the classifier class and the
    number 500.
    """
    import random
    from nltk.corpus import names

    labelled = [(name, 'male') for name in names.words('male.txt')]
    labelled += [(name, 'female') for name in names.words('female.txt')]
    random.shuffle(labelled)

    featuresets = []
    for name, gender in labelled:
        featuresets.append((gender_features3(name), gender))
    return classify(nltk.NaiveBayesClassifier, featuresets, 500)
Example #16
0
def nltkTest():
    """Train a Naive Bayes gender classifier and print its guess for 'Neo'.

    The shuffled corpus minus its first 500 entries is used for training.
    """
    import random

    labeled_names = [(name, 'male') for name in names.words('male.txt')]
    labeled_names.extend(
        (name, 'female') for name in names.words('female.txt'))
    random.shuffle(labeled_names)

    featuresets = []
    for name, gender in labeled_names:
        featuresets.append((gender_features(name), gender))

    # The first 500 feature sets are held out and not used here.
    classifier = nltk.NaiveBayesClassifier.train(featuresets[500:])
    print(classifier.classify(gender_features('Neo')))
 def __init__(self):
     """Train the Naive Bayes gender classifier on the whole names corpus.

     The trained model is stored in ``self.classifier``.
     """
     super(Gender, self).__init__()
     labeled_names = [(name, 'male') for name in names.words('male.txt')]
     labeled_names.extend(
         (name, 'female') for name in names.words('female.txt'))
     random.shuffle(labeled_names)

     # Training data for the Bayes classifier: every labelled name is used.
     train_set = []
     for name, gender in labeled_names:
         train_set.append((gender_features(name), gender))
     self.classifier = nltk.NaiveBayesClassifier.train(train_set)
Example #18
0
def main():
	name = ([(n, 'male') for n in names.words('male.txt')] + [(n, 'female') for n in names.words('female.txt')])
	randomNames = random.shuffle(name)

	featuresets = [(genderFeature(n.lower()), g) for (n, g) in name]
	train_set, test_set = featuresets[500:], featuresets[:500]
	classifier = nltk.NaiveBayesClassifier.train(train_set)

	string = raw_input("Enter a name: ")
	print classifier.classify(genderFeature(string))
	print nltk.classify.accuracy(classifier, test_set)
    def classify(self, name):
        """Return 'M' or 'F' for an exact corpus match, else the model's guess.

        The male list is consulted before the female list, so a name that
        appears in both is reported as 'M'.
        """
        feats = self._nameFeatures(name)
        for label, fileid in (('M', 'male.txt'), ('F', 'female.txt')):
            if any(name == known for known in names.words(fileid)):
                return label

        # Not in either list: fall back to the trained classifier.
        return self.classifier.classify(feats)
Example #20
0
def feature_nameList(word):
    """Return 1 if the token in ``word[0]`` is a known name or place, else 0.

    Args:
        word: a sequence whose first element is the token text.

    Returns:
        int: 1 when the decoded token equals a decoded entry of the male or
        female names list, or when GeoText finds at least one city or
        country in the token; otherwise 0.
    """
    token = word[0]
    # Decode the token once instead of once per corpus entry.
    decoded = token.decode('unicode-escape')
    for fileid in ('male.txt', 'female.txt'):
        for known in names.words(fileid):
            if decoded == known.decode('unicode-escape'):
                return 1

    # One GeoText pass serves both the city and the country check.
    places = GeoText(token)
    if len(places.cities) > 0 or len(places.countries) > 0:
        return 1
    return 0
Example #21
0
def partial_names_demo(trainer, features=names_demo_features):
    from nltk.corpus import names
    import random

    male_names = names.words('male.txt')
    female_names = names.words('female.txt')

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples for training
    positive = map(features, male_names[:2000])

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = map(features, male_names[2000:2500] + female_names[:500])

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] \
        + [(name, False) for name in female_names[500:750]]

    random.shuffle(test)

    # Train up a classifier.
    print 'Training classifier...'
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print 'Testing classifier...'
    acc = accuracy(classifier, [(features(n),m) for (n,m) in test])
    print 'Accuracy: %6.4f' % acc

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n,m) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print 'Avg. log likelihood: %6.4f' % (sum(ll)/len(test))
        print
        print 'Unseen Names      P(Male)  P(Female)\n'+'-'*40
        for ((name, is_male), pdist) in zip(test, pdists)[:5]:
            if is_male == True:
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print fmt % (name, pdist.prob(True), pdist.prob(False))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
def __determine_gender__(article, sentence, token, start_index, end_index, entity_type):
    """Return "male", "female" or "unknown" for a mention token.

    Only "PER" entities are examined. Pronouns listed in PRONOUN_LIST are
    resolved directly; other tokens are resolved from a "Mr." / "Mrs."
    prefix or by looking up the part before the first underscore in the
    names corpus.
    """
    if entity_type != "PER":
        return "unknown"

    if token in PRONOUN_LIST:
        if token in ("he", "his"):
            return "male"
        if token in ("she", "her"):
            return "female"
        # Any other listed pronoun carries no gender information here.
        return "unknown"

    if token.startswith("Mr.") or token.split("_")[0] in names.words("male.txt"):
        return "male"
    if token.startswith("Mrs.") or token.split("_")[0] in names.words("female.txt"):
        return "female"
    return "unknown"
    def getFeatures(self):
        """Return (features, label) pairs for every name in the corpus.

        Male names come first with label 'M', followed by female names with
        label 'F'; features are produced by ``self._nameFeatures``.
        """
        male_words = names.words('male.txt')
        female_words = names.words('female.txt')

        featureset = [(self._nameFeatures(name), 'M') for name in male_words]
        featureset.extend(
            (self._nameFeatures(name), 'F') for name in female_words)
        return featureset
    def __init__(self, corpus, outfile, tokens_dir, parses_dir, depparses_dir,
                 train=False):
        """Load the pre-processed inputs and register the active feature functions.

        Stores the constructor arguments, runs the ``process_*`` helpers on
        the three input directories, builds the cluster dictionary and
        prepares the pronoun, location and name lookups.

        Args:
            corpus: stored as ``self.corpus``.
            outfile: stored as ``self.outfile``.
            tokens_dir: passed to ``self.process_tokens_dir``.
            parses_dir: passed to ``self.process_parses_dir``.
            depparses_dir: passed to ``self.process_dparses_dir``.
            train: stored as ``self.train``; defaults to False.
        """
        self.relations = list()
        self.train = train
        self.corpus = corpus
        self.outfile = outfile
        # process_tokens_dir returns a pair, unpacked into two attributes.
        self.tokenized_sents, self.tok_sents_pos = self.process_tokens_dir(tokens_dir)
        self.parses = self.process_parses_dir(parses_dir)
        self.depparses = self.process_dparses_dir(depparses_dir)
        # NOTE(review): '50mpaths2' is presumably a word-cluster file name - confirm.
        self.clusterdict = self.make_cluster_dict('50mpaths2')
        # Pronoun list; kept as a list, so the repeated entries
        # ("his", "you", "your", "yours") are preserved.
        self.pronouns = ["I", "me", "my", "mine", "myself", "you", "your", "yours", "yourself",
                        "he", "him", "his", "his", "himself", "she", "her", "hers", "herself", 
                        "it", "its", "itself", "we", "us", "our", "ours", "ourselves", "you", "your", 
                        "yours", "yourselves", "they", "them", "their", "theirs", "themselves"]

        # Lower-cased country and US-state names from the gazetteers corpus.
        self.locations = set([c.lower() for c in gazetteers.words('countries.txt')] + 
                             [s.lower() for s in gazetteers.words('usstates.txt')])
        # Lower-cased first names from both names-corpus files.
        self.names = set([name.lower() for name in names.words('male.txt')] +
                 [name.lower() for name in names.words('female.txt')])

        # Active feature functions; commented-out entries are disabled, and
        # the trailing notes are the author's own quality annotations.
        self.feat_fns = [self.words,    #good
                         self.word_types, #good
                         self.pronoun, #good
                         self.name, #good
                         #self.place, #look to get a better list
                         self.num_words_between, #good
                         self.words_between_words, #good
                         self.prev_word, #good
                         #self.post_word, #really bad feature
                         #self.prev_word_pos, #bad
                         self.post_word_pos, #good
                         self.first_word_after_w1, #good
                         self.words_between_POSs, #good 
                         #self.last_word_before_w2
                         self.w1clust, #good
                         self.w2clust, #good
                         self.tree_path,
                         #self.w1pref, #bad
                         #self.w1suf,
                         #self.w2pref,
                         #self.w2suf,
                         #self.w1bow,
                         #self.w2bow
                         self.et1dw1,
                         self.et2dw2,
                         self.h1dw1,
                         self.h2dw2
                         ]
Example #25
0
 def add_sample(self, sample):
     """Replace the current sample and sort its tokens into known and misspelled.

     Newlines become spaces, trailing spaces are stripped and all
     punctuation except the apostrophe is removed before tokenising.
     Dictionary words and tokens containing an apostrophe are collected in
     ``self.tokenized_sample``; recognised first names are skipped; tokens
     whose lemma and stem are both absent from the dictionary go to
     ``self.misspelled_words``. ``self.tagged_sample`` receives the POS
     tags of all tokens.

     Args:
         sample (str): the text to analyse.

     Raises:
         TypeError: if ``sample`` is not a ``str``.
     """
     if not isinstance(sample, str):
         raise TypeError
     # Calling add_sample should replace the existing sample, so reset the
     # containers instead of appending to the previous ones.
     self.sample = sample
     self.misspelled_words = []
     self.tokenized_sample = []
     self.tagged_sample = {}

     sample = sample.replace('\n', " ")
     sample = sample.rstrip(" ")
     for char in punctuation.replace("'", ""):
         sample = sample.replace(char, "")
     tokens = word_tokenize(sample)

     # Load each corpus once as a set; they used to be reloaded and scanned
     # linearly for every token.
     vocabulary = set(words.words())
     known_names = set(names.words())

     for word in tokens:
         lowered = word.lower()
         if lowered in vocabulary:
             self.tokenized_sample.append(word)
         elif word.capitalize() in known_names:
             continue
         elif "'" in word:
             self.tokenized_sample.append(word)
         elif LEMMATIZER.lemmatize(lowered) not in vocabulary:
             # NOTE(review): a token whose stem IS in the dictionary is
             # recorded in neither list; kept as-is, confirm the intent.
             if STEMMER.stem(lowered) not in vocabulary:
                 self.misspelled_words.append(word)
         else:
             self.tokenized_sample.append(word)
     self.tagged_sample = pos_tag(tokens)
    def posTagging(self, s):
        """
        POS-tag one segment.

        Taggers are chained by backoff, from highest to lowest priority: a
        regexp rule for 'i', Brown-news trigram, bigram and unigram taggers,
        a lookup that tags names and month names as 'NP', and finally a
        default tagger that assigns 'NN'.

        input: ['i','love','you']
        output: [('i', 'PRON'), ('love', 'VERB'), ('you', 'PRON')]
        """
        month = [u'january', u'february', u'march', u'april', u'may', u'june',
                 u'july', u'august', u'september', u'october', u'november', u'december']

        # Lowest layers: names/months as 'NP', with 'NN' for everything else.
        np_tags = dict.fromkeys([w.lower() for w in names.words()] + month, 'NP')
        tagger = nltk.UnigramTagger(
            model=np_tags, backoff=nltk.DefaultTagger('NN'))

        # Stack the Brown n-gram taggers, each backing off to the previous one.
        brown_tagged_sents = brown.tagged_sents(
            tagset='universal', categories='news')
        for tagger_class in (nltk.UnigramTagger, nltk.BigramTagger,
                             nltk.TrigramTagger):
            tagger = tagger_class(brown_tagged_sents, backoff=tagger)

        # Top layer: the pattern for 'i' is always tagged as a pronoun.
        tagger = nltk.RegexpTagger([(r'\bi\b', 'PRON')], backoff=tagger)

        return self.encodeutf8(tagger.tag(s))
def get_hosts(year):
    '''Hosts is a list of one or more strings. Do NOT change the name
    of this function or what it returns.

    Loads ``gg<year>.json``, takes the 100 most common candidate names
    from the first counter returned by get_pn_vec_from_range_for_hosts,
    and returns at most two of them: candidates that contain no ignored
    word and whose first or second word is a known first name.
    '''
    file_name = 'gg%s.json' % year
    with open(file_name, 'r') as data:
        db = json.load(data)
    # Analyse the data just loaded for `year`; the hard-coded db2013 global
    # ignored both the argument and the file that was read.
    events = get_pn_vec_from_range_for_hosts(db)

    # Lower-cased names, built once instead of twice per candidate.
    known_names = set(name.lower() for name in names.words())

    hosts = []
    for item in events[0].most_common(100):
        if len(hosts) > 1:
            break
        candidate = item[0]
        if any(word in IGNORE_WORDS for word in candidate):
            continue
        if candidate[0] in known_names or candidate[1] in known_names:
            hosts.append(' '.join(candidate))
    return hosts
Example #28
0
def pre_filter(iter):
	"""Yield each line with every known first name replaced by 'he'.

	``iter`` yields lines that are themselves sequences of strings; each
	string is split on whitespace, filtered and re-joined with single
	spaces. Progress is reported through ``out_error`` every 100000 lines
	by the process named "PoolWorker-1".
	"""
	nameswords = set(word.lower() for word in names.words())

	def replace(s):
		tokens = []
		for x in s.split():
			tokens.append('he' if x in nameswords else x)
		return ' '.join(tokens)

	for i, line in enumerate(iter):
		if i%100000==0:
			# Only one worker reports; the count is scaled by the CPU count.
			if str(mp.current_process().name.strip()) == "PoolWorker-1":
				out_error("Processed " + str(i*mp.cpu_count()) + " lines.", False)
		yield [replace(c) for c in line]
Example #29
0
def naive_bayes_gender_classifier():
  from nltk.corpus import names
  names = ([(name, "male") for name in names.words("male.txt")] +
           [(name, "female") for name in names.words("female.txt")])
  random.shuffle(names)
#  featuresets = [(_gender_features(n), g) for (n,g) in names]
#  train_set, test_set = featuresets[500:], featuresets[:500]
  # advisable to stream the sets in for large data set.
  train_set = apply_features(_gender_features, names[500:])
  test_set = apply_features(_gender_features, names[:500])
  classifier = nltk.NaiveBayesClassifier.train(train_set)
  print "Neo is ", classifier.classify(_gender_features("Neo"))
  print "Trinity is", classifier.classify(_gender_features("Trinity"))
  # calculate the accuracy of the classifier
  print nltk.classify.accuracy(classifier, test_set)
  classifier.show_most_informative_features(5)
 def initClassifier(self):
     """Train ``self.classifier`` on the names corpus, printing progress."""
     # Put the embedded nltk_data path first so the bundled corpus is loaded.
     nltk.data.path.insert(0, nltk_data_path)

     print("Loading gender data...")
     labeled_names = [(name, 'male') for name in names.words('male.txt')]
     labeled_names.extend(
         (name, 'female') for name in names.words('female.txt'))
     random.shuffle(labeled_names)
     print("Loaded " + str(len(labeled_names)) + " samples")

     print("Training gender classifier...")
     featuresets = []
     for name, gender in labeled_names:
         featuresets.append((gender_features(name), gender))
     self.classifier = nltk.classify.NaiveBayesClassifier.train(featuresets)
     print("Gender classifier trained")
Example #31
0
    def initClassifier(self):
        """Load the names corpus and train the gender classifier.

        Progress messages are printed and the trained Naive Bayes model is
        stored in ``self.classifier``.
        """
        # Put the embedded nltk_data path first so the bundled corpus is used.
        nltk.data.path.insert(0, nltk_data_path)

        # Load and label the gender data.
        print("Loading gender data...")
        labeled_names = []
        for gender in ('male', 'female'):
            labeled_names.extend(
                (name, gender) for name in names.words(gender + '.txt'))
        random.shuffle(labeled_names)
        print("Loaded " + str(len(labeled_names)) + " samples")

        # Train on every labelled name.
        print("Training gender classifier...")
        featuresets = [(gender_features(name), label)
                       for name, label in labeled_names]
        self.classifier = nltk.classify.NaiveBayesClassifier.train(featuresets)
        print("Gender classifier trained")
Example #32
0
def clean_test(docs):
    """Return a cleaned copy of every document in ``docs``.

    A word is kept only if ``letters_only`` accepts it and its lower-cased
    form is not a first name from the names corpus; kept words are
    lower-cased, lemmatised and joined with single spaces.

    Args:
        docs: iterable of document strings.

    Returns:
        list: one cleaned string per document (empty list for no documents).
    """
    cleaned = []
    all_names = set(x.lower()
                    for x in names.words())  # every name in the corpus
    lemmatizer = WordNetLemmatizer()
    for doc in docs:
        cleaned.append(' '.join(
            lemmatizer.lemmatize(word.lower()) for word in doc.split()
            if letters_only(word) and word.lower() not in all_names))
    # Return after the loop: returning inside it stopped after the first
    # document and yielded None when docs was empty.
    return cleaned
Example #33
0
def get_gender_predictions(name_input):
    """Predict the gender of ``name_input`` with a Naive Bayes classifier.

    A classifier is trained on the shuffled names corpus, leaving the first
    1% of entries out of training.

    Args:
        name_input: the name to classify.

    Returns:
        str: 'male' or 'female', or "NA" if training or classification
        raised an exception.
    """
    try:
        train_data = ([(name, 'male') for name in names.words('male.txt')]
                      + [(name, 'female') for name in names.words('female.txt')])
        # shuffle() works in place and returns None; nothing to keep.
        random.shuffle(train_data)
        featuresets = [(gender_features(n), gender)
                       for (n, gender) in train_data]
        # Train on the 99% that follows the held-out first 1%. The old slice
        # started at 99% and therefore trained on only the last 1%.
        holdout = int(len(train_data) * .01)
        train_set = featuresets[holdout:]
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        prediction = classifier.classify(gender_features(name_input))
    except Exception:
        # Best-effort API: any failure is reported as "NA", but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        prediction = "NA"
    return prediction
Example #34
0
    def tag_speakers_list(self, speakers):
        """Append ("<text>", "speaker") tags to ``self.tags``.

        Every literal occurrence of a known speaker in ``self.abstract`` is
        tagged. If none occurs, adjacent token pairs from ``self.pos`` are
        tagged instead when both tokens look like names: tagged 'NP', or
        tagged "unseen" and present in the names corpus.

        Args:
            speakers: iterable of speaker strings to search for verbatim.
        """
        matched_speakers = 0
        for value in speakers:
            for result in re.findall(re.escape(value), self.abstract):
                self.tags.append((result, "speaker"))
                matched_speakers += 1

        if matched_speakers != 0:
            return

        # TODO: improve this fallback heuristic (e.g. with a regex).
        # Load the corpus once; it used to be reloaded for each token pair.
        known_names = set(names.words())

        def is_name_like(token):
            word, tag = token[0], token[1]
            return tag == u'NP' or (tag == "unseen" and word in known_names)

        tagged = self.pos
        for index in range(len(tagged) - 1):
            current, following = tagged[index], tagged[index + 1]
            if is_name_like(current) and is_name_like(following):
                self.tags.append(
                    ((current[0] + " " + following[0]), "speaker"))
Example #35
0
 def __init__(self):
     """Load the labelled names and create one instance of each classifier.

     ``self.data`` holds (name, "M"/"F") pairs, ``self.clfList`` the eight
     classifier instances (also available as ``self.clf1`` to
     ``self.clf8``) and ``self.FeatureFuncList`` the six feature functions.
     """
     male = [(w.strip(), "M") for w in names.words("male.txt")]
     female = [(w.strip(), "F") for w in names.words("female.txt")]
     self.data = male + female

     # Five Naive Bayes variants, two tree models and one SVM.
     self.clfList = [
         BernoulliNB(), CategoricalNB(), ComplementNB(), GaussianNB(),
         MultinomialNB(), DecisionTreeClassifier(), ExtraTreeClassifier(),
         SVC(),
     ]
     (self.clf1, self.clf2, self.clf3, self.clf4,
      self.clf5, self.clf6, self.clf7, self.clf8) = self.clfList

     self.FeatureFuncList = [
         self.F01, self.F02, self.F03, self.F04, self.F05, self.F06
     ]
Example #36
0
def get_people(tweets):
    """Count two-word PERSON mentions in tweets, split by first-name gender.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        tuple: ``(men, women)`` dicts mapping a full name to the number of
        times it was seen. A name whose first word is in both corpus files
        is counted in both dicts.
    """
    # Load each corpus file once instead of once per detected person.
    male_names = set(names.words('male.txt'))
    female_names = set(names.words('female.txt'))

    men = dict()
    women = dict()
    for tweet in tweets:
        tagged_words = nltk.pos_tag(nltk.word_tokenize(tweet))
        for chunk in nltk.ne_chunk(tagged_words):
            if not isinstance(chunk, nltk.tree.Tree):
                continue
            # Adele needs a last name, but other than her it is fine to
            # look only for first-and-last (two token) PERSON chunks.
            if chunk.label() == 'PERSON' and len(chunk) == 2:
                name = ' '.join(c[0] for c in chunk)
                first = name.split(' ', 1)[0]
                if first in male_names:
                    men[name] = men.get(name, 0) + 1
                if first in female_names:
                    women[name] = women.get(name, 0) + 1

    return men, women
Example #37
0
 def cleaned_text(email):
     """Return one cleaned string per message in ``email``.

     Words rejected by ``letters_only`` and words whose lower-cased form is
     a first name are dropped; the rest are lower-cased and lemmatised.
     """
     all_names = set(x.lower() for x in names.words())
     lemmatizer = WordNetLemmatizer()

     def clean(message):
         kept = []
         for word in message.split():
             if letters_only(word) and word.lower() not in all_names:
                 kept.append(lemmatizer.lemmatize(word.lower()))
         return ' '.join(kept)

     return [clean(message) for message in email]
    def gender_identify(self, word, isPrint):
        """Classify ``word`` as 'male' or 'female' with a freshly trained model.

        Args:
            word: the name to classify.
            isPrint: when truthy, print the accuracy measured on the 500
                held-out names.

        Returns:
            The label predicted by the Naive Bayes classifier.
        """
        labeled_names = [(name, 'male') for name in names.words('male.txt')]
        labeled_names.extend(
            (name, 'female') for name in names.words('female.txt'))
        random.shuffle(labeled_names)

        # First 500 shuffled names are held out; the rest train the model.
        held_out, training = labeled_names[:500], labeled_names[500:]
        train_set = apply_features(self.gender_features, training)
        test_set = apply_features(self.gender_features, held_out)

        classifier = nltk.NaiveBayesClassifier.train(train_set)

        if isPrint:
            score = nltk.classify.accuracy(classifier, test_set)
            print("gender recognise accuracy is " + str(score))

        return classifier.classify(self.gender_features(word))
Example #39
0
 def clean_text(self, docs):
     """Return the documents with names and non-letter words removed.

     A word is kept when ``self.letters_only`` accepts it and it does not
     appear (case-sensitively) in the names corpus; kept words are
     lower-cased and lemmatised.
     """
     all_names = set(names.words())
     lemmatizer = WordNetLemmatizer()

     def clean(doc):
         kept = []
         for word in doc.split():
             if self.letters_only(word) and word not in all_names:
                 kept.append(lemmatizer.lemmatize(word.lower()))
         return ' '.join(kept)

     return [clean(doc) for doc in docs]
def gender_Classfier():
    """Train a Naive Bayes gender classifier on the whole NLTK names corpus.

    Every name is used for training (there is no held-out split).

    :returns: the trained ``nltk.NaiveBayesClassifier``.
    """
    labeled_names = [(name, 'male') for name in names.words('male.txt')]
    labeled_names += [(name, 'female') for name in names.words('female.txt')]
    random.shuffle(labeled_names)

    train_set = [(gender_features(name), label)
                 for name, label in labeled_names]
    return nltk.NaiveBayesClassifier.train(train_set)
Example #41
0
 def __init__(self):
     """Set default paths, zeroed parameters and the gazetteer/name lookups."""
     # Data directories, relative to the current working directory.
     self.train_path, self.dev_path = "../data/train", "../data/dev"
     # Both parameters start at zero.
     self.beta = self.max_iter = 0
     self.dict_classifiers = {}
     # Sets give constant-time membership tests on the NLTK word lists.
     self.locations = set(gazetteers.words())
     self.names = set(names.words())
     # Not populated at construction time.
     self.pos = self.previous_labels = None
Example #42
0
def glossary_filter(g):
    """Decide whether glossary entry ``g`` should be kept.

    ``g`` is rejected (``False``) when any of the following holds:

    * its ``entry_list`` is empty;
    * its ``freq`` is 0 and its first lemma is shorter than four characters,
      or is a noun-only entry whose first lemma is in the NLTK names corpus;
    * one of ``pos_ignore_list`` appears in ``g.pos_list``;
    * any element of ``g.tokens_pos`` contains ``'NNP'``.

    :returns: ``True`` when none of the rejection rules fire.
    """
    if g.entry_list == []:
        return False

    if g.freq == 0:
        head = g.lemmas[0]
        if len(head) < 4:
            return False
        if g.pos_list == ['noun'] and head in names.words():
            return False

    if any(pos in g.pos_list for pos in pos_ignore_list):
        return False

    return not any('NNP' in pos for pos in g.tokens_pos)
def clean_text(docs):
    """Lower-case and lemmatize the alphabetic, non-name tokens of each doc.

    Tokens are obtained by whitespace splitting; a token is kept when
    ``letter_only`` accepts it and it is not (case-sensitively) in the NLTK
    names corpus.

    :param docs: iterable of document strings.
    :returns: list of cleaned documents, tokens joined by a single space.
    """
    all_names = set(names.words())
    lemm = WordNetLemmatizer()

    def _keep(token):
        return letter_only(token) and token not in all_names

    return [
        " ".join(lemm.lemmatize(token.lower())
                 for token in filter(_keep, doc.split()))
        for doc in docs
    ]
Example #44
0
def clean_text(docs):
    """Clean each document: keep alphabetic non-name tokens, lemmatized.

    A token is kept when ``str.isalpha`` is true for it and it does not
    appear verbatim in the NLTK names corpus; kept tokens are lower-cased
    before lemmatization.

    :param docs: iterable of document strings.
    :returns: list of cleaned documents.
    """
    excluded = set(names.words())
    lemmatize = WordNetLemmatizer().lemmatize

    def _clean_one(doc):
        alphabetic = (tok for tok in doc.split() if tok.isalpha())
        return ' '.join(
            lemmatize(tok.lower()) for tok in alphabetic
            if tok not in excluded)

    return list(map(_clean_one, docs))
def preprocessing(data):
    """Strip non-letter tokens and names from each document, then lemmatize.

    Tokens are lemmatized as written (no lower-casing).  Prints
    ``'preprocessing complete'`` once all documents are processed.

    :param data: iterable of document strings.
    :returns: list of cleaned documents, tokens joined by a single space.
    """
    name_vocab = set(names.words())
    wnl = WordNetLemmatizer()

    data_cleaned = []
    for doc in data:
        kept = []
        for token in doc.split():
            # Skip anything that is not purely letters or is a known name.
            if not is_letter(token) or token in name_vocab:
                continue
            kept.append(wnl.lemmatize(token))
        data_cleaned.append(' '.join(kept))

    print('preprocessing complete')
    return data_cleaned
Example #46
0
def common_unigrams():
    ''' Collects the distinct lower-cased words of four NLTK corpora.

    The brown, names, words and reuters corpora are read in that order and
    merged into a single set.

    :returns: The unique words.
    '''
    unique = set()
    for corpus in (brown, names, words, reuters):
        unique |= set(lower_all(corpus.words()))
    return unique
Example #47
0
def clean_text(docs):
    """Produce a cleaned copy of every document in ``docs``.

    Whitespace-separated tokens that pass ``letters_only`` and are not in
    the NLTK names corpus are lower-cased, lemmatized and re-joined with
    single spaces.

    :param docs: iterable of document strings.
    :returns: list of cleaned documents.
    """
    all_names = set(names.words())
    lemmatizer = WordNetLemmatizer()

    def _lemmas(doc):
        for word in doc.split():
            if letters_only(word) and word not in all_names:
                yield lemmatizer.lemmatize(word.lower())

    return [" ".join(_lemmas(doc)) for doc in docs]
Example #48
0
class EmailAddress(object):
    """Generator of fake e-mail addresses built from NLTK word lists.

    User names are drawn from the female names corpus and domains from the
    English words corpus combined with a weighted choice of TLD.

    ``urls`` and ``emails`` are per-instance lists.  They used to be mutable
    class attributes, which made every instance append to (and sample from)
    the same shared lists.
    """
    # Loaded once at class-definition time; only read, never mutated.
    word_list = words.words(fileids=['en'])
    name_list = names.words(fileids=['female.txt'])

    def __init__(self, address_count):
        """Generate ``address_count`` addresses over a smaller pool of URLs.

        One URL is created for every three addresses (plus one), and each
        address picks its domain at random from that pool.
        """
        self.urls = [self.gen_url()
                     for _ in range((address_count // 3) + 1)]
        self.emails = [
            "@".join([self.gen_username(), random.choice(self.urls)])
            for _ in range(address_count)
        ]

    def __str__(self):
        """Return all generated addresses, one per line."""
        return "\n".join(self.emails)

    def __iter__(self):
        """Iterate over the generated addresses in creation order."""
        return iter(self.emails)

    def sample_emails(self, count):
        """Return ``count`` distinct addresses chosen at random.

        Raises ``ValueError`` (from ``random.sample``) when ``count`` exceeds
        the number of generated addresses.
        """
        return random.sample(self.emails, count)

    def weighted_choice(self, choices):
        """simple weighted selection from 'dict[k] = v' where v is int

        Returns ``None`` when ``choices`` is empty.
        """
        weight_total = sum(choices.values())
        rand_val = random.uniform(0, weight_total)

        # Walk the cumulative weights until the random value is covered.
        cumulative = 0
        for key, weight in choices.items():
            cumulative += weight
            if cumulative >= rand_val:
                return key
        return None

    def gen_url(self):
        """generates a single, random URL with dns.tld structure"""
        return ".".join([self.gen_dns(), self.gen_tld()])

    def gen_dns(self):
        """generates a single, random word to create a fake dns entry"""
        return random.choice(self.word_list).lower()

    def gen_tld(self):
        """Pick a top-level domain, weighted 5:2:1 for com/net/org."""
        # original tlds = ['com', 'org', 'net', 'int', 'edu', 'gov', 'mil']
        tlds = {'com': 5, 'net': 2, 'org': 1}
        return self.weighted_choice(tlds)

    def gen_username(self):
        """Return a random lower-cased name to use as the local part."""
        return random.choice(self.name_list).lower()
Example #49
0
def clean_text(docs):
    """Return the documents with names and non-letter tokens removed.

    For each document, tokens accepted by ``letter_only`` and absent from
    the NLTK names corpus are lower-cased and lemmatized, then joined with
    single spaces.

    :param docs: iterable of document strings.
    :returns: list of cleaned documents.
    """
    wnl = WordNetLemmatizer()
    name_vocab = set(names.words())

    cleaned_data = []
    for doc in docs:
        lemmas = (
            wnl.lemmatize(token.lower())
            for token in doc.split()
            if letter_only(token) and token not in name_vocab
        )
        cleaned_data.append(' '.join(lemmas))

    return cleaned_data
Example #50
0
def pre_filter(iter):
    """Yield each line with every known first name replaced by ``'he'``.

    ``iter`` yields lines, each of which is an iterable of strings.  Every
    string is split on whitespace and tokens found in the lower-cased NLTK
    names corpus are replaced.  Every 100000 lines, the process named
    ``"PoolWorker-1"`` reports progress through ``out_error``.
    """
    nameswords = {word.lower() for word in names.words()}

    def replace(s):
        tokens = s.split()
        for idx, token in enumerate(tokens):
            if token in nameswords:
                tokens[idx] = 'he'
        return ' '.join(tokens)

    for i, line in enumerate(iter):
        if i % 100000 == 0:
            # Only one worker reports, scaled by the CPU count.
            worker = str(mp.current_process().name.strip())
            if worker == "PoolWorker-1":
                out_error(
                    "Processed " + str(i * mp.cpu_count()) + " lines.",
                    False)
        yield [replace(c) for c in line]
Example #51
0
 def frequency(self,text):
     """Return the Lancaster stems of the non-excluded tokens of ``text``.

     Tokens come from ``self.tokenize``.  A token is skipped when it appears
     in the English stop words, the NLTK names corpus, or the module-level
     ``Li``, ``CC`` and ``Li1`` lists.  Each surviving token is stemmed and
     followed by a single space, so a non-empty result ends with a space.

     The output is unchanged from the earlier implementation, which rebuilt
     the exclusion list and the stemmer for every token and also ran a
     WordNet lemmatizer whose result was thrown away.

     :param text: raw text to process.
     :returns: the space-terminated stems concatenated into one string.
     """
     # Loop-invariant: build once instead of once per token.
     excluded = stopwords.words('english')+names.words()+Li+CC+Li1
     stemmer = LancasterStemmer()

     pieces = []
     for token in self.tokenize(text):
         if token not in excluded:
             pieces.append(stemmer.stem(token) + " ")
     return "".join(pieces)
def _predict_gender_init():
    """Train and return a Naive Bayes gender classifier from name lists.

    The lists are read from ``male_nms.txt`` / ``female_nms.txt`` in the
    current working directory; when either is missing, ``male.txt`` /
    ``female.txt`` in the same directory are used instead.

    :returns: an ``nltk.NaiveBayesClassifier`` trained on all lower-cased
        names, featurized with ``gender_features_2``.
    """
    male_pth = os.path.abspath('male_nms.txt')
    female_pth = os.path.abspath('female_nms.txt')
    # os.path.abspath never raises for a missing file, so the previous
    # ``except OSError`` fallback could not run; test existence explicitly.
    if not (os.path.isfile(male_pth) and os.path.isfile(female_pth)):
        male_pth = os.path.abspath('male.txt')
        female_pth = os.path.abspath('female.txt')

    labeled_names = ([(name.lower(), 'male')
                      for name in names.words(male_pth)] +
                     [(name.lower(), 'female')
                      for name in names.words(female_pth)])

    random.shuffle(labeled_names)

    # we use the feature extractor to process the names data.
    train_set = [(gender_features_2(n), gender)
                 for (n, gender) in labeled_names]

    # The training set is used to train a new "naive Bayes" classifier.
    return nltk.NaiveBayesClassifier.train(train_set)
def main():
    """Train a name-gender classifier and print its dev-test errors.

    The shuffled NLTK names are split into a training part (index 1500
    onwards) and a dev-test part (indices 500-1499); the first 500 names are
    left out of both.  The function prints a few sample classifications, the
    dev-test accuracy, the five most informative features and every
    misclassified dev-test name.
    """
    # Load the labelled names
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(labeled_names)

    # Split the names (an earlier two-way split was overwritten before use
    # and has been dropped, as has the never-used test feature set)
    train_names = labeled_names[1500:]
    devtest_names = labeled_names[500:1500]
    train_set = [(gender_features(n), gender) for (n, gender) in train_names]
    devtest_set = [(gender_features(n), gender)
                   for (n, gender) in devtest_names]

    # Train the classifier
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    # Print the results
    # NOTE(review): the samples below are featurized with gender_features2
    # while training uses gender_features -- confirm this is intended.
    print(gender_features2("John"))
    print(classifier.classify(gender_features2('Neo')))
    print(classifier.classify(gender_features2('Trinity')))
    print(nltk.classify.accuracy(classifier, devtest_set))
    # show_most_informative_features prints its own table and returns None,
    # so wrapping it in print() only added a stray "None" line.
    classifier.show_most_informative_features(5)

    # Print the wrong guesses
    errors = [
        (tag, guess, name)
        for (name, tag) in devtest_names
        for guess in [classifier.classify(gender_features(name))]
        if guess != tag
    ]

    for (tag, guess, name) in sorted(errors):
        print('correct={:<8} guess={:<8s} name={:<30}'.format(
            tag, guess, name))
    def email_test(self, email):
        """Fit a Naive Bayes spam model on ./data and print posteriors.

        The method prints one sample ham and one sample spam file, loads
        every ``*.txt`` under ``./data/ham/`` (label 0) and ``./data/spam/``
        (label 1), vectorizes the cleaned training texts, and prints the
        posterior computed by ``self.get_posterior`` for ``email``.

        Side effect: sets ``self.all_names`` and ``self.lemmatizer`` before
        ``self.clean_text`` is called.

        :param email: the messages to score; passed to ``self.clean_text``.
        """
        ham = './data/ham/ham.txt'
        spam = './data/spam/spam.txt'
        with open(ham, 'r') as infile:
            ham_sample = infile.read()
        print(ham_sample)
        print('-----------------------')
        with open(spam, 'r') as infile:
            spam_sample = infile.read()
        print(spam_sample)

        cv = CountVectorizer(stop_words="english", max_features=500)
        emails, labels = [], []
        # Label 0 = ham, label 1 = spam.
        for label, file_path in enumerate(('./data/ham/', './data/spam/')):
            for filename in glob.glob(os.path.join(file_path, '*.txt')):
                with open(filename, 'r', encoding='ISO-8859-1') as infile:
                    emails.append(infile.read())
                    labels.append(label)

        self.all_names = set(names.words())
        self.lemmatizer = WordNetLemmatizer()

        cleaned_emails = self.clean_text(emails)
        term_docs = cv.fit_transform(cleaned_emails)
        print(term_docs[0])
        # The unused vocabulary / feature-name lookups that used to follow
        # were dead code and have been removed.

        label_index = self.get_label_index(labels)
        prior = self.get_prior(label_index)
        smoothing = 1
        likelihood = self.get_likelihood(term_docs, label_index, smoothing)

        cleaned_test = self.clean_text(email)
        term_docs_test = cv.transform(cleaned_test)
        posterior = self.get_posterior(term_docs_test, prior, likelihood)
        print(posterior)
    def _remove_names(self):
        """Drop every token that appears in NLTK's names corpus.

        ``self._tokens`` is replaced by a new ``collections.Counter`` holding
        only the surviving tokens with their original counts; the number of
        removed tokens is logged at INFO level.
        """
        name_set = set(names.words())

        survivors = collections.Counter()
        for token, count in self._tokens.items():
            if token in name_set:
                continue
            survivors[token] = count

        # logging
        num_removed = len(self._tokens) - len(survivors)
        _logger.info(('{} name tokens removed').format(num_removed))

        self._tokens = survivors
Example #56
0
def main():
    """Clean the 20 newsgroups posts and print the top-500 vocabulary.

    Each post is reduced to its letters-only, non-name tokens (lower-cased
    and lemmatized) before a ``CountVectorizer`` limited to 500 features is
    fitted on the cleaned posts.
    """
    cv = CountVectorizer(stop_words="english", max_features=500)
    groups = fetch_20newsgroups()
    # A set gives O(1) membership; testing ``word in <numpy array>`` scanned
    # the whole names array for every word of every post.
    all_names = set(names.words())
    lemmatizer = WordNetLemmatizer()
    cleaned: list = [
        " ".join([
            lemmatizer.lemmatize(word.lower()) for word in post.split()
            if letters_only(word) and word not in all_names
        ])
        for post in groups.data
    ]
    # Only the fitted vocabulary is needed; the document-term matrix is not.
    cv.fit_transform(cleaned)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # use get_feature_names_out() if the project targets a newer release.
    print(cv.get_feature_names())
Example #57
0
def generate_blacklist_roles():
    """Build ``blacklist_roles.csv``: character strings that are not names.

    Reads the first-name and surname lists from ``DATA_PATH`` and the
    principals table from ``OUTPUT_PATH``, writes the intermediate
    ``roles3.csv`` to ``TEMP_PATH`` and the final single-column
    ``blacklist_roles.csv`` to ``OUTPUT_PATH``.  Returns nothing.
    """
    # Reference name lists; surnames are title-cased to match the
    # title-cased character strings built below.
    firstnames = SFrame.read_csv(f"{DATA_PATH}/firstnames.csv",
                                 verbose=False)["Name"]
    surenames = SFrame.read_csv(f"{DATA_PATH}/surenames.csv",
                                verbose=False)["name"]
    surenames = surenames.apply(lambda n: n.title())
    # Principals table: "characters" is parsed as a list, "\N" marks missing.
    sf = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz",
                         delimiter="\t",
                         column_type_hints={"characters": list},
                         na_values=["\\N"])
    # Keep acting credits only, restricted to the four columns used later.
    sf = sf.filter_by(["actor", "actress"], "category")["tconst", "ordering",
                                                        "characters", "nconst"]
    # Restrict to titles of type "movie".
    # NOTE(review): no join key is given -- presumably joins on the shared
    # "tconst" column; confirm against imdb_data.title.
    sf = sf.join(imdb_data.title[imdb_data.title["titleType"] == "movie"])
    # One row per individual character string, title-cased.
    sf = sf.stack("characters", "character")
    sf["character"] = sf["character"].apply(lambda c: c.title())
    sf.export_csv(f"{TEMP_PATH}/roles3.csv")

    # Characters that occur more than once with the same person (nconst).
    whitelist = sf.groupby(key_column_names=['character', "nconst"],
                           operations={'count': agg.COUNT()})
    whitelist = whitelist[whitelist["count"] > 1]['character']
    # NOTE(review): the third positional argument is presumably ``exclude``,
    # i.e. the whitelisted characters are removed here -- confirm.
    sf = sf.filter_by(whitelist, "character", True)
    # Per character: average billing position and number of occurrences.
    sf = sf.groupby(key_column_names=['character'],
                    operations={
                        'ordering': agg.AVG("ordering"),
                        'count': agg.COUNT()
                    })
    # Use the last space-separated token of the character string as "name"
    # and drop rows where it is a known first name or surname.
    sf["name"] = sf["character"].apply(lambda c: c.split(" ")[-1].strip())
    sf = sf.filter_by(names.words(), "name", exclude=True)
    sf = sf.filter_by(surenames, "name", exclude=True)
    sf = sf.filter_by(firstnames, "name", exclude=True)
    # NOTE(review): ``False`` is presumably ``ascending`` (most frequent
    # first) -- confirm.
    sf = sf.sort("count", False)
    # Keep only characters whose average "ordering" is above 3.
    sf = sf[sf['ordering'] > 3]
    # Title-cased WordNet words that are not also NLTK names.
    w = {x.replace("_", " ").title()
         for x in wordnet.words()} - set(names.words())
    # "set" = the tokens of the character string that are such words.
    sf["set"] = sf["character"].apply(lambda x: x.split(" "))
    sf["set"] = sf["set"].apply(lambda x: w & set(x))
    # Final selection: count > 11, or 1 < count < 10 with at least one
    # dictionary word.  Counts of exactly 10 or 11 match neither branch.
    sf = sf[sf['count'] > 11].append(sf[(sf['count'] > 1) & (sf['count'] < 10)
                                        & (sf["set"] != [])])
    sf[["character"]].export_csv(f"{OUTPUT_PATH}/blacklist_roles.csv")
Example #58
0
def lecture():
    """Walk through word counts and NMF topics on the 20 newsgroups data.

    Steps, in order: fit a 500-feature ``CountVectorizer`` on the raw posts
    and print the matrix and the word frequencies; plot the ten most
    frequent words; clean the first 250 posts (letters-only, non-name
    tokens, lemmatized); refit the vectorizer on the cleaned tokens; fit a
    100-component NMF and print the eight top-weighted features per topic.
    """
    groups = fetch_20newsgroups()
    cv = CountVectorizer(stop_words="english", max_features=500)
    bag_of_words = cv.fit_transform(groups.data)
    print(bag_of_words)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = sorted(
        ((word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()),
        key=lambda x: x[1], reverse=True)
    print(words_freq)
    for word, count in words_freq:
        print(word + ":", count)

    words = [word for word, _ in words_freq]
    freqs = [count for _, count in words_freq]

    # Plot frequency
    plt.bar(np.arange(10), freqs[:10], align='center')
    plt.xticks(np.arange(10), words[:10])
    plt.ylabel('Frequency')
    plt.title("Top 10 Words")
    plt.show()

    # Test if token is a word
    def letters_only(astr):
        return astr.isalpha()

    # Remove names from words and perform word lemmatization
    # NOTE(review): every kept token becomes its own "document" here (the
    # list is extended with words, not with one joined string per post) --
    # confirm this is intended.
    cleaned = []
    all_names = set(x.lower() for x in names.words())
    lemmatizer = WordNetLemmatizer()
    for post in groups.data[:250]:
        cleaned.extend(lemmatizer.lemmatize(word.lower())
                       for word in post.split()
                       if letters_only(word) and word.lower() not in all_names)

    # Fit once: the vectorizer used to be fitted twice on the same input.
    transformed = cv.fit_transform(cleaned)
    # Fetched once and reused for every topic instead of once per topic.
    feature_names = cv.get_feature_names()
    print(feature_names)
    nmf = NMF(n_components=100, random_state=43).fit(transformed)

    for topic_idx, topic in enumerate(nmf.components_):
        label = '{}: '.format(topic_idx)
        print(label, " ".join([feature_names[i]
                               for i in topic.argsort()[:-9:-1]]))
Example #59
0
def gender_classifier(first_name):
    """Predict the gender label of ``first_name`` with a Naive Bayes model.

    The lower-cased NLTK names are shuffled with a fixed seed and the first
    70% are used for training, so repeated calls train on the same split.
    The model is retrained on every call.

    :param first_name: the name to classify.
    :returns: the label predicted by the classifier.
    """
    # create a list of tuples with the name and the gender
    labeled_names = ([(name.lower(), 'male')
                      for name in names.words('male.txt')] +
                     [(name.lower(), 'female')
                      for name in names.words('female.txt')])

    # The shuffle uses the stdlib ``random`` module, so seeding numpy (as
    # was done before) never made it reproducible.  A dedicated seeded
    # generator fixes that without touching the global ``random`` state.
    random.Random(44).shuffle(labeled_names)
    # Kept so the global numpy RNG is left in the same state as before.
    np.random.seed(44)

    gender_sets = [(gender_identification(f_name), gender)
                   for (f_name, gender) in labeled_names]

    # 70% of the data for training; the rest was only used for an accuracy
    # figure that was computed and then discarded.
    split = int(len(gender_sets) * .7)
    train_set = gender_sets[:split]

    # build a classifier and classify the requested name
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    return classifier.classify(gender_identification(first_name))
Example #60
0
def gender_match(tree, pos, pro):
    """ Takes a proposed antecedent and pronoun and checks whether
    they match in gender. Only checks for mismatches between singular
    proper name antecedents and singular pronouns.

    :param tree: parse tree containing the proposed antecedent.
    :param pos: tree position of the proposed antecedent in ``tree``.
    :param pro: the pronoun being resolved.
    :returns: ``False`` when a nominal child of ``tree[pos]`` clashes in
        gender with ``pro``, otherwise ``True``.
    """
    # Sets, not generator expressions: ``in`` on a generator consumes it, so
    # every lookup after the first saw a partly or fully exhausted iterator.
    male_names = {name.lower() for name in names.words('male.txt')}
    female_names = {name.lower() for name in names.words('female.txt')}
    male_pronouns = ["he", "him", "himself"]
    female_pronouns = ["she", "her", "herself"]
    neuter_pronouns = ["it", "itself"]

    for c in tree[pos]:
        if not (isinstance(c, nltk.Tree) and c.label() in nominal_labels):
            continue

        head = c.leaves()[0]
        if head.lower() in male_names:
            # A recognized male name cannot be the antecedent of a female
            # or neuter pronoun.
            if pro in female_pronouns or pro in neuter_pronouns:
                return False
        elif head.lower() in female_names:
            # A recognized female name cannot be the antecedent of a male
            # or neuter pronoun.
            if pro in male_pronouns or pro in neuter_pronouns:
                return False
        elif head.isdigit():
            # A numeral only matches a neuter pronoun.
            if pro in male_pronouns or pro in female_pronouns:
                return False

    return True