Example #1
   def __init__(self):
      '''initialize and train brill and naive bayes classifiers'''
     
      #TODO: Fix bug where it loads tagger from calling module dir
      if exists(file):
         input = open(file, 'rb')
         self.classifier = load(input)
         input.close()
         print 'Successfully loaded saved classifier'
         return

      self.bayes = NaiveBayesTagger()
      boundary = int(len(brown.tagged_sents())*0.8)
      train = brown.tagged_sents(simplify_tags=True)[:boundary]

      brill_trainer = FastBrillTaggerTrainer(initial_tagger = self.bayes,
                                             templates = templates,
                                             trace = 3,
                                             deterministic = True)
         
      self.classifier = brill_trainer.train(train, max_rules=10)
         
      print 'Saving Taggers to file: "pos_tagger.pickle"'
      output = open(file, 'wb')
      dump(self.classifier, output, 1)
      output.close()
Example #2
def main():
    # run Simple unigram tagger
    brown_news_tagged = brown.tagged_sents(categories='news')
    brown_train = brown_news_tagged[100:]
    brown_test = brown_news_tagged[:100]

    nn_tagger = nltk.DefaultTagger('NN')
    ut2 = nltk.UnigramTagger(brown_train, backoff=nn_tagger)
    simpleUnigramTagger = SimpleUnigramTagger(brown_train, backoff=nn_tagger)
    print 'Simple Unigram tagger accuracy: %4.1f%%' % ( 100.0 * simpleUnigramTagger.evaluate(brown_test))
    print 'Unigram tagger accuracy: %4.1f%%' % ( 100.0 * ut2.evaluate(brown_test))

    # run affix tagger with entropy
    brown_news_tagged = brown.tagged_sents(categories='news')
    brown_train = brown_news_tagged[:int(0.8*len(brown_news_tagged))]
    rest = brown_news_tagged[int(0.8*len(brown_news_tagged)):]
    brown_development = rest[:int(0.5*len(rest))]
    brown_test = rest[int(0.5*len(rest)):]
    
    affix_tagger = nltk.AffixTagger(brown_train, backoff= nltk.DefaultTagger('NN') , cutoff=2)
    nltk.AffixTagger._train = _train
    nltk.AffixTagger.H = _H
    optcutoff = optimize_parameter()
    print "the optimal cutoff param is: %d " % optcutoff 
    affix_tagger2 = nltk.AffixTagger(brown_train, backoff= nltk.DefaultTagger('NN') , cutoff=optcutoff)

    print 'Affix tagger accuracy: %4.1f%%' % ( 100.0 * affix_tagger.evaluate(brown_test))
    print 'Affix tagger accuracy with entropy: %4.1f%%' % ( 100.0 * affix_tagger2.evaluate(brown_test))
def demo(train_size=100, test_size=100, java_home="/usr/local/jdk1.5.0/", mallet_home="/usr/local/mallet-0.4"):
    from nltk.corpus import brown
    import textwrap

    # Define a very simple feature detector
    def fd(sentence, index):
        word = sentence[index]
        return dict(word=word, suffix=word[-2:], len=len(word))

    # Let nltk know where java & mallet are.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    # Get the training & test corpus.  We simplify the tagset a little:
    # just the first 2 chars.
    def strip(corpus):
        return [[(w, t[:2]) for (w, t) in sent] for sent in corpus]

    brown_train = strip(brown.tagged_sents(categories="news")[:train_size])
    brown_test = strip(brown.tagged_sents(categories="editorial")[:test_size])

    crf = MalletCRF.train(fd, brown_train, transduction_type="VITERBI")  #'/tmp/crf-model',
    sample_output = crf.tag([w for (w, t) in brown_test[5]])
    acc = nltk.tag.accuracy(crf, brown_test)
    print "\nAccuracy: %.1f%%" % (acc * 100)
    print "Sample output:"
    print textwrap.fill(
        " ".join("%s/%s" % w for w in sample_output), initial_indent="  ", subsequent_indent="  "
    ) + "\n"

    # Clean up
    print "Clean-up: deleting", crf.filename
    os.remove(crf.filename)

    return crf
Example #4
def demo(train_size=100, test_size=100, java_home=None, mallet_home=None):
    from nltk.corpus import brown
    import textwrap

    # Define a very simple feature detector
    def fd(sentence, index):
        word = sentence[index]
        return dict(word=word, suffix=word[-2:], len=len(word))

    # Let nltk know where java & mallet are.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    # Get the training & test corpus.  We simplify the tagset a little:
    # just the first 2 chars.
    def strip(corpus): return [[(w, t[:2]) for (w,t) in sent]
                               for sent in corpus]
    brown_train = strip(brown.tagged_sents(categories='news')[:train_size])
    brown_test = strip(brown.tagged_sents(categories='editorial')[:test_size])

    crf = MalletCRF.train(fd, brown_train, #'/tmp/crf-model',
                          transduction_type='VITERBI')
    sample_output = crf.tag([w for (w,t) in brown_test[5]])
    acc = nltk.tag.accuracy(crf, brown_test)
    print('\nAccuracy: %.1f%%' % (acc*100))
    print('Sample output:')
    print(textwrap.fill(' '.join('%s/%s' % w for w in sample_output),
                        initial_indent='  ', subsequent_indent='  ')+'\n')

    # Clean up
    print('Clean-up: deleting', crf.filename)
    os.remove(crf.filename)

    return crf
Example #5
def test_sentences(categories=[]):
	"""returns a test sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
	if len(categories) == 0:
		categories = brown.categories() # use all of the brown categories
	sents = []
	for category in categories:
		total = len(brown.tagged_sents(categories=category))
		start = int(TEST_PROPORTION * total) # use the last k sentences for test
		sents += brown.tagged_sents(categories=category, simplify_tags=True)[-start:-1]
	return sents
Example #6
def training_sentences(use=1.0, categories=[]):
	"""returns a training sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
	if len(categories) == 0:
		categories = brown.categories() # use all of the brown categories
	sents = []
	for category in categories:
		total = len(brown.tagged_sents(categories=category))
		max = int((1-TEST_PROPORTION) * use * total) - 1 # use the first n sentences for training
		sents += brown.tagged_sents(categories=category, simplify_tags=True)[0:max]
	return sents
def exercise2():
    print
    print "Exercise 2:"
    brown_news_tagged_sents = bn.tagged_sents(categories = 'news')
    brown_lore_tagged_sents = bn.tagged_sents(categories = 'lore')
    trigram_tagger = nltk.TrigramTagger(brown_news_tagged_sents)
    brown_news_eval = trigram_tagger.evaluate(brown_news_tagged_sents)
    brown_lore_eval = trigram_tagger.evaluate(brown_lore_tagged_sents)
    print "Evaluation of the trigram tagger on 'News': %f " % brown_news_eval
    print "Evaluation of the trigram tagger on 'Lore': %f " % brown_lore_eval
    print
Example #8
def precisionRecall():

    def tag_list(tagged_sents):
        return [tag for sent in tagged_sents for (word, tag) in sent]

    def apply_tagger(tagger, corpus):
        return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]
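    # t2 is not defined in this snippet; a commonly used stand-in (assumed here,
    # following the NLTK book's backoff chain) is a unigram+bigram tagger
    # trained on the Brown 'news' category:
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(brown.tagged_sents(categories='news'), backoff=t0)
    t2 = nltk.BigramTagger(brown.tagged_sents(categories='news'), backoff=t1)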

    gold = tag_list(brown.tagged_sents(categories='editorial'))
    test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))
    cm = nltk.ConfusionMatrix(gold, test)
    print cm.pp(sort_by_count=True, show_percents=True, truncate=9) 
Example #9
   def evaluate(self):
      '''run tests on conll2000 and treebank data'''

      test = treebank.tagged_sents()[:100]
      treebank_result = (100*self.classifier.evaluate(test))

      test = conll2000.tagged_sents()[:100]
      conll2000_result = (100*self.classifier.evaluate(test))

      test = brown.tagged_sents()[int(len(brown.tagged_sents())*0.8):]
      brown_result = (100*self.classifier.evaluate(test))

      return (treebank_result, conll2000_result, brown_result)
Example #10
def testSet():

    tagged_sents = list(brown.tagged_sents(categories='news'))
    random.shuffle(tagged_sents)
    size = int(len(tagged_sents) * 0.1)
    train_set, test_set = tagged_sents[size:], tagged_sents[:size]

    file_ids = brown.fileids(categories='news')
    size = int(len(file_ids) * 0.1)
    train_set = brown.tagged_sents(file_ids[size:])
    test_set = brown.tagged_sents(file_ids[:size])

    train_set = brown.tagged_sents(categories='news')
    test_set = brown.tagged_sents(categories='fiction')
Example #11
 def get_tagged_tokens(self, corpus=TAGGED, testing=False):
     """This tokenizes, segments, and tags all the files in a directory."""
     if testing:
         # train against a smaller version of the corpus so that it
         # doesn't take years during testing.
         tagger = build_trainer(brown.tagged_sents(categories='news'))
     else:
         tagger = build_trainer(brown.tagged_sents())
     tokens_and_spans = self.tokenize_corpus(corpus)
     tagged_spanned_tokens = tag_token_spans(
         tokens_and_spans,
         tagger,
     )
     return tagged_spanned_tokens
def exercise1():
    print
    print "Exercise 1:"
    brown_news_tagged_sents = bn.tagged_sents(categories = 'news')
    brown_lore_tagged_sents = bn.tagged_sents(categories = 'lore')
    unigram_tagger = nltk.UnigramTagger(brown_news_tagged_sents)
    brown_news_eval = unigram_tagger.evaluate(brown_news_tagged_sents)
    brown_lore_eval = unigram_tagger.evaluate(brown_lore_tagged_sents)
    print "Evaluation of the unigram tagger on 'News': %f " % brown_news_eval
    print "Evaluation of the unigram tagger on 'Lore': %f " % brown_lore_eval
    brown_lore = bn.sents(categories = 'lore')
    b_lore = unigram_tagger.tag(brown_lore[200])
    print "Tagged words for 200th sentence of 'Brown' corpus of category 'Lore' is: "
    print b_lore
    print
Example #13
   def __init__(self):

      boundary = int(len(brown.tagged_sents())*0.8)
      train_naive = brown.tagged_sents(simplify_tags=True)[:boundary] 
      temp_train_data = []
      for sentence in train_naive:
         untagged_sent = untag(sentence)
         history = []
         for i, (word, tag) in enumerate(sentence):
            temp_train_data.append((self.featextract(untagged_sent,
                                                      i,
                                                      history),
                                                      tag))
            history.append(tag)
      self.bayes=naivebayes.NaiveBayesClassifier.train(temp_train_data)
Example #14
def ch05_11_train_test_affix_tagger():
  from nltk.corpus import brown
  fd = nltk.FreqDist(brown.words(categories="news"))
  cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))
  most_freq_pos = dict((word, cfd[word].max()) for word in fd.keys())
  affix_tagger = nltk.AffixTagger(model=most_freq_pos)
  print affix_tagger.evaluate(brown.tagged_sents(categories="editorial"))
Example #15
def exploreTaggedCorpora():

    brown_learned_text = brown.words(categories="learned")
    sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == "often"))

    brown_lrnd_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == "often"]
    fd = nltk.FreqDist(tags)
    fd.tabulate()

    def process(sentence):
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
            if t1.startswith("V") and t2 == "TO" and t3.startswith("V"):
                print w1, w2, w3

    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)

    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged)

    for word in data.conditions():
        if len(data[word]) > 3:
            tags = data[word].keys()
            print word, " ".join(tags)
Example #16
def getTaggerAndTestSetInSimplifiedMode(taggerName):
    brown_news_taggedS = brown.tagged_sents(categories='news', simplify_tags=True)
    brown_trainS = brown_news_taggedS[100:]
    brown_testS = brown_news_taggedS[:100]
    
    nn_taggerS = nltk.DefaultTagger('NN')
    regexp_taggerS = nltk.RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
                                       (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                                       (r'.*able$', 'JJ'),                # adjectives
                                       (r'.*ness$', 'NN'),                # nouns formed from adjectives
                                       (r'.*ly$', 'RB'),                  # adverbs
                                       (r'.*s$', 'NNS'),                  # plural nouns
                                       (r'.*ing$', 'VBG'),                # gerunds
                                       (r'.*ed$', 'VBD'),                 # past tense verbs
                                       (r'.*', 'NN')                      # nouns (default)
                                       ],backoff=nn_taggerS)
    at2S = nltk.AffixTagger(brown_trainS, backoff=regexp_taggerS)
    ut3S = nltk.UnigramTagger(brown_trainS, backoff=at2S)
    ct2S = nltk.NgramTagger(2, brown_trainS, backoff=ut3S)
    if taggerName == "DefaultTagger":
        return nn_taggerS,brown_testS
    else:
        if taggerName == "RegExpTagger":
            return regexp_taggerS, brown_testS
        else:
            if taggerName == "AffixTagger":
                return at2S,brown_testS
            else:
                if taggerName == "UnigramTagger":
                    return ut3S,brown_testS
                else:
                    if taggerName == "BigramTagger":
                        return ct2S,brown_testS
Example #17
def main():
    nltk.TaggerI.evaluate2 = evaluate2
    
    brown_news_tagged = brown.tagged_sents(categories='news')
    brown_train = brown_news_tagged[100:]
    brown_test = brown_news_tagged[:100]
    
    regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
                                       (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                                       (r'.*able$', 'JJ'),                # adjectives
                                       (r'.*ness$', 'NN'),                # nouns formed from adjectives
                                       (r'.*ly$', 'RB'),                  # adverbs
                                       (r'.*s$', 'NNS'),                  # plural nouns
                                       (r'.*ing$', 'VBG'),                # gerunds
                                       (r'.*ed$', 'VBD'),                 # past tense verbs
                                       (r'.*', 'UNKNOWN')                 # unknown (default)
                                       ],backoff=None)
    at2 = nltk.AffixTagger(brown_train, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(brown_train, backoff=at2)
    ct2 = nltk.NgramTagger(2, brown_train, backoff=ut3)

    e = regexp_tagger.evaluate2(brown_test)
    print "evaluate2 regExp(default unknown) = accoracy unkown words: %f ,accuracy known words: " %e[0],e[1]
    e = at2.evaluate2(brown_test)
    print "evaluate2 affix(regExp(default unknown)) = accoracy unkown words: %f ,accuracy known words: " %e[0],e[1]
    e= ut3.evaluate2(brown_test)
    print "evaluate2 unigram(affix(regExp(default unknown))) = accoracy unkown words: %f ,accuracy known words: " %e[0],e[1]
    e= ct2.evaluate2(brown_test)
    print "evaluate2 bigram(unigram(affix(regExp(default unknown)))) = accoracy unkown words: %f ,accuracy known words: " %e[0],e[1]
    def posTagging(self, s):
        """
        POS-tag a single tokenized segment.
        input: ['i','love','you']
        output: [('i', 'PRON'), ('love', 'VERB'), ('you', 'PRON')]
        """
        brown_tagged_sents = brown.tagged_sents(
            tagset='universal', categories='news')

        default_tagger = nltk.DefaultTagger('NN')

        month = [u'january', u'february', u'march', u'april', u'may', u'june',
                 u'july', u'august', u'september', u'october', u'november', u'december']

        np_words = [w.lower() for w in names.words()] + month
        np_tags = dict((word, 'NP') for word in np_words)
        np_tagger = nltk.UnigramTagger(
            model=np_tags, backoff=default_tagger)

        brown_unigram_tagger = nltk.UnigramTagger(
            brown_tagged_sents, backoff=np_tagger)
        brown_bigram_tagger = nltk.BigramTagger(
            brown_tagged_sents, backoff=brown_unigram_tagger)
        brown_trigram_tagger = nltk.TrigramTagger(
            brown_tagged_sents, backoff=brown_bigram_tagger)

        patterns = [(r'\bi\b', 'PRON')]
        regexp_tagger = nltk.RegexpTagger(
            patterns, backoff=brown_trigram_tagger)

        result = regexp_tagger.tag(s)
        return self.encodeutf8(result)
Example #19
def update_category_by_pos():
    from nltk.corpus import brown
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.tag import untag
    from nltk import DecisionTreeClassifier

    def pos_features(sentence, i):
        features = {'suffix(1)':sentence[i][-1:],
                    'suffix(2)':sentence[i][-2:],
                    'suffix(3)':sentence[i][-3:]
                    }
        features['prev-word'] = '<start>' if i==0 else sentence[i-1]
        return features

    print pos_features(brown.sents()[0], 8)

    tagged_sents = brown.tagged_sents(categories='news')
    featuresets = []

    for tagged_sent in tagged_sents:
        untagged_sent = untag(tagged_sent)
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append((pos_features(untagged_sent, i), tag))

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = NaiveBayesClassifier.train(train_set)
    classifier = DecisionTreeClassifier.train(train_set)
    print 'DecisionTree accuracy: %f' % classify.accuracy(classifier, test_set)
Example #20
    def __init__(self):
        # This is our fast Part of Speech tagger
        brown_train = brown.tagged_sents(categories=['news'])
        regexp_tagger = nltk.RegexpTagger(
                [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
                    (r'(-|:|;)$', ':'),
                    (r'\'*$', 'MD'),
                    (r'(The|the|A|a|An|an)$', 'AT'),
                    (r'.*able$', 'JJ'),
                    (r'^[A-Z].*$', 'NNP'),
                    (r'.*ness$', 'NN'),
                    (r'.*ly$', 'RB'),
                    (r'.*s$', 'NNS'),
                    (r'.*ing$', 'VBG'),
                    (r'.*ed$', 'VBD'),
                    (r'.*', 'NN')
                    ])
        self.unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
        self.bigram_tagger = nltk.BigramTagger(brown_train, backoff=self.unigram_tagger)

        # This is our semi-CFG; Extend it according to your own needs
        cfg = {}
        cfg["NNP+NNP"] = "NNP"
        cfg["CD+CD"] = "CD"
        cfg["NN+NN"] = "NNI"
        cfg["NNI+NN"] = "NNI"
        cfg["JJ+JJ"] = "JJ"
        cfg["JJ+NN"] = "NNI"
        cfg["VBN+NNS"] = "NNP"
        self.cfg = cfg

        for i, word in enumerate(STOP_WORDS):
            STOP_WORDS[i] = word
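# A rough, standalone sketch (not part of the original class) of how a semi-CFG
# like self.cfg above is typically applied: repeatedly merge adjacent tagged
# tokens whose combined "TAG1+TAG2" key appears in the grammar.
def merge_with_cfg(tagged_tokens, cfg):
    merged = True
    while merged:
        merged = False
        for i in range(len(tagged_tokens) - 1):
            (w1, t1), (w2, t2) = tagged_tokens[i], tagged_tokens[i + 1]
            new_tag = cfg.get(t1 + "+" + t2)
            if new_tag:
                # collapse the pair into a single multi-word token
                tagged_tokens[i:i + 2] = [(w1 + " " + w2, new_tag)]
                merged = True
                break
    return tagged_tokens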
Example #21
 def read_datas(self):
     brown_tagged_sentence  = brown.tagged_sents()
     brown_sent = brown.sents()
     size = int(len(brown_tagged_sentence) * 0.9)
     train_set =  brown_tagged_sentence[:size]
     test_set = brown_tagged_sentence[size:]
     return (train_set,test_set)
Example #22
def verbs():
    wsj = nltk.corpus.treebank.tagged_words(simplify_tags=True)
    # word_tag_fd = nltk.FreqDist(wsj)
    # print [word + "/" + tag for (word, tag) in word_tag_fd if tag.startswith('V')]
    cfd1 = nltk.ConditionalFreqDist(wsj)
    print cfd1['yield'].keys()
    print cfd1['cut'].keys()
    print [w for w in cfd1.conditions() if 'VD' in cfd1[w] and 'VN' in cfd1[w]]
    idx1 = wsj.index(('kicked', 'VD'))
    print wsj[idx1-4:idx1+1]
    idx2 = wsj.index(('kicked', 'VN'))
    print wsj[idx2-4:idx2+1]

def findtags(tag_prefix, tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
                                   if tag.startswith(tag_prefix))
    return dict((tag, cfd[tag].keys()[:5]) for tag in cfd.conditions())

def process(sentence):
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')) :
            print w1, w2, w3


if __name__ == "__main__":
    tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
    for tag in sorted(tagdict):
        print tag, tagdict[tag]
    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)
Example #23
def create_tagger():
    """Train a tagger from the Brown Corpus. This should not be called very
    often; only in the event that the tagger pickle wasn't found."""
    print "Building tagger..."
    train_sents = brown.tagged_sents()

    # These regexes were lifted from the NLTK book tagger chapter.
    t0 = nltk.RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'), # articles
         (r'.*able$', 'JJ'),              # adjectives
         (r'.*ness$', 'NN'),              # nouns formed from adjectives
         (r'.*ly$', 'RB'),                # adverbs
         (r'.*s$', 'NNS'),                # plural nouns
         (r'.*ing$', 'VBG'),              # gerunds
         (r'.*ed$', 'VBD'),               # past tense verbs
         (r'.*', 'NN')                    # nouns (default)
        ])
    print "got t0"

    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    print "got t1"

    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    print "got t2"

    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    print "Built tagger!"
    return t3
Example #24
File: NERDb.py Project: jamt/IMDBot
 def __init__(self):
   try:
     tagger = cPickle.load(open('nerdb_tagger.pkl'))
   except IOError:
     print 'failed to load nerdb_tagger, recreating...'
     train_sents = conll2000.tagged_sents() + brown.tagged_sents()
     tagger = nltk.DefaultTagger('NN')
     tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
     tagger = nltk.BigramTagger(train_sents, backoff=tagger)
     tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
     cPickle.dump(tagger, open('nerdb_tagger.pkl', 'w'))
     print 'done'
   try:
     chunker = cPickle.load(open('nerdb_chunker.pkl'))
   except IOError:
     print 'failed to load nerdb_chunker, recreating...'
     train_sents = conll2000.chunked_sents()
     chunker = ConsecutiveNPChunker(tagger, train_sents)
     cPickle.dump(chunker, open('nerdb_chunker.pkl', 'w'))
     print 'done'
   self.chunker = chunker
   self.people = [line.strip().split(" ", 1) for line in open('actors_index.txt').readlines()]
   self.people += [line.strip().split(" ", 1) for line in open('actresses_index.txt').readlines()]
   self.movies = [line.strip().split(" ", 1) for line in open('title_index.txt').readlines()]
   self.entity_types = {'PERSON' : self.people, 'MOVIE' : self.movies}
Example #25
def auto_tag(company):
    """
    tag a given text using brown corpus and unigram tagger
    :param company: company whose reviews are tagged
    :return: a list of tagged words
    """
    brown_tagged_sents = brown.tagged_sents(categories = 'news', tagset='universal')
    brown_sents = brown.sents(categories = 'news')

    # open the review of a company, and print error message if company review doesn't exist
    # first deal with unique cases such as General Motors => GM
    if company == 'General Motors':
        company = 'GM'
    elif company == 'Ford Motor Company':
        company = 'Ford'
    try:
        text = open('/Users/vickyzhang/Documents/Python/chart/comp/review/'+ company.capitalize() + '_review.txt').read()
    except FileNotFoundError:
        print('The system doesn\'t have a review for the company you entered. Please enter another company.')

    # normalize (tokenize and lowercase-ize) each word in the string
    text_token = nltk.word_tokenize(text)
    text_normal = [w.lower() for w in text_token]

    # build unigram tagger based on brown corpus, and use it to tag the normalized text
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    text_tagged = unigram_tagger.tag(text_normal)
    return text_tagged
def get_medium_tagged_sentence_tuples(MIN=3, MAX=4):
    '''
    THIS FUNCTION IS NEVER USED
    Produces ~36 POS tags
    '''
    return [[(wrd, simplify(tag)) for wrd, tag in sent[:-1]] for sent in brown.tagged_sents()
                    if MIN < len(sent) <= MAX and sent[-1][0] == '.']
Example #27
 def __init__(self):
     try:
         tagger = cPickle.load(open("nerdb_tagger.pkl"))
     except IOError:
         print "failed to load nerdb_tagger, recreating..."
         train_sents = conll2000.tagged_sents() + brown.tagged_sents()
         tagger = nltk.DefaultTagger("NN")
         tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
         tagger = nltk.BigramTagger(train_sents, backoff=tagger)
         tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
         cPickle.dump(tagger, open("nerdb_tagger.pkl", "w"))
         print "done"
     try:
         chunker = cPickle.load(open("nerdb_chunker.pkl"))
     except IOError:
         print "failed to load nerdb_chunker, recreating..."
         train_sents = conll2000.chunked_sents()
         chunker = ConsecutiveNPChunker(tagger, train_sents)
         cPickle.dump(chunker, open("nerdb_chunker.pkl", "w"))
         print "done"
     self.chunker = chunker
     self.people = [line.strip().split(" ", 1) for line in open("actors_index.txt").readlines()]
     self.people += [line.strip().split(" ", 1) for line in open("actresses_index.txt").readlines()]
     self.movies = [line.strip().split(" ", 1) for line in open("title_index.txt").readlines()]
     self.entity_types = {"PERSON": self.people, "MOVIE": self.movies}
     self.numbers = eval(open("numbers.txt").read())
Example #28
    def get_pos_tagger(self):
        from nltk.corpus import brown

        regexp_tagger = RegexpTagger(
            [
                (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),  # articles
                (r'.*able$', 'JJ'),  # adjectives
                (r'.*ness$', 'NN'),  # nouns formed from adjectives
                (r'.*ly$', 'RB'),  # adverbs
                (r'.*s$', 'NNS'),  # plural nouns
                (r'.*ing$', 'VBG'),  # gerunds
                (r'.*ed$', 'VBD'),  # past tense verbs
                (r'.*', 'NN'),  # nouns (default)
            ]
        )
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'), (r'(Every|every|All|all)$', 'univ_quant')],
            backoff=trigram_tagger,
        )

        return main_tagger
Example #29
   def _setSelectedPOSTags(self):

      buff = self._loadData('selective_pos.bin')

      if buff:
         self.selective_pos = buff
         return

      #First get all (word, tag) in corpuses
      sentences = brown.tagged_sents(simplify_tags=True)
      self.selected_tags = ["ADJ","ADV", "CNJ"]
      self.selective_pos = ConditionalFreqDist()
      temp_dist = ConditionalFreqDist()
      for sentence in sentences:
         for (word, tag) in sentence:
            if tag in self.selected_tags:
               temp_dist[tag].inc(str(word).lower())

      #Now, get the words with frequency > 4
      for category in temp_dist.conditions():
         fredist = temp_dist[category]
         for key in fredist.keys():
            if fredist[key] > 4:
               self.selective_pos[category].inc(key)

      self._saveData('selective_pos.bin',self.selective_pos)
Example #30
    def __init__(self):
        """Initialization method of :class:`TopicExtractor` class.
        """

        # This is our fast Part of Speech tagger
        #############################################################################
        brown_train = brown.tagged_sents(categories='news')
        regexp_tagger = nltk.RegexpTagger(
            [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
                (r'(-|:|;)$', ':'),
                (r'\'*$', 'MD'),
                (r'(The|the|A|a|An|an)$', 'AT'),
                (r'.*able$', 'JJ'),
                (r'^[A-Z].*$', 'NNP'),
                (r'.*ness$', 'NN'),
                (r'.*ly$', 'RB'),
                (r'.*s$', 'NNS'),
                (r'.*ing$', 'VBG'),
                (r'.*ed$', 'VBD'),
                (r'.*', 'NN')
            ])
        unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
        self.bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
        #############################################################################

        # This is our semi-CFG; Extend it according to your own needs
        #############################################################################
        self.cfg = {}
        self.cfg["NNP+NNP"] = "NNP"
        self.cfg["NN+NN"] = "NNI"
        self.cfg["NNI+NN"] = "NNI"
        self.cfg["JJ+JJ"] = "JJ"
        self.cfg["JJ+NN"] = "NNI"
Example #31
def main():
    brown_sentences = brown.tagged_sents(tagset='universal')
    train_sentences = []
    train_tags = []
    for sentence in brown_sentences:
        sentence_words = []
        sentence_tags = []
        for index, (word, pos) in enumerate(sentence):
            sentence_words.append(word)
            sentence_tags.append(pos)
        train_sentences.append(sentence_words)
        train_tags.append(sentence_tags)
    vocabulary_dict = {}
    for sentence in train_sentences:
        for word in sentence:
            if (vocabulary_dict.get(word) is None):
                vocabulary_dict[word] = 1
            else:
                vocabulary_dict[word] = vocabulary_dict[word] + 1
    for (word, count) in vocabulary_dict.items():
        if (count < 5):
            rare_words.add(word)
    training_features = [[]]

    for sentence_index, sentence in enumerate(train_sentences):
        training_features.append([])
        for word_index, word in enumerate(sentence):
            if (word_index == 0):
                prevtag = '<S>'
            else:
                prevtag = train_tags[sentence_index][word_index - 1]
            training_features[sentence_index].append(
                get_features(word_index, sentence, prevtag, rare_words))

    training_features, non_rare_features = remove_rare_features(
        training_features, 5)
    counter = 0
    for feature in non_rare_features:
        feature_dict[feature] = counter
        counter = counter + 1
    tagset = set()

    for sentence in train_tags:
        for tag in sentence:
            tagset.add(tag)

    counter = 0
    for tag in tagset:
        tag_dict[tag] = counter
        counter = counter + 1

    Y_train = build_Y(train_tags)
    X_train = build_X(training_features)
    model = LogisticRegression(class_weight='balanced',
                               solver='saga',
                               multi_class='multinomial')
    model.fit(X_train, Y_train)

    test_data = load_test("test.txt")
    for sentence in test_data:
        temp_data = []
        temp_data.append(sentence)
        Y_pred, Y_start = get_predictions(temp_data, model)
        print(viterbi(Y_start, Y_pred))
Example #32
fd.tabulate()
"""
The most frequent part of speech following "often" is a verb; no nouns appear (in this corpus)
VERB  ADV  ADP  ADJ    .  PRT 
  37    8    7    6    4    2 
"""


# Use POS tags to find three-word phrases
def process(sentence):
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if t1.startswith('V') and t2 == 'TO' and t3.startswith('V'):
            print(w1, w2, w3)


for tagged_sent in brown.tagged_sents():
    process(tagged_sent)

# Look at words whose part-of-speech tags are highly ambiguous
# The context of each such word can help clarify how its tags relate
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
data = nltk.ConditionalFreqDist(
    (word.lower(), tag) for (word, tag) in brown_news_tagged)
for word in data.conditions():
    if len(data[word]) > 3:
        tags = data[word].keys()
        print(word, ' '.join(tags))

# Open the POS concordance tool
nltk.app.concordance()
Example #33
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"

POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

_DEFAULT = "English: Brown Corpus (Humor, simplified)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
        tagset="universal"
    ),
    "English: Brown Corpus": lambda: brown.tagged_sents(),
    "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
        tagset="universal"
    ),
    "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
        categories=["news", "editorial", "reviews"], tagset="universal"
    ),
    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
        categories="religion", tagset="universal"
    ),
    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
        categories="learned", tagset="universal"
    ),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="universal"
    ),
Example #34
# Ch5 Categorizing and tagging words
# Part-of-speech tagging (POS tagging): "tagging" for short.
# Classifying words by their part of speech (POS) and labeling them accordingly.
# Parts of speech are also called word classes or lexical categories.
# The collection of tags used for a particular task is called a tagset.

import nltk
import pylab
from nltk import word_tokenize
from nltk.corpus import brown

brown_words = brown.words(categories='news')
brown_tagged_words = brown.tagged_words(categories='news')
brown_sents = brown.sents(categories='news')
brown_tagged_sents = brown.tagged_sents(categories='news')

# Sec 5.1 Using a part-of-speech tagger
text = word_tokenize("And now for something completely different")
nltk.pos_tag(text)
nltk.help.upenn_tagset('CC')
nltk.help.upenn_tagset('RB')
nltk.help.upenn_tagset('IN')
nltk.help.upenn_tagset('NN')
nltk.help.upenn_tagset('JJ')
nltk.corpus.brown.readme()
print(nltk.corpus.gutenberg.readme())

# Handling homographs: the system tags them correctly
# the first "refUSE" is a verb, the second "REFuse" is a noun
# the first "permit" is a verb, the second "permit" is a noun
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
Example #35
import nltk
from nltk.corpus import brown
from pprint import pprint
import pylab

brown_tagged_sents = brown.tagged_sents(categories='religion')
brown_sents = brown.sents(categories='religion')

# Create default tagger
tags = [tag for (word, tag) in brown.tagged_words(categories='religion')]
print(nltk.FreqDist(tags).max())

raw = 'The more I think about language, the more it amazes me that people ever understand each other at all'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
pprint(default_tagger.tag(tokens))

# Evaluate performance
print(default_tagger.evaluate(brown_tagged_sents))

# Regular Expression Tagger
patterns = [
     (r'.*ing$', 'VBG'),                # gerunds
     (r'.*ed$', 'VBD'),                 # simple past
     (r'.*es$', 'VBZ'),                 # 3rd singular present
     (r'.*ould$', 'MD'),                # modals
     (r'.*\'s$', 'NN$'),                # possessive nouns
     (r'.*s$', 'NNS'),                  # plural nouns
     (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'.*', 'NN')                      # nouns (default)
 ]
Example #36
import nltk
from nltk.corpus import brown

data = brown.tagged_sents(categories=["adventure"], tagset="universal")

# This example shows how Naive Bayes works with only one feature (the word itself)
from NaiveBayes import NaiveBayesModel
model = NaiveBayesModel()
model.train(data)

first_sent = data[0]
first_sent_words = [w for w, l in first_sent]
prediction = model.predict(first_sent_words)
print(prediction)

# This example shows how Naive Bayes works with several different features,
# so it uses a feature extractor
from NaiveBayesUpdate import NaiveBayesModel_v2
import collections

model = NaiveBayesModel_v2()


def feature_extractor(word):
    feature_set = {}
    feature_set["word"] = word
    return feature_set


formated_data = []
for sent in data:
Example #37
import nltk
nltk.download('universal_tagset')
nltk.download('brown')
import numpy as np
import copy
import pandas as pd
import math
from tqdm import tqdm
from nltk.corpus import brown as corpus

tagged_words = []  #word with tag
all_tags = []  #tags sequence

my_set = {"START", "END"}
for sent in corpus.tagged_sents(tagset='universal'):  # get tagged sentences
    tagged_words.append(("START", "START"))
    all_tags.append("START")
    for (word, tag) in sent:
        all_tags.append(tag)
        tagged_words.append((tag, word))
        my_set.add(word)
    tagged_words.append(("END", "END"))
    all_tags.append("END")

# print(*map(' '.join,nltk.bigrams(all_tags)),sep=' ,')

utagset = [
    'START', 'VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM',
    'PRT', 'X', '.', 'END'
]
Example #38
from nltk.corpus import brown
import nltk
import pickle
import time
import json
import os

corpus=[]
for genre in brown.categories():
    corpus += brown.tagged_sents(categories=genre)       

tag = set()
for sen in corpus:
    for wordset in sen:
        tag.add(wordset[1])

f=open("brown_taglist.txt","w")
for each in tag:
    f.write(each+'\n')
f.close()

## create word_seg.txt and label_seg.txt
temp1 = open(r'C:\Users\a\Desktop\data\word_seg.txt','w',encoding='utf-8')
temp2 = open(r'C:\Users\a\Desktop\data\label_seg.txt','w',encoding='utf-8')

for sen in corpus:
    for wordset in sen:
        temp1.write(wordset[0]+' ')
        temp2.write(wordset[1]+' ')
    temp1.write('\n')
    temp2.write('\n')
Example #39
Implementation of a bigram HMM tagger
    i. Training phase: Compute the transition and emission probabilities of a bigram HMM tagger
        directly on the training set using maximum likelihood estimation.
    ii. Implement the Viterbi algorithm corresponding to the bigram HMM model in a way you can
        tag any test sentence.
    iii. Run the algorithm from c)ii) on the test set. Compute the error rate and compare it to the
        results from b)ii).
'''

import math
from nltk.corpus import brown

from tags_and_words import TAGS, TAG2INDEX, WORDS, WORDS2INDEX
from tags_and_words import START, STOP

data = brown.tagged_sents(categories="news")
train = data[:int(0.9 * len(data))]
test = data[int(0.9 * len(data)):]


def safe_log(x):
    if x == 0:
        return -float("inf")
    else:
        return math.log(x)


def add_start_and_stop(sent):
    return [(START, u"START")] + sent + [(STOP, u"STOP")]

cfd = ConditionalFreqDist()
# Get the English stopword list
stopwords_list = stopwords.words('english')


# Define a function that returns True if the tag is a noun tag
def is_noun(tag):
    return tag.lower() in [
        'nn', 'nns', 'nn$', 'nn-tl', 'nn+bez', 'nn+hvz', 'nns$', 'np', 'np$',
        'np+bez', 'nps', 'nps$', 'nr', 'np-tl', 'nrs', 'nr$'
    ]


...
# Count occurrences of the words in the following window
for sentence in brown.tagged_sents():
    for (index, tagtuple) in enumerate(sentence):
        (token, tag) = tagtuple
        token = token.lower()
        if token not in stopwords_list and is_noun(tag):
            window = sentence[index + 1:index + 5]
            for (window_token, window_tag) in window:
                window_token = window_token.lower()
                if window_token not in stopwords_list and is_noun(window_tag):
                    cfd[token].inc(window_token)
# Done! Now let's try some word associations!
print(cfd['left'].max())
print(cfd['life'].max())
print(cfd['man'].max())
print(cfd['woman'].max())
print(cfd['boy'].max())
Example #41
    tags = generate_unique_tags()
    characters = generate_unique_characters()
    words, word_embedding = generate_words_embedding(word_embedding_path)

    # confusion matrix initialization
    confusion_matrix_train = torch.zeros(tags.id - 1,
                                         tags.id - 1,
                                         dtype=torch.int32,
                                         device=device)
    confusion_matrix_test = torch.zeros(tags.id - 1,
                                        tags.id - 1,
                                        dtype=torch.int32,
                                        device=device)

    # Partitioning the dataset
    corpus = np.array(brown.tagged_sents(tagset='universal'))
    kf = KFold(n_splits=5, shuffle=True)
    kf.get_n_splits(corpus)

    # KFOLD Starts
    for train_index, test_index in kf.split(corpus):

        # dataset parsing
        train_corpus = corpus[train_index]
        test_corpus = corpus[test_index]
        word_sequences_train = [[word for (word, tag) in sent]
                                for sent in train_corpus]
        tag_sequences_train = [[
            tags.fetch(tag, 'val') for (word, tag) in sent
        ] for sent in train_corpus]
        word_sequences_test = [[word for (word, tag) in sent]
Example #42
def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt,
                                         backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))
Example #43
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"

POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

_DEFAULT = "English: Brown Corpus (Humor, simplified)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus (simplified)":
    lambda: cess_cat.tagged_sents(tagset="universal"),
    "English: Brown Corpus":
    lambda: brown.tagged_sents(),
    "English: Brown Corpus (simplified)":
    lambda: brown.tagged_sents(tagset="universal"),
    "English: Brown Corpus (Press, simplified)":
    lambda: brown.tagged_sents(categories=["news", "editorial", "reviews"],
                               tagset="universal"),
    "English: Brown Corpus (Religion, simplified)":
    lambda: brown.tagged_sents(categories="religion", tagset="universal"),
    "English: Brown Corpus (Learned, simplified)":
    lambda: brown.tagged_sents(categories="learned", tagset="universal"),
    "English: Brown Corpus (Science Fiction, simplified)":
    lambda: brown.tagged_sents(categories="science_fiction",
                               tagset="universal"),
    "English: Brown Corpus (Romance, simplified)":
    lambda: brown.tagged_sents(categories="romance", tagset="universal"),
    "English: Brown Corpus (Humor, simplified)":
Example #44
    if n=='All':
        return output_list
    else:
        return random.sample(output_list, min(n, len(output_list)))

def getReverseDict(inDict):
    outDict = nltk.defaultdict(list)
    for key in inDict.keys():
        outDict[inDict[key]].append(key)
    return outDict
##    for entry in inDict[key]:
##        outDict[entry]=key


# get brown_tagged, sentence form
brown_tagged_s = brown.tagged_sents()
# word form:
##brown_tagged_w = brown.tagged_words()

# get all tags used in brown tagging
b_tags = sorted(set(entry[1] for entry in brown.tagged_words()))

# build a freq dist the long way (but I already have tags...)
tag_count = nltk.defaultdict(int)
handlecrapstr('for entry in b_tags: tag_count[entry]')
for sent in brown_tagged_s:
    for (w, t) in sent:
        tag_count[t] += 1
# the short way...
tag_fd = nltk.FreqDist(entry[1] for sent in brown_tagged for entry in sent)
Example #45
class NLP():

    #_grammar = nltk.data.load('file:' + PATH_GRAMMAR_FILE)
    _unigram_tagger = nltk.UnigramTagger(brown.tagged_sents(categories='news'))
    _question_tags = ["WDT", "WP$", "WPO", "WPS", "WQL", "WRB"]

    def __init__(self):
        self._input_types = [
            self.match_questions, self.match_commands, self.match_descrptions,
            self.match_explanations
        ]

    def generate(self, msg_type):
        answers = [
            production for production in generate(self._grammar,
                                                  start=Nonterminal(msg_type))
        ]
        return TreebankWordDetokenizer().detokenize(secrets.choice(answers))

    def run(self, sentence):
        tokens = [word for word in nltk.word_tokenize(sentence)]
        tagged_tokens = self._unigram_tagger.tag(tokens)
        print(tagged_tokens)
        for pattern in self._input_types:
            match = pattern(tagged_tokens)
            if match:
                return match(tagged_tokens)
        return False

    def match_questions(self, tagged_tokens):
        first_tag = tagged_tokens[0][1]
        if first_tag in self._question_tags:
            return self.process_questions
        return False

    def match_commands(self, tagged_tokens):
        first_tag = tagged_tokens[0][1]
        if first_tag == "VB" or first_tag == "VBD":
            return self.process_commands
        return False

    def match_descrptions(self, tagged_tokens):
        first_tag = tagged_tokens[0][1]
        if first_tag in ["PPSS", "EX", "CD"]:
            return self.process_descrptions
        return False

    def match_explanations(self, tagged_tokens):
        return False

    def process_questions(self, tagged_tokens):
        question = tagged_tokens[0][0]
        #object = list(filter(lambda x: x[1] == "NN", tagged_tokens))
        object = tagged_tokens[3][0]
        if question == "Where":
            return robot.find_location(object)

    def process_commands(self, tagged_tokens):
        action = tagged_tokens[0][0].encode('ascii', 'ignore')
        target = None
        for word, tag in tagged_tokens:
            if tag == "NN":
                target = word.encode('ascii', 'ignore')
        return ('cmd', Command(action, target))

    def process_descrptions(self, tagged_tokens):
        subject = tagged_tokens[0]
        if subject[0] == "I":
            tagged_verb = tagged_tokens[1][1]
            if tagged_verb == "BEM":
                return robot.delete_agent() if tagged_tokens[2][
                    0] == "leaving" else robot.new_interlocutor(
                        tagged_tokens[2][0])
            elif tagged_tokens[1][0] == "leave":
                return robot.delete_agent()
            elif "CD" in [x[1] for x in tagged_tokens]:
                step = [x[0] for x in tagged_tokens if x[1] == "CD"]
                robot.modify_task_plan(step[0])
            else:
                pass
        elif subject[0] == "There":
            object = tagged_tokens[3][0]
            attribute = tagged_tokens[4][0]
            location = tagged_tokens[6][0]
            return robot.add_item(object, attribute, location)
        elif subject[1] == "CD":
            if "done" in [x[0] for x in tagged_tokens]:
                return StepCompleted(subject[0])
        else:
            pass

    def process_explanations(self, tagged_tokens):
        pass
from nltk.corpus import brown
from HMM import HMM


tagged_corpus=brown.tagged_sents(categories=["adventure"],tagset="universal")

model=HMM()
model.fit(tagged_corpus)

x=["The","man","certainly","didn't","want","to","wait"]
pred=model.predict(x)
print(pred)
####### -> Seems to work very well :O :O :O


####### Evaluate the performance of the TriHMM

eval_sents=brown.tagged_sents(categories=["romance"],tagset="universal")

x=[[ w for w,t in sent] for sent in eval_sents ]
y_actual=[[t for w,t in sent] for sent in eval_sents]

y_predict=model.predict_many(x)

correct=0
total=0
for iter,sent in enumerate(y_actual):
    for jter,val in enumerate(sent):
        if y_actual[iter][jter]==y_predict[iter][jter]:
            correct+=1
        total+=1
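
# (assumed completion of the evaluation loop above: report overall accuracy)
print("TriHMM tagging accuracy on 'romance': %.4f" % (float(correct) / total))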
Example #47
import random
from nltk.corpus import brown
import nltk
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))
############################################

file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))
############################################

train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))
print(classifier.show_most_informative_features(5))
Example #48
    features = {
        'Suffix(1)': sentence[i][-1:],
        'Suffix(2)': sentence[i][-2:],
        'Suffix(3)': sentence[i][-3:]
    }
    # Use the previous word as a feature; if this is the first word, use <START> as the previous word
    if i == 0:
        features['prev-word'] = '<START>'
    else:
        features['prev-word'] = sentence[i - 1]
    return features


# (brown.sents()[0],8) == 'investigation'
type(pos_features(brown.sents()[0], 8))
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent,
                                         i), tag))  # elements of the feature set must be tuples

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

# 1.6. Sequence classification (using context tags to improve accuracy)
# To capture the dependencies between related classification tasks, a joint classifier model can be used
# One such strategy is consecutive (greedy) sequence classification
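
# A minimal sketch (not in the original) of the greedy / consecutive sequence
# tagging idea those comments describe: classify tokens left to right, feeding
# the tags already assigned back in as a "history" feature. Assumes a trained
# nltk.NaiveBayesClassifier and a pos_features-style extractor as above.
def pos_features_with_history(sentence, i, history):
    return {'suffix(2)': sentence[i][-2:],
            'prev-word': '<START>' if i == 0 else sentence[i - 1],
            'prev-tag': '<START>' if not history else history[-1]}

def greedy_tag(classifier, sentence):
    history = []
    for i in range(len(sentence)):
        history.append(classifier.classify(pos_features_with_history(sentence, i, history)))
    return list(zip(sentence, history))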
                    count = count + 1


# /////////////////////////////////////////////////////////////////////////////////////////////////
# //////////////////////////////////////////////MAIN//////////////////////////////////////////////

nltk.download('brown')
nltk.download('punkt')
nltk.download('universal_tagset')
one_hot_labels = None
vocab_v0 = {}
wordVocab = {}
tag_mapping = {}
Pre.parse_Questions(brown)
Pre.labelWordEncoder()
brown_tagsents = dllist(brown.tagged_sents(tagset='universal'))
xTrain, yTrain = Pre.dataSegmentation(brown_tagsents)

############################################              MODEL                    ########################################################################
# Data Partition
# Padding Vectorized Data Set
x_train = tf.keras.preprocessing.sequence.pad_sequences(xTrain[:50000],
                                                        padding='post')
y_train = tf.keras.preprocessing.sequence.pad_sequences(yTrain[:50000],
                                                        padding='post')

x_validation = x_train[40000:]
y_validation = y_train[40000:]
x_train = x_train[10000:40000]
y_train = y_train[10000:40000]
Example #50
def answers():
    global tagged_sentences_universal, test_data_universal, \
        train_data_universal, model, test_size, train_size, ttags, \
        correct, incorrect, accuracy, \
        good_tags, bad_tags, answer4b, answer5

    # Load the Brown corpus with the Universal tag set.
    tagged_sentences_universal = brown.tagged_sents(categories='news',
                                                    tagset='universal')

    # Divide corpus into train and test data.
    test_size = 500
    train_size = len(tagged_sentences_universal) - test_size  # fixme

    test_data_universal = tagged_sentences_universal[-test_size:]  # fixme
    train_data_universal = tagged_sentences_universal[:train_size]  # fixme

    if hashlib.md5(''.join(
            map(
                lambda x: x[0],
                train_data_universal[0] + train_data_universal[-1] +
                test_data_universal[0] + test_data_universal[-1])
    ).encode('utf-8')).hexdigest() != '164179b8e679e96b2d7ff7d360b75735':
        print(
            '!!!test/train split (%s/%s) incorrect, most of your answers will be wrong hereafter!!!'
            % (len(train_data_universal), len(test_data_universal)),
            file=sys.stderr)

    # Create instance of HMM class and initialise the training and test sets.
    model = HMM(train_data_universal, test_data_universal)

    # Train the HMM.
    model.train()

    # Some preliminary sanity checks
    # Use these as a model for other checks
    e_sample = model.elprob('VERB', 'is')
    if not (type(e_sample) == float and e_sample <= 0.0):
        print('elprob value (%s) must be a log probability' % e_sample,
              file=sys.stderr)

    t_sample = model.tlprob('VERB', 'VERB')
    if not (type(t_sample) == float and t_sample <= 0.0):
        print('tlprob value (%s) must be a log probability' % t_sample,
              file=sys.stderr)

    if not (type(model.states) == list and \
            len(model.states) > 0 and \
            type(model.states[0]) == str):
        print('model.states value (%s) must be a non-empty list of strings' %
              model.states,
              file=sys.stderr)

    print('states: %s\n' % model.states)

    ######
    # Try the model, and test its accuracy [won't do anything useful
    #  until you've filled in the tag method
    ######
    s = 'the cat in the hat came back'.split()
    model.initialise(s[0])
    ttags = model.tag(s)  # fixme
    print("Tagged a trial sentence:\n  %s" % list(zip(s, ttags)))

    v_sample = model.get_viterbi_value('VERB', 5)
    if not (type(v_sample) == float and 0.0 <= v_sample):
        print('viterbi value (%s) must be a cost' % v_sample, file=sys.stderr)

    b_sample = model.get_backpointer_value('VERB', 5)
    if not (type(b_sample) == str and b_sample in model.states):
        print('backpointer value (%s) must be a state name' % b_sample,
              file=sys.stderr)

    # check the model's accuracy (% correct) using the test set
    correct = 0
    incorrect = 0
    incorrent_sent = []

    for sentence in test_data_universal:
        s = [word.lower() for (word, tag) in sentence]
        model.initialise(s[0])
        tags = model.tag(s)
        inc = False
        for ((word, gold), tag) in zip(sentence, tags):
            if tag == gold:
                correct = correct + 1  # fix me
            else:
                incorrect = incorrect + 1  # fix me
                inc = True
        if inc and len(incorrent_sent) < 10:
            incorrent_sent.append((sentence, tags))

    print('\nFirst 10 incorrect sentences are:')
    for sent, tags in incorrent_sent:
        print("Tagged test sentence:")
        print(sent)
        print("\nTags produced:")
        print(tags)
        print('\n\n')
    accuracy = correct / (correct + incorrect)  # fix me
    print('Tagging accuracy for test set of %s sentences: %.4f' %
          (test_size, accuracy))

    # Print answers for 4b, 5 and 6
    bad_tags, good_tags, answer4b = answer_question4b()
    print('\nA tagged-by-your-model version of a sentence:')
    print(bad_tags)
    print('The tagged version of this sentence from the corpus:')
    print(good_tags)
    print('\nDiscussion of the difference:')
    print(answer4b[:280])
    answer5 = answer_question5()
    print('\nFor Q5:')
    print(answer5[:500])
    answer6 = answer_question6()
    print('\nFor Q6:')
    print(answer6[:500])
Example #51
from itertools import chain
from collections import Counter, defaultdict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

from nltk import pos_tag, word_tokenize
from nltk.corpus import brown
from sklearn.model_selection import train_test_split

from nltk.corpus import brown
import nltk
nltk.download('brown')

# Define corpus

corpus = brown.tagged_sents()

training_vocab = list(set([word for sent in corpus for word, tag in sent]))


def pair_counts(X, Y):
    """Return a dictionary keyed to each unique value in the first sequence list
    that counts the number of occurrences of the corresponding value from the
    second sequences list.
    """
    words = [ii for i in X for ii in i if type(i) != str]
    tags = [ii for i in Y for ii in i if type(i) != str]

    pair_count = {tag: {} for tag in set(tags)}

    for tag, word in zip(tags, words):
Example #52
    try:
        (trainsection, testsection, method) = ('news', 'editorial', 'default')
        opts, args = getopt.getopt(sys.argv[1:], "hi:o:m:",
                                   ["help", "train=", "test=", "method="])
    except getopt.GetoptError:
        usage(sys.argv)
    for o, a in opts:
        if o in ('-h', '--help'):
            usage([sys.argv[0]])
            sys.exit(0)
        if o in ('-i', '--train'): trainsection = a
        if o in ('-o', '--test'): testsection = a
        if o in ('-m', '--method'): method = a

    train_tagged_sents = brown.tagged_sents(categories=trainsection)
    test_tagged_sents = brown.tagged_sents(categories=testsection)
    train_tagged_words = brown.tagged_words(categories=trainsection)
    test_tagged_words = brown.tagged_words(categories=testsection)
    train_words = brown.words(categories=trainsection)

    print_to_file("\n\nmethod = " + method + "\n")
    most_common_tag = default_tag(train_tagged_sents)
    default_tagger = nltk.DefaultTagger(most_common_tag)

    if method in ['unigram', 'bigram', 'trigram']:
        tu = nltk.UnigramTagger(train_tagged_sents, backoff=default_tagger)
        tb = nltk.BigramTagger(train_tagged_sents, backoff=tu)
        tt = nltk.TrigramTagger(train_tagged_sents, backoff=tb)

    fd = nltk.FreqDist(train_words)
Example #53
0
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_sets = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                train_sets.append((pos_feature_tag(untagged_sent, i,
                                                   history), tag))
                history.append(tag)
        self.classifier = nltk.classify.NaiveBayesClassifier.train(train_sets)

    def tag(self, sentence):
        history = []
        feature_set = []
        taggers = []
        for i, word in enumerate(sentence):
            feature_set = pos_feature_tag(sentence, i, history)
            tag = self.classifier.classify(feature_set)
            history.append(tag)
            taggers.append(tag)
        return zip(sentence, taggers)


if __name__ == '__main__':
    sentences = brown.tagged_sents(categories="news")
    # tagger=ConsecutivePosTagger(sentences)
    # print([(word,tag)for word,tag in tagger.tag(["i","am","a","gir","who","are","so","beautiful"])])
    size = int(len(sentences) * 0.1)
    train_set, test_set = sentences[size:], sentences[:size]
    tagger = ConsecutivePosTagger(train_set)
    print(tagger.evaluate(test_set))
Example #54
0
import nltk
#from nltk.book import *
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk import word_tokenize
from nltk.tag import hmm
#nltk.help.upenn_tagset("NN*")
files = treebank.fileids()
#print(files)
t = treebank.tagged_words("wsj_0003.mrg")
#for p in t:
#print(p)

#race1 = nltk.tag.str2tuple('race/NN')
#race2 = nltk.tag.str2tuple('race/VB')
#print(race1)

#print(brown.tagged_words().count(race1))
#print(brown.tagged_words().count(race2))

unitag = nltk.tag.UnigramTagger(brown.tagged_sents(categories='news')[:5000])
print(unitag)
s = "The secretariat is expected to race tomorrow."
s_tok = word_tokenize(s)
tt = unitag.tag(s_tok)
print(tt)

hmmTagger = hmm.HiddenMarkovModelTrainer().train_supervised(
    brown.tagged_sents(categories="news")[:5000])
tt2 = hmmTagger.tag(s_tok)
print(tt2)
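
# Unlike the unigram tagger, the HMM tagger conditions on the surrounding tag
# context, so it has a better chance of tagging "race" in this sentence as a
# verb rather than the noun reading that the raw unigram counts favour.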
Example #55
0
BOUNDARY = r'\b'

CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.
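# (Illustration, not part of the original module: because the values are
#  lambdas, a corpus is only read when its entry is called, e.g.
#      loader = _CORPORA['English: Brown Corpus']
#      tagged = loader()   # the Brown corpus is loaded here, not at import)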

_DEFAULT = 'English: Brown Corpus (Humor, simplified)'
_CORPORA = {
    'Catalan: CESS-CAT Corpus (simplified)':
    lambda: cess_cat.tagged_sents(simplify_tags=True),
    'English: Brown Corpus':
    lambda: brown.tagged_sents(),
    'English: Brown Corpus (simplified)':
    lambda: brown.tagged_sents(simplify_tags=True),
    'English: Brown Corpus (Press, simplified)':
    lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'],
                               simplify_tags=True),
    'English: Brown Corpus (Religion, simplified)':
    lambda: brown.tagged_sents(categories='religion', simplify_tags=True),
    'English: Brown Corpus (Learned, simplified)':
    lambda: brown.tagged_sents(categories='learned', simplify_tags=True),
    'English: Brown Corpus (Science Fiction, simplified)':
    lambda: brown.tagged_sents(categories='science_fiction',
                               simplify_tags=True),
    'English: Brown Corpus (Romance, simplified)':
    lambda: brown.tagged_sents(categories='romance', simplify_tags=True),
    'English: Brown Corpus (Humor, simplified)':
    lambda: brown.tagged_sents(categories='humor', simplify_tags=True),
Example #56
0
def main():
    brown_sentences = brown.tagged_sents(tagset='universal')

    train_sentences = []
    train_tags = []

    # separating sentences and their labels
    for sentence in brown_sentences:
        s = []
        t = []
        for pair in sentence:
            s.append(pair[0])
            t.append(pair[1])

        train_sentences.append(s)
        train_tags.append(t)

    # finding word count
    word_count = find_word_count(train_sentences)

    # finding rare words
    rare_words = find_rare_words(word_count, 5)

    # adding features for each word
    training_features = []
    for idx in range(len(train_sentences)):
        features = []
        sentence = train_sentences[idx]
        for i in range(len(sentence)):
            prevtag = '<S>' if i == 0 else train_tags[idx][i - 1]
            features.append(get_features(i, sentence, prevtag, rare_words))
        training_features.append(features)

    # overwriting training features with values after removing rare features
    training_features, non_rare_features = remove_rare_features(
        training_features, 5)

    # creating feature dictionary
    counter = 0
    for feature in non_rare_features:
        feature_dict[feature] = counter
        counter = counter + 1

    # creating tag dictionary
    tag_counter = 0
    for sent_tags in train_tags:
        for tag in sent_tags:
            if tag not in tag_dict:
                tag_dict[tag] = tag_counter
                tag_counter = tag_counter + 1

    X_train = build_X(training_features)
    Y_train = build_Y(train_tags)

    print "X_train Y_train built"
    '''
    # if we want to save model then use this code
    filename = 'lr_model.sav'
    
    lr = pickle.load(open(filename, 'rb'))
    if lr == None:
        lr = LogisticRegression(class_weight='balanced', solver='saga', multi_class='multinomial', verbose=2)
        lr.fit(X_train, Y_train)
        
        print "Model fit"
        
        # save the model to disk
        pickle.dump(lr, open(filename, 'wb'))
    
    '''

    lr = LogisticRegression(class_weight='balanced',
                            solver='saga',
                            multi_class='multinomial',
                            verbose=2)
    lr.fit(X_train, Y_train)

    print "Model fit"

    test_data = load_test('test.txt')
    # tag prediction
    for sentence in test_data:
        Y_pred, Y_start = get_predictions([sentence], lr)
        tags = viterbi(Y_start, Y_pred)
        print "sentence =", sentence
        print "tags=", tags
        print "\n"
Example #57
0
import nltk
from nltk.corpus import brown

brown_train = brown.tagged_sents(categories='news')
regexp_tagger = nltk.RegexpTagger(
    [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
     (r'(-|:|;)$', ':'),
     (r'\'*$', 'MD'),
     (r'(The|the|A|a|An|an)$', 'AT'),
     (r'.*able$', 'JJ'),
     (r'^[A-Z].*$', 'NNP'),
     (r'.*ness$', 'NN'),
     (r'.*ly$', 'RB'),
     (r'.*s$', 'NNS'),
     (r'.*ing$', 'VBG'),
     (r'.*ed$', 'VBD'),
     (r'.*', 'NN')
     ])

unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
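
# Backoff chain: the bigram tagger falls back to the unigram tagger, which in
# turn falls back to the regular-expression rules above for unseen contexts.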
cfg = {
    "NNP+NNP": "NNP",
    "NN+NN": "NNI",
    "NNI+NN": "NNI",
    "JJ+JJ": "JJ",
    "JJ+NN": "NNI"
}
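
# The table above appears to encode merge rules for noun-phrase candidates:
# when two adjacent words carry the given tag pair, they are joined and the
# combined token takes the mapped tag (e.g. two consecutive NN tokens -> NNI).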


class NPExtractor(object):
Example #58
0
# In[86]:

# Show the feature names
one_hot_multi.classes_

# In[43]:

# Load the library
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# Get some text from the standard Brown Corpus, split into sentences
sentences = brown.tagged_sents(categories='news')

# Use 4000 sentences for training and 623 for testing
train = sentences[:4000]
test = sentences[4000:]

# Create the backoff taggers
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

# Show the accuracy
trigram.evaluate(test)

# > 6.9 Encoding text as a bag of words
Example #59
0
from nltk import tokenize
from nltk.corpus import brown
from nltk.tag import UnigramTagger


def init_nltk():
    global tokenizer
    global tagger
    tokenizer = tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')
    tagger = UnigramTagger(brown.tagged_sents())
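
# init_nltk() populates the module-level globals: a regexp tokenizer that
# keeps punctuation runs as separate tokens, and a unigram tagger trained on
# the full tagged Brown corpus.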
Example #60
0
import nltk
from nltk.corpus import brown
import numpy as np
from collections import Counter
from collections import defaultdict
from math import log
import time

stime = time.time()
sentences = np.array(brown.tagged_sents(), dtype=object)
words = brown.tagged_words()
tokens, taged = zip(*words)

#
# firstdict = {}
# firstSum = len(sentences)
# for i in sentences:
#     x,y = i[0]
#     if y not in firstdict.keys():
#         firstdict[y] = 1
#     else:
#         firstdict[y] += 1
#
# for i in firstdict.keys():
#     firstdict[i] = firstdict[i]/firstSum

# total word count
total = len(words)

# preparing corpus data
wordcount = Counter(tokens)