def load_pos():
    import re
    from itertools import islice
    from en.parser.nltk_lite.corpora import brown

    # Read the first 100 tagged sentences of the Brown Corpus.
    sentences = list(islice(brown.tagged(), 100))

    tag_set = ["'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl',
               'abn', 'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg',
               'bem', 'ben', 'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do',
               'dod', 'doz', 'dt', 'dt$', 'dti', 'dts', 'dtx', 'ex', 'fw',
               'hv', 'hvd', 'hvg', 'hvn', 'hvz', 'in', 'jj', 'jjr', 'jjs',
               'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$', 'np', 'np$', 'nps',
               'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$', 'ppl', 'ppls',
               'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr', 'rbt',
               'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt',
               'wp$', 'wpo', 'wps', 'wql', 'wrb']

    sequences = []    # (unused here)
    sequence = []     # (unused here)
    symbols = set()
    # Matches the leading part of a tag, up to the first '-', '*' or '+'.
    start_re = re.compile(r'[^-*+]*')

    for sentence in sentences:
        for i in range(len(sentence)):
            word, tag = sentence[i]
            word = word.lower()          # normalize
            symbols.add(word)            # log this word
            m = start_re.match(tag)      # clean up the tag
            tag = m.group(0)
            if tag not in tag_set:
                tag = '*'
            sentence[i] = (word, tag)    # store cleaned-up tagged token

    return sentences, tag_set, list(symbols)
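# A minimal usage sketch for load_pos() above.  The helper name and the way
# the three return values are consumed are illustrative assumptions, not part
# of the original module; it assumes the en.parser.nltk_lite package is
# importable.
def _load_pos_example():
    sentences, tag_set, symbols = load_pos()
    print(len(sentences), 'tagged sentences loaded')
    print(sentences[0][:5])      # first few cleaned (word, tag) pairs
    print(len(tag_set), 'tags in the tag set,', len(symbols), 'distinct words')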
def demo():
    """
    Demonstrates how to use IndexConcordance and Aggregator.
    """
    from en.parser.nltk_lite.corpora import brown

    print("Reading Brown Corpus into memory...")
    corpus = list(brown.tagged(('a', 'j')))
    print("Generating index...")
    ic = IndexConcordance(corpus)

    print("Showing all occurrences of 'plasma' in the Brown Corpus...")
    ic.formatted(middleRegexp="^plasma/.*", verbose=True)

    print("Investigating the collocates of 'deal' and derivatives...")
    agg = Aggregator()
    agg.add(ic.raw(middleRegexp="^deal", leftContextLength=1,
                   rightContextLength=0, leftRegexp=r"^(\w|\s|/)*$"),
            "Brown Corpus 'deal' left collocates")
    agg.add(ic.raw(middleRegexp="^deal", leftContextLength=0,
                   rightContextLength=1, rightRegexp=r"^(\w|\s|/)*$"),
            "Brown Corpus 'deal' right collocates")
    agg.formatted(showFirstX=5, usePOS=False)
def demo():
    from itertools import islice
    from pprint import pprint
    from en.parser.nltk_lite.corpora import brown

    # Print the first five sentences of Brown section 'a', first as raw
    # token lists, then as tagged sentences.
    pprint(list(islice(brown.raw('a'), 0, 5)))
    pprint(list(islice(brown.tagged('a'), 0, 5)))
def demo():
    from itertools import islice
    from en.parser.nltk_lite.corpora import brown

    # Load the training corpus: the first 500 tagged Brown sentences.
    train_sents = list(islice(brown.tagged(), 500))

    # Create the tagger and restore a previously marshalled model rather
    # than retraining it here.
    tagger = MarshalNgram(3)
    # tagger.train(train_sents)
    # tagger.marshal("ngram.test")
    tagger.unmarshal("ngram.test")
    print(tagger._model)
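# The commented-out calls in the demo above hint at the intended round trip:
# train an n-gram model, marshal it to disk, and unmarshal it again later.
# A sketch under that assumption (the helper name and `path` parameter are
# hypothetical; MarshalNgram and its train/marshal/unmarshal methods are
# taken from the demo above):
def _marshal_roundtrip(train_sents, path="ngram.test"):
    tagger = MarshalNgram(3)
    tagger.train(train_sents)     # fit the n-gram model on tagged sentences
    tagger.marshal(path)          # persist the model to disk
    restored = MarshalNgram(3)
    restored.unmarshal(path)      # reload it into a fresh tagger
    return restored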
def demo():
    """
    A simple demonstration function for the C{Tagger} classes.  It
    constructs a backoff tagger using a trigram tagger, bigram tagger,
    unigram tagger and a default tagger.  It trains and tests the tagger
    using the Brown corpus.
    """
    from en.parser.nltk_lite.corpora import brown
    import sys

    print('Training taggers.')

    # Create a chain of backoff taggers: trigram -> bigram -> unigram ->
    # default ('nn').
    t0 = Default('nn')
    # t1a = Affix(length=-3, minlength=5, backoff=t0)
    # t1b = Unigram(cutoff=2, backoff=t1a)
    t1 = Unigram(cutoff=1, backoff=t0)
    t2 = Bigram(cutoff=1, backoff=t1)
    t3 = Trigram(backoff=t2)
    t1.train(brown.tagged('a'), verbose=True)
    t2.train(brown.tagged('a'), verbose=True)
    t3.train(brown.tagged('a'), verbose=True)

    # Tokenize the testing files.
    test_tokens = []    # (unused in this demo)
    num_words = 0       # (unused in this demo)

    # Run the taggers.  For t0, t1, and t2, back off to the default tagger.
    # This is especially important for t1 and t2, which count on having
    # known tags as contexts; if they get a context containing None, they
    # will generate an output of None, and so all words will get tagged None.
    print('=' * 75)
    print('Running the taggers on test data...')

    print(' Default (nn) tagger: ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t0, brown.tagged('b'))

    print(' Unigram tagger: ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t1, list(brown.tagged('b'))[:1000])

    print(' Bigram tagger: ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t2, list(brown.tagged('b'))[:1000])

    print(' Trigram tagger: ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t3, list(brown.tagged('b'))[:1000])
def demo():
    """
    A simple demonstration function for the C{Tagger} classes.  It
    constructs unigram, affix, regexp and lookup taggers, each backing off
    to a default tagger, then trains and tests them using the Brown corpus.
    """
    from en.parser.nltk_lite.corpora import brown
    from en.parser.nltk_lite import tag
    import sys

    print('Training taggers.')

    # Create a default tagger and four taggers that back off to it.
    t0 = tag.Default('nn')
    t1 = tag.Unigram(cutoff=1, backoff=t0)
    t1.train(brown.tagged('a'), verbose=True)
    t2 = tag.Affix(-3, 5, cutoff=2, backoff=t0)
    t2.train(brown.tagged('a'), verbose=True)
    t3 = tag.Regexp([(r'.*ed', 'vbd')], backoff=t0)  # no training
    t4 = tag.Lookup({'the': 'dt'}, backoff=t0)

    test_tokens = []    # (unused in this demo)
    num_words = 0       # (unused in this demo)

    print('=' * 75)
    print('Running the taggers on test data...')

    print(' Default (nn) tagger: ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t0, brown.tagged('b'))

    print(' Unigram tagger: ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t1, list(brown.tagged('b'))[:1000])

    print(' Affix tagger: ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t2, list(brown.tagged('b'))[:1000])

    print(' Regexp tagger: ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t3, list(brown.tagged('b'))[:1000])

    print(' Lookup tagger: ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t4, list(brown.tagged('b'))[:1000])
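# Once trained, the taggers built above should also be applicable to untagged
# token sequences.  Only training and _demo_tagger calls appear in these
# demos, so the tag() call below is an assumption about the usual nltk_lite
# tagger interface rather than something shown here:
#
#     tokens = 'the cat sat on the mat'.split()
#     print(list(t1.tag(tokens)))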