def demo_pos_supervised():
    from nltk.corpus import brown
    from sys import stdout

    print 'Loading data from Brown corpus...'
    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.tokenize(item))

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``',
        'abl', 'abn', 'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz',
        'beg', 'bem', 'ben', 'ber', 'bez', 'cc', 'cd', 'cd$', 'cs',
        'do', 'dod', 'doz', 'dt', 'dt$', 'dti', 'dts', 'dtx', 'ex',
        'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz', 'in', 'jj', 'jjr',
        'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$', 'np', 'np$',
        'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$', 'ppl',
        'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr',
        'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz',
        'wdt', 'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]
    # map any tag outside the tag set to the catch-all '*'
    annul_nonmatching_tags(tagged_tokens, tag_set, '*')
    words, word_set, tags, tag_set = _split_tagged_tokens(tagged_tokens)
    word_set.sort()
    tag_set.sort()
    print 'output alphabet', repr(word_set)[:50], '...'
    print 'state labels   ', repr(tag_set)[:50], '...'
    print tag_set

    print 'Training HMM...'
    # hold out the first 100 sequences for testing; train on the rest
    trainer = HMMTrainer(tag_set, word_set)
    hmm = trainer.train_supervised(words[100:], tags[100:],
        lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
    print hmm

    print 'Testing...'
    for ws, ts in zip(words[:3], tags[:3]):
        print ws
        print 'HMM >>>'
        print hmm.best_path(ws)
        print 'CORRECT >>>'
        print ts
        print '-' * 60

    count = correct = 0
    for ws, ts in zip(words[:100], tags[:100]):
        print '.',
        stdout.flush()
        pts = hmm.best_path(ws)
        for t, pt in zip(ts, pts):
            count += 1
            if t == pt:
                correct += 1
    print 'accuracy over first', count, 'tokens %.1f' % (100.0 * correct / count)
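# The estimator passed to train_supervised above, LidstoneProbDist(fd,
# 0.1, bins), smooths each frequency distribution by adding a small
# constant gamma to every count:
#     P(x) = (c(x) + gamma) / (N + gamma * B)
# where c(x) is the observed count, N the total count, and B the number
# of bins.  A minimal hand-rolled sketch for illustration only (the
# helper below is hypothetical, not part of NLTK):
def _lidstone_sketch(count, total, gamma, bins):
    return (count + gamma) / (total + gamma * bins)

# e.g. a tag seen 3 times in 10 observations, with 87 possible tags:
# the smoothed estimate is ~0.166 rather than the MLE of 0.3
print _lidstone_sketch(3, 10.0, 0.1, 87)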
def testLengthOfText(brownSection):
    from nltk.corpus import brown
    items = brown.items(brownSection)
    textToken = brown.read(items[0])
    for i in range(len(items)):
        print "\n\nWeightedTaggedTScoreModel\n"
        model = WeightedTaggedTScoreModel(textToken, SUBTOKENS='WORDS')
        TextGenerator(model).generateWords(('the', 'at'), 100)
        # grow the text by one more Brown item, guarding against
        # indexing past the end of the item list
        if i + 1 < len(items):
            textToken['WORDS'] = textToken['WORDS'] + brown.read(items[i + 1])['WORDS']
def buildTrainTokens(self, texts=10):
    '''
    Read tagged texts from the Brown Corpus.
    '''
    from nltk.corpus import brown
    train_tokens = []
    for item in brown.items()[:texts]:
        train_tokens.append(brown.read(item))
    self.train_tokens = train_tokens
def main():
    from nltk.corpus import brown

    print "\n\nWeightedTaggedTScoreModel\n"
    items = brown.items('fiction: general')
    textToken = brown.read(items[0])
    # concatenate the remaining items into one long text
    for item in items[1:]:
        textToken['WORDS'] = textToken['WORDS'] + brown.read(item)['WORDS']
    model = WeightedTaggedTScoreModel(textToken, SUBTOKENS='WORDS')
    TextGenerator(model).generateWords(('the', 'at'), 200)

    print "\n\nWeightedTScoreModel...\n"
    model = WeightedTScoreModel(textToken, SUBTOKENS='WORDS')
    TextGenerator(model).generateWords('the', 200)
def load_pos():
    import re
    from nltk.corpus import brown
    from nltk.tagger import TaggedTokenizer

    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.tokenize(item))

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``',
        'abl', 'abn', 'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz',
        'beg', 'bem', 'ben', 'ber', 'bez', 'cc', 'cd', 'cd$', 'cs',
        'do', 'dod', 'doz', 'dt', 'dt$', 'dti', 'dts', 'dtx', 'ex',
        'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz', 'in', 'jj', 'jjr',
        'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$', 'np', 'np$',
        'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$', 'ppl',
        'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr',
        'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz',
        'wdt', 'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    sequences = []
    sequence = []
    start_re = re.compile(r'[^-*+]*')
    for token in tagged_tokens:
        # treating each word as a tuple of features gives the HMM a
        # multi-dimensional output for every observation
        for sub_token in token['SUBTOKENS']:
            sequence.append(sub_token)
            # a feature for the word in lower case
            features = [sub_token['TEXT'].lower()]
            # a feature for word suffixes of length 3
            features.append(sub_token['TEXT'][-3:])
            # a feature for the length of the word
            features.append(len(sub_token['TEXT']))
            # store the observation as a tuple of features
            sub_token['TEXT'] = tuple(features)
            # clean up the tag, mapping unknown tags to '*'
            m = start_re.match(sub_token['TAG'])
            tag = m.group(0)
            if tag in tag_set:
                sub_token['TAG'] = tag
            else:
                sub_token['TAG'] = '*'
            # split sequences on the period tag
            if sub_token['TAG'] == '.':
                sequences.append(Token(SUBTOKENS=sequence))
                sequence = []
    return sequences, tag_set, 3
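# For illustration, what the feature extraction in load_pos produces
# for a single word, shown on a plain string (no corpus access needed);
# note the suffix is taken from the original text, before lower-casing:
text = 'Running'
features = (text.lower(), text[-3:], len(text))
print features   # ('running', 'ing', 7)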
def load_pos():
    import re
    from nltk.corpus import brown

    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.read(item))

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``',
        'abl', 'abn', 'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz',
        'beg', 'bem', 'ben', 'ber', 'bez', 'cc', 'cd', 'cd$', 'cs',
        'do', 'dod', 'doz', 'dt', 'dt$', 'dti', 'dts', 'dtx', 'ex',
        'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz', 'in', 'jj', 'jjr',
        'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$', 'np', 'np$',
        'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$', 'ppl',
        'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr',
        'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz',
        'wdt', 'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    sequences = []
    sequence = []
    symbols = {}
    start_re = re.compile(r'[^-*+]*')
    for token in tagged_tokens:
        for sub_token in token['WORDS']:
            sequence.append(sub_token)
            # make words lower case and record each distinct symbol
            sub_token['TEXT'] = sub_token['TEXT'].lower()
            symbols[sub_token['TEXT']] = 1
            # clean up the tag, mapping unknown tags to '*'
            m = start_re.match(sub_token['TAG'])
            tag = m.group(0)
            if tag in tag_set:
                sub_token['TAG'] = tag
            else:
                sub_token['TAG'] = '*'
            # split sequences on the period tag
            if sub_token['TAG'] == '.':
                sequences.append(Token(SUBTOKENS=sequence))
                sequence = []
    return sequences, tag_set, symbols.keys()
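# For illustration, how the tag-cleanup regex in load_pos behaves: it
# keeps the tag prefix up to the first '-', '*' or '+', stripping
# compound-tag markers such as the '-tl' title suffix:
import re
start_re = re.compile(r'[^-*+]*')
for raw in ['nn-tl', 'fw-in', 'np$', 'bez*']:
    print raw, '->', start_re.match(raw).group(0)
# prints: nn-tl -> nn, fw-in -> fw, np$ -> np$, bez* -> bez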
# Fortunately, the tokens in the corpora used for the learning task
# are already marked up with their tags.  These corpora can therefore
# also be used to evaluate tagging algorithms.
#
# a simple example
from nltk.tokenizer import *
from nltk.tagger import *
from nltk.corpus import brown

# training phase
train_tokens = []
for item in brown.items()[:10]:
    train_tokens.append(brown.read(item))
mytagger = UnigramTagger(SUBTOKENS='WORDS')
for tok in train_tokens:
    mytagger.train(tok)

# usage
text_token = Token(TEXT="John saw the book on the table")
WhitespaceTokenizer(SUBTOKENS='WORDS').tokenize(text_token)
mytagger.tag(text_token)
print text_token

# once mytagger has been built, we can test it
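# A rough sketch of that test: score mytagger on a held-out Brown text
# (the 11th item, outside the 10 used for training), assuming the
# WORDS/TEXT/TAG token layout used above; Token behaviour may differ
# across old NLTK versions.
test_token = brown.read(brown.items()[10])
gold_tags = [sub['TAG'] for sub in test_token['WORDS']]
# rebuild the text untagged, as in the usage example above
plain = Token(TEXT=' '.join([sub['TEXT'] for sub in test_token['WORDS']]))
WhitespaceTokenizer(SUBTOKENS='WORDS').tokenize(plain)
mytagger.tag(plain)
correct = 0
for sub, gold in zip(plain['WORDS'], gold_tags):
    if sub['TAG'] == gold:
        correct += 1
print 'accuracy: %.1f%%' % (100.0 * correct / len(gold_tags))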
#
# Previously, we tried to infer a token's tag based only on the
# token's own text.  Now we will use information about a token's
# context (the words surrounding it) to infer its tag.
#
# Bigram taggers: always use the word closest to the token,
# usually the preceding one.
#
# N-gram taggers: use the n words closest to the token,
# usually the n preceding words.
#
from nltk.tokenizer import *
from nltk.tagger import *
from nltk.corpus import brown

tagger = NthOrderTagger(3, SUBTOKENS='WORDS')  # 3rd order tagger
for item in brown.items()[:10]:
    tok = brown.read(item)
    tagger.train(tok)

# usage
text_token = Token(TEXT="John saw the book on the table")
WhitespaceTokenizer(SUBTOKENS='WORDS').tokenize(text_token)
tagger.tag(text_token)
print text_token
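# An nth-order tagger returns None for any context it never saw in
# training, so in practice it is chained behind simpler taggers.  A
# minimal sketch using the BackoffTagger and DefaultTagger combination
# shown in the demo below:
fallback = DefaultTagger('nn', SUBTOKENS='WORDS')
combined = BackoffTagger([tagger, fallback], SUBTOKENS='WORDS')
tok = Token(TEXT="John saw the book on the table")
WhitespaceTokenizer(SUBTOKENS='WORDS').tokenize(tok)
combined.tag(tok)
print tok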
def demo(num_files=20):
    """
    A simple demonstration function for the C{Tagger} classes.  It
    constructs a C{BackoffTagger} using a 2nd order C{NthOrderTagger},
    a 1st order C{NthOrderTagger}, a 0th order C{NthOrderTagger}, and
    a C{DefaultTagger}.  It trains and tests the tagger using the
    brown corpus.

    @type num_files: C{int}
    @param num_files: The number of files that should be used for
        training and for testing.  Two thirds of these files will be
        used for training.  All files are randomly selected
        (I{without} replacement) from the brown corpus.  If
        C{num_files>=500}, then all 500 files will be used.
    @rtype: None
    """
    from nltk.corpus import brown
    import sys, random
    num_files = max(min(num_files, 500), 3)

    # Get a randomly sorted list of files in the brown corpus.
    items = list(brown.items())
    random.shuffle(items)

    # Tokenize the training files.
    print '='*75
    sys.stdout.write('Reading training data'); sys.stdout.flush()
    train_tokens = []
    num_words = 0
    for item in items[:num_files*2/3]:
        sys.stdout.write('.'); sys.stdout.flush()
        train_tokens.append(brown.read(item))
        num_words += len(train_tokens[-1]['WORDS'])
    print '\n  Read in %d words for training' % num_words

    print 'Training taggers.'

    # Create a default tagger
    default_tagger = DefaultTagger('nn', SUBTOKENS='WORDS')

    print '  Training unigram tagger...'
    t0 = UnigramTagger(SUBTOKENS='WORDS')
    for tok in train_tokens: t0.train(tok)

    print '  Training bigram tagger...'
    t1 = NthOrderTagger(1, SUBTOKENS='WORDS')
    for tok in train_tokens: t1.train(tok)

    print '  Training trigram tagger...'
    t2 = NthOrderTagger(2, SUBTOKENS='WORDS')
    for tok in train_tokens: t2.train(tok)

    # Delete train_tokens, because it takes up lots of memory.
    del train_tokens

    # Tokenize the testing files
    test_tokens = []
    num_words = 0
    sys.stdout.write('Reading testing data'); sys.stdout.flush()
    for item in items[num_files*2/3:num_files]:
        sys.stdout.write('.'); sys.stdout.flush()
        test_tok = brown.read(item)
        num_words += len(test_tok['WORDS'])
        test_tokens.append(test_tok)
    print '\n  Read in %d words for testing' % num_words

    # Run the taggers.  For t0, t1, and t2, back off to DefaultTagger.
    # This is especially important for t1 and t2, which count on having
    # known tags as contexts; if they get a context containing None,
    # then they will generate an output of None, and so all words will
    # get tagged as None.
    print '='*75
    print 'Running the taggers on test data...'
    print '  Default (nn) tagger: ',
    sys.stdout.flush()
    _demo_tagger(test_tokens, default_tagger)
    print '  Unigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(test_tokens,
                 BackoffTagger([t0, default_tagger], SUBTOKENS='WORDS'))
    print '  Bigram tagger:       ',
    sys.stdout.flush()
    _demo_tagger(test_tokens,
                 BackoffTagger([t1, t0, default_tagger], SUBTOKENS='WORDS'))
    print '  Trigram tagger:      ',
    sys.stdout.flush()
    trigram = BackoffTagger([t2, t1, t0, default_tagger], SUBTOKENS='WORDS')
    _demo_tagger(test_tokens, trigram)

    print '\nUsage statistics for the trigram tagger:\n'
    trigram.print_usage_stats()
    print '='*75
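# To run the demonstration as a script, with its default of 20 files:
if __name__ == '__main__':
    demo()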