def no_backoff_taggers(test, train, corpus='floresta'):
    """Train unigram/bigram/trigram taggers WITHOUT backoff and print scores.

    :param test: tagged sentences used for evaluation
    :param train: tagged sentences used for training
    :param corpus: corpus name, used only for logging (default 'floresta')
    """
    default_tagger = default_tagger_corpus(corpus)

    info('training {} taggers without backoff'.format(corpus))
    info('this may take a while...\n')

    # Baseline: the corpus-wide default tagger.
    info(default_tagger)
    default_score = default_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(default_score))

    # The original repeated the train/info/evaluate/print sequence three
    # times; factored into a single loop with identical output.  Each
    # n-gram tagger is trained independently (no backoff chain).
    for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = tagger_cls(train)
        info(tagger)
        score = tagger.evaluate(test)
        print('accuracy score: {}\n'.format(score))
def get_lookup_tagger_accuracy(test_set, lookup_tagger_basis, corpus):
    """Evaluate a lookup (most-likely-tag) tagger on ``test_set``.

    The model maps each of the 200 most frequent words in
    ``lookup_tagger_basis`` to its most likely tag in ``corpus``.

    :param test_set: tagged sentences to evaluate against
    :param lookup_tagger_basis: sentences (lists of words) used to pick the
        most frequent words
    :param corpus: an NLTK corpus reader providing ``tagged_words()``
    :return: accuracy of the lookup tagger on ``test_set``
    """
    words = [word for sent in lookup_tagger_basis for word in sent]
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(corpus.tagged_words())
    most_freq_words = fd.most_common(200)
    # BUG FIX: most_common() yields (word, count) pairs, so after unpacking
    # `word` is already the word string; the original used word[0], which is
    # the word's FIRST CHARACTER, silently building a model of single letters.
    likely_tags = dict(
        (word, cfd[word].max()) for (word, _) in most_freq_words)
    baseline_tagger = UnigramTagger(model=likely_tags)
    result = baseline_tagger.evaluate(test_set)
    return result
def backoff_taggers(test, train, save, corpus='floresta'):
    """Train a unigram->bigram->trigram backoff chain, print accuracies and
    optionally pickle the best-scoring tagger.

    :param test: tagged sentences used for evaluation
    :param train: tagged sentences used for training
    :param save: when truthy, pickle the best tagger to
        '<corpus>_<name>_tagger_backoff.pkl'
    :param corpus: corpus name used for logging and the pickle filename
    """
    default_tagger = default_tagger_corpus(corpus)

    info('training {} taggers with backoff'.format(corpus))
    info('this may take a while...\n')

    info(default_tagger)
    default_score = default_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(default_score))

    # Backoff chain: trigram -> bigram -> unigram -> default.
    uni_tagger_backoff = UnigramTagger(train, backoff=default_tagger)
    bi_tagger_backoff = BigramTagger(train, backoff=uni_tagger_backoff)
    tri_tagger_backoff = TrigramTagger(train, backoff=bi_tagger_backoff)

    # Order matters: on a tie the unigram tagger is preferred, then bigram,
    # then trigram — same preference as the original elif chain.
    named_taggers = (
        ('unigram', uni_tagger_backoff),
        ('bigram', bi_tagger_backoff),
        ('trigram', tri_tagger_backoff),
    )

    scores = {}
    for label, tagger in named_taggers:
        info(tagger)
        score = tagger.evaluate(test)
        print('accuracy score: {}\n'.format(score))
        scores[label] = score

    if not save:
        return

    best_score = max(scores.values())
    for label, tagger in named_taggers:
        if scores[label] == best_score:
            tagger_file = '{}_{}_tagger_backoff.pkl'.format(corpus, label)
            # BUG FIX: the original trigram branch reused `output` without
            # opening its own file (NameError at runtime), and the uni/bi
            # branches leaked their handles; a `with` block fixes both.
            with open(tagger_file, 'wb') as output:
                dump(tagger, output, -1)
            info('saving %s...\n', tagger_file)
            break
def find_combined_taggers_accuracy(train_set, test_set):
    """Train default, regex, unigram and bigram taggers (with several
    backoff combinations) and print each one's accuracy on ``test_set``.

    :param train_set: tagged sentences used for training
    :param test_set: tagged sentences used for evaluation
    """
    # The single most frequent tag in the training data is the default guess.
    flat_tokens = [tok for sentence in train_set for tok in sentence]
    tag_counts = FreqDist(tag for (_, tag) in flat_tokens)
    default_tagger = DefaultTagger(tag_counts.max())

    # Default tagger alone.
    default_tagger_result = default_tagger.evaluate(test_set)
    print("Default Tagger accuracy: ", default_tagger_result)

    # Suffix-based regex tagger; patterns are tried top to bottom.
    patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
        (r'.*', 'NN')                     # nouns (default)
    ]
    regex_tagger = RegexpTagger(patterns)
    regex_tagger_result = regex_tagger.evaluate(test_set)
    print("Regex Tagger Accuracy: ", regex_tagger_result)

    # Unigram tagger falling back to the default tagger for unseen words.
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    unigram_tagger_result = unigram_tagger.evaluate(test_set)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          unigram_tagger_result)

    # Bigram taggers: bare, backed by the regex tagger, and backed by the
    # unigram tagger above.
    plain_bigram = BigramTagger(train_set)
    bigram_over_unigram = BigramTagger(train_set, backoff=unigram_tagger)
    bigram_over_regex = BigramTagger(train_set, backoff=regex_tagger)

    plain_bigram_result = plain_bigram.evaluate(test_set)
    bigram_over_regex_result = bigram_over_regex.evaluate(test_set)
    bigram_over_unigram_result = bigram_over_unigram.evaluate(test_set)

    print("Bigram Tagger Accuracy: ", plain_bigram_result)
    print("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
          bigram_over_regex_result)
    print("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
          bigram_over_unigram_result)
def lookup_tag(num_sampling):
    """Build a lookup tagger from the ``num_sampling`` most frequent words of
    the Brown 'news' category, tag a sample sentence, and print its score.

    :param num_sampling: how many of the most common words to put in the model
    """
    raw = 'I am applying for AIT because I can be with my parents here and I am already granted a scholarship'
    # Word frequencies over the news category.
    word_freq = FreqDist(brown.words(categories='news'))
    # Per-word tag frequencies: the most frequent tag of each word.
    tag_freq = ConditionalFreqDist(brown.tagged_words(
        categories='news'))  #, backoff=nltk.DefaultTagger('NN'))
    # The num_sampling most common words in the corpus.
    top_words = word_freq.most_common(num_sampling)
    # word -> most likely tag, exactly as a unigram lookup model expects.
    model = {w: tag_freq[w].max() for (w, _) in top_words}
    # A model-based UnigramTagger tags each known word with its stored tag
    # (no context needed), just like a unigram in the N-gram sense.
    lookup_tagger = UnigramTagger(model=model)
    print(lookup_tagger.tag(word_tokenize(raw)))
    # NOTE(review): brown_tagged_sents is a module-level name defined
    # elsewhere in this file.
    print(lookup_tagger.evaluate(brown_tagged_sents))
# Compare tags assigned by the Unigram-Tagger and the tags assigned by the current NLTK standard tagger on a single sentence: # In[12]: mySent2="This is major tom calling ground control from space".split() print("Unigram Tagger: \n",complete_tagger.tag(mySent2)) print("\nCurrent Tagger applied for NLTK pos_tag(): \n",nltk.pos_tag(mySent2,tagset='universal')) # The performance of the trained tagger is evaluated on the same corpus as applied for training. The performance measure is the rate of words that have been tagged correctly. # In[13]: print("Performance of complete Tagger: ",complete_tagger.evaluate(brown_tagged_sents)) # The rate of correctly taggged words is quite high. However, this method of evaluation is not valid, since the same corpus has been used for evaluation as for training. Therefore we split the corpus into a *training-part* and a *test-part*. The *UnigramTagger* is then trained with the *training-part* and evaluated with the disjoint *test-part*. # In[14]: size = int(len(brown_tagged_sents) * 0.9) train_sents = brown_tagged_sents[:size] test_sents = brown_tagged_sents[size:] unigram_tagger = UnigramTagger(train_sents,backoff=DefaultTagger("NN")) print("Performance of Tagger with 90% Training and 10% Testdata: ",unigram_tagger.evaluate(test_sents)) # As expected the rate of correctly tagged words is lower, but this value is now a valid evaluation measure.
# NOTE(review): orphaned return from a helper whose `def` line is outside
# this view (same body as treeSentenceToTuples elsewhere in this file):
# renders each (token, tag) pair as u"token/tag", skipping PTB bracket tokens.
    return [u"%s/%s" % (t, p) for t, p in sent.pos() if not t in ["-LRB-", "-RRB-"]]


# Python 2 script: train a unigram tagger, then a Brill tagger, on treebank
# files given on the command line.
if __name__ == "__main__":
    # NOTE(review): usage text shows a single <corpus> argument but the check
    # requires len(sys.argv) >= 3 — confirm the intended arity.
    if len(sys.argv) < 3:
        print "Usage:\n\t%s <corpus>" % sys.argv[0]
        sys.exit(-1)

    # Prepare corpus: tagged sentences from all input files, shuffled and
    # split 50/50 into train/test (Python 2 integer division).
    tagged_sents = build_tagged_sents(sys.argv[1:])
    random.shuffle(tagged_sents)
    tagged_train = tagged_sents[: len(tagged_sents) / 2]
    tagged_test = tagged_sents[len(tagged_sents) / 2 :]

    # Train unigram tagger
    print "Training unigram tagger..."
    unigram_tagger = UnigramTagger(tagged_train)
    print "\taccuracy: %f" % unigram_tagger.evaluate(tagged_test)

    # Train brill tagger
    print "Training Brill tagger..."
    # Brill rule templates: tag and word context windows of 1-3 tokens
    # (pre-NLTK-3 nltk.tag.brill API).
    templates = [
        # Context tag in a 1, 2 and 3 word window
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)),
        # Context word in a 1, 2 and 3 word window
        SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 1)),
        SymmetricProximateTokensTemplate(ProximateWordsRule, (2, 2)),
        SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 2)),
        SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 3)),
        # Closest tag
        ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)),
        # NOTE(review): template list continues past this view — left open.
# Closes a list literal opened outside this view.
]

# Python 2 script (duplicate of the Brill-training main block above).
if __name__ == "__main__":
    # NOTE(review): usage text shows one <corpus> argument but the check
    # requires len(sys.argv) >= 3 — confirm the intended arity.
    if len(sys.argv) < 3:
        print "Usage:\n\t%s <corpus>" % sys.argv[0]
        sys.exit(-1)

    # Prepare corpus: tagged sentences from all input files, shuffled and
    # split 50/50 into train/test (Python 2 integer division).
    tagged_sents = build_tagged_sents(sys.argv[1:])
    random.shuffle(tagged_sents)
    tagged_train = tagged_sents[:len(tagged_sents) / 2]
    tagged_test = tagged_sents[len(tagged_sents) / 2:]

    # Train unigram tagger
    print "Training unigram tagger..."
    unigram_tagger = UnigramTagger(tagged_train)
    print "\taccuracy: %f" % unigram_tagger.evaluate(tagged_test)

    # Train brill tagger
    print "Training Brill tagger..."
    # Brill rule templates: tag and word context windows of 1-3 tokens
    # (pre-NLTK-3 nltk.tag.brill API).
    templates = [
        # Context tag in a 1, 2 and 3 word window
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)),
        # Context word in a 1, 2 and 3 word window
        SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 1)),
        SymmetricProximateTokensTemplate(ProximateWordsRule, (2, 2)),
        SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 2)),
        SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 3)),
        # Closest tag
        ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)),
        # NOTE(review): template list continues past this view — left open.
test_tsents = tagged_sents[:100] # Pega todas sentenças até a centésima from nltk import DefaultTagger # Define um tagger padrão, que sempre etiquetará a palavra com "N" = "NOUM" = "SUBSTANTIVO", visto que é a tag que mais ocorre tagger0 = DefaultTagger("N") # Avalia a acurácia do POS-Tagger ao etiquetar as sentenças de TESTE tagger0.evaluate(test_tsents) from nltk import UnigramTagger # Define um tagger Unigram (falaremos mais sobre isso depois) # Este tagger aprende ao ver as sentenças etiquetadas na base de TREINAMENTO # Além disso, utiliza o DefaultTagger caso não saiba o que marcar tagger1 = UnigramTagger(train_tsents, backoff=tagger0) tagger1.evaluate(test_tsents) from nltk import BigramTagger # Define um tagger Bigram (falaremos mais sobre isso depois) tagger2 = BigramTagger(train_tsents, backoff=tagger1) tagger2.evaluate(test_tsents) # Existe ainda mais um POS-Tagger no NLTK, o TnT from nltk.tag import tnt tnt_pos_tagger = tnt.TnT() tnt_pos_tagger.train(train_tsents) tnt_pos_tagger.evaluate(test_tsents) # Se deseja apenas realizar o POS-Tagging, e não avaliar tagger2.tag(tokenize.word_tokenize(texto, language='portuguese'))
(r'.*ould$', 'MD'), # modals (r'.*\'s$', 'NN$'), # possessive nouns (r'.*s$', 'NNS'), # plural nouns (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'.*', 'NN') # nouns (default) ... ] rt = RegexpTagger(patterns) rt.evaluate(test_data) ut = UnigramTagger(train_data) bt = BigramTagger(train_data) tt = TrigramTagger(train_data) ut.evaluate(test_data) def combined_tagger(train_data, taggers, backoff=None): for tagger in taggers: backoff = tagger(train_data, backoff=backoff) return backoff ct = combined_tagger(train_data=train_data, taggers=[UnigramTagger, BigramTagger, TrigramTagger], backoff=rt) tree = parsetree(sentence) for sentence_tree in tree:
def performance(wordList):
    """Accuracy of a lookup tagger built from ``wordList``, backing off to
    a constant 'NN' tagger; evaluated against the module-level taggedSents.

    NOTE(review): items are unpacked as (word, freq) yet indexed with
    word[0] — confirm the items really are nested tuples rather than plain
    strings, otherwise the model is keyed on first characters.
    """
    model = {}
    for entry, _freq in wordList:
        # Skip entries the conditional frequency distribution has never seen.
        if len(cfd[entry[0]]):
            model[entry[0]] = cfd[entry[0]].max()
    if not model:
        return 0
    lookup = UnigramTagger(model=model, backoff=DefaultTagger("NN"))
    return lookup.evaluate(taggedSents)
from nltk import UnigramTagger
from nltk.corpus import treebank
from tag_util import word_tag_model

# Build a word -> most-likely-tag model over the entire treebank and wrap it
# in a model-based (lookup) UnigramTagger — no training pass needed.
model = word_tag_model(treebank.words(), treebank.tagged_words())
tagger = UnigramTagger(model=model)

# Evaluate on sentences 3000 onward.
# NOTE(review): the model was built from the WHOLE corpus, so these "test"
# sentences overlap the model's source data — the score is optimistic.
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
def createModel(self):
    """Train a chain of backoff taggers (regexp -> unigram -> bigram ->
    trigram -> N-gram), evaluate each, and pickle the best-scoring one.

    Python 2 code (print statements, `except Exception,e`).  `model_name`
    holds the saved path but is never returned — NOTE(review): possibly a
    leftover; confirm callers don't expect a return value.
    """
    model_name=None
    try:
        # Extra unigram training material (e.g. dictionary words).
        unigrams=self.buildUnigrams()

        N=len(self.corpusSents)
        toTraining=round(self.training_portion*N)

        #logging.info("Total sentences:" + str(N))
        training=self.corpusSents[:toTraining]
        test=self.corpusSents[toTraining:]

        # Collect (regex, tag) patterns from the instance list and config,
        # decoding to unicode where possible (Python 2 str/unicode mix).
        post_patterns=[]
        for regex,post in self.regex_list:
            try:
                regex=regex.decode('utf-8')
            except:
                pass
            post_patterns.append((regex,post))

        for regex,post in self.config.items('postaggers.regex'):
            post_patterns.append((regex.decode('utf-8'),post))

        # Backoff chain: regexp -> unigram -> bigram -> trigram -> N-gram.
        regexpTagger  = RegexpTagger(post_patterns)
        unigramTagger = UnigramTagger(unigrams+training,backoff=regexpTagger)
        bigramTagger= BigramTagger(training, backoff=unigramTagger)
        trigramTagger = TrigramTagger(training, backoff=bigramTagger)
        NTagger=NgramTagger(self.max_ngrams,training,backoff=trigramTagger)

        print("Sentencias de entrenamiento para n-taggers:" + str(len(training)))
        print("Sentencias de entrenamiento para unitaggers:" + str(len(unigrams)))
        print("Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:" + str(len(unigrams)))
        print("Sentencias para testing:" + str(len(test)))
        print("Expresiones regulares para el Tagger:")

        for post_regex in post_patterns:
            print post_regex

        # Only evaluate when a test split actually exists.
        if self.training_portion!=1:
            score_ut=unigramTagger.evaluate(test)
            # NOTE(review): small fixed penalty on the bigram score —
            # presumably a tie-breaker against the other taggers; confirm.
            score_bt=bigramTagger.evaluate(test)-0.002
            score_tt=trigramTagger.evaluate(test)
            score_nt=NTagger.evaluate(test)

            scores=[score_ut,score_bt,score_tt,score_nt]
            tagger_names=["uTagger","biTagger","triTagger","NTagger"]
            taggers=[unigramTagger,bigramTagger,trigramTagger,NTagger]

            bestTagger_index= scores.index(max(scores))
            best_msg=max(scores),tagger_names[bestTagger_index]

        # NOTE(review): if training_portion == 1 the names below are unbound
        # and the except clause swallows the NameError — confirm intent.
        # Avoid clobbering an existing pickle by appending a counter.
        fname=self.taggers_path + tagger_names[bestTagger_index]
        if os.path.isfile(fname+self.tagger_extension_file):
            fname=fname+str(len(listdir(self.taggers_path)))+self.tagger_extension_file
        else:
            fname=self.taggers_path + tagger_names[bestTagger_index]+self.tagger_extension_file

        model=taggers[bestTagger_index]

        f = open(fname,'wb')
        pickle.dump(model, f)
        f.close()

        print ("Guardando el tagger :" + fname)
        #logging.info("Saving the best tagger :" + fname)

        model_name=fname

    except Exception,e:
        # Broad catch + interactive debugger on any failure.
        print "ERRPR EN POS TAGGER GENERATOR:",str(e)
        pdb.set_trace()
def treeSentenceToTuples(sent):
    """
    Render each (token, POS) pair of the sentence as u"token/POS", skipping
    PTB bracket tokens.

    :param sent: a Tree representing a sentence
    :type sent: nltk.tree.Tree
    """
    return [u"%s/%s"%(t,p) for t,p in sent.pos() if not t in ["-LRB-", "-RRB-"]]


# Python 2 script: read parse trees (one per line) from the given files,
# alternate lines into training/testing, and evaluate a unigram tagger.
if __name__ == "__main__":
    # NOTE(review): usage text shows one <corpus> argument but the check
    # requires len(sys.argv) >= 3 — confirm the intended arity.
    if len(sys.argv) < 3:
        print "Usage:\n\t%s <corpus>" % sys.argv[0]
        sys.exit(-1)

    training = []
    testing = []
    lineIdx = 0
    for fname in sys.argv[1:]:
        fin = codecs.open(fname, "r", "utf-8")
        for line in fin:
            lineIdx += 1
            # Tree.parse is the pre-NLTK-3 name (Tree.fromstring today).
            t = Tree.parse(line)
            # Even lines train, odd lines test (50/50 interleaved split).
            if lineIdx % 2 == 0:
                training.append( t.pos() )
            else:
                testing.append( t.pos() )
        fin.close()

    # Train tagger
    unigram_tagger = UnigramTagger(training)

    # Evaluate
    print "Accuracy: %f" % unigram_tagger.evaluate(testing)
regexp_tagger = RegexpTagger([ (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'DET'), # articles (r'.*able$', 'ADJ'), # adjectives (r'.*ness$', 'NOUN'), # nouns formed from adjectives (r'.*ly$', 'ADV'), # adverbs (r'.*s$', 'NOUN'), # plural nouns (r'.*ing$', 'VERB'), # gerunds (r'.*ed$', 'VERB'), # past tense verbs (r'.*', 'NOUN') # nouns (default) ]) #Affix tagger at2 = AffixTagger(train, backoff=regexp_tagger) #Unigram tagger ut3 = UnigramTagger(train, backoff=at2) ut3.evaluate(test) # Ngram tagger ct3 = NgramTagger(3, train, backoff=ut3) google3.EnsureDir("tagged/") for i in range(0, 12): try: tokenFile = codecs.open(os.path.join("clean", "Tokens-%s.txt" % (i)), "r", encoding="utf-8") taggedFile = codecs.open(os.path.join("tagged", "Tagged-%s.txt" % (i)), "a", encoding="utf-8") tokenList = tokenFile.read().splitlines() # taggedTokens = nltk.pos_tag(tokenList, tagset='universal') taggedTokens = ct3.tag(tokenList)
def performance(cfd, wordlist):
    """Score a most-likely-tag lookup tagger on Brown 'news'.

    Builds a model mapping each wordlist entry's word (entry[0]) to its most
    frequent tag in ``cfd``, with a constant 'NN' tagger as backoff.

    :param cfd: conditional frequency distribution of word -> tag counts
    :param wordlist: iterable of entries whose first element is the word
    :return: accuracy over the Brown news tagged sentences
    """
    model = {entry[0]: cfd[entry[0]].max() for entry in wordlist}
    tagger = UnigramTagger(model=model, backoff=DefaultTagger('NN'))
    return tagger.evaluate(brown.tagged_sents(categories='news'))
def createModel(self):
    """Train a chain of backoff taggers (regexp -> unigram -> bigram ->
    trigram -> N-gram), evaluate each, and pickle the best-scoring one.

    Mixed Python 2/3 formatting (print statements, `except Exception, e`).
    `model_name` holds the saved path but is never returned —
    NOTE(review): possibly a leftover; confirm callers don't expect it.
    """
    model_name = None
    try:
        # Extra unigram training material (e.g. dictionary words).
        unigrams = self.buildUnigrams()

        N = len(self.corpusSents)
        toTraining = round(self.training_portion * N)

        #logging.info("Total sentences:" + str(N))
        training = self.corpusSents[:toTraining]
        test = self.corpusSents[toTraining:]

        # Collect (regex, tag) patterns from the instance list and config,
        # decoding to unicode where possible (Python 2 str/unicode mix).
        post_patterns = []
        for regex, post in self.regex_list:
            try:
                regex = regex.decode('utf-8')
            except:
                pass
            post_patterns.append((regex, post))

        for regex, post in self.config.items('postaggers.regex'):
            post_patterns.append((regex.decode('utf-8'), post))

        # Backoff chain: regexp -> unigram -> bigram -> trigram -> N-gram.
        regexpTagger = RegexpTagger(post_patterns)
        unigramTagger = UnigramTagger(unigrams + training,
                                      backoff=regexpTagger)
        bigramTagger = BigramTagger(training, backoff=unigramTagger)
        trigramTagger = TrigramTagger(training, backoff=bigramTagger)
        NTagger = NgramTagger(self.max_ngrams, training,
                              backoff=trigramTagger)

        print("Sentencias de entrenamiento para n-taggers:" +
              str(len(training)))
        print("Sentencias de entrenamiento para unitaggers:" +
              str(len(unigrams)))
        print(
            "Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:"
            + str(len(unigrams)))
        print("Sentencias para testing:" + str(len(test)))
        print("Expresiones regulares para el Tagger:")

        for post_regex in post_patterns:
            print post_regex

        # Only evaluate when a test split actually exists.
        if self.training_portion != 1:
            score_ut = unigramTagger.evaluate(test)
            # NOTE(review): small fixed penalty on the bigram score —
            # presumably a tie-breaker against the other taggers; confirm.
            score_bt = bigramTagger.evaluate(test) - 0.002
            score_tt = trigramTagger.evaluate(test)
            score_nt = NTagger.evaluate(test)

            scores = [score_ut, score_bt, score_tt, score_nt]
            tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
            taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]

            bestTagger_index = scores.index(max(scores))
            best_msg = max(scores), tagger_names[bestTagger_index]

        # NOTE(review): if training_portion == 1 the names below are unbound
        # and the except clause swallows the NameError — confirm intent.
        # Avoid clobbering an existing pickle by appending a counter.
        fname = self.taggers_path + tagger_names[bestTagger_index]
        if os.path.isfile(fname + self.tagger_extension_file):
            fname = fname + str(len(listdir(
                self.taggers_path))) + self.tagger_extension_file
        else:
            fname = self.taggers_path + tagger_names[
                bestTagger_index] + self.tagger_extension_file

        model = taggers[bestTagger_index]

        f = open(fname, 'wb')
        pickle.dump(model, f)
        f.close()

        print("Guardando el tagger :" + fname)
        #logging.info("Saving the best tagger :" + fname)

        model_name = fname

    except Exception, e:
        # Broad catch + interactive debugger on any failure.
        print "ERRPR EN POS TAGGER GENERATOR:", str(e)
        pdb.set_trace()
from nltk import DefaultTagger, UnigramTagger, BigramTagger
from nltk.corpus import treebank

# Standard treebank split: first 3000 sentences for training, rest for test.
train_set = treebank.tagged_sents()[:3000]
test_set = treebank.tagged_sents()[3000:]

# NOTE(review): despite the name, `bitagger` is a UnigramTagger — the name
# is misleading but may be referenced later in the file, so it is kept.
bitagger = UnigramTagger(train_set)
# How good is it, from 0 to 1?
print(bitagger.evaluate(test_set))
# Try it on a sentence nobody has seen before.
print(bitagger.tag("I love Alessia too much her since years".split()) )

# Question: what if I want to use a custom train/test set instead of the
# treebank ones?
# Solution: build it myself and, as usual, split it into train & test.
# NOTE(review): this list literal continues past this view — left open.
custom_set = [[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')],
              [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')],
              [('Rudolph', 'NNP'), ('Agnew', 'NNP'), (',', ','), ('55', 'CD'), ('years', 'NNS'), ('old', 'JJ'), ('and', 'CC'), ('former', 'JJ'), ('chairman', 'NN'), ('of', 'IN'), ('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP'), ('PLC', 'NNP'), (',', ','), ('was', 'VBD'), ('named', 'VBN'), ('*-1', '-NONE-'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('of', 'IN'), ('this', 'DT'), ('British', 'JJ'), ('industrial', 'JJ'), ('conglomerate', 'NN'), ('.', '.')],