def crossValidate(corpus, test_precent):
    """Cross-validate a bigram tagger with a full backoff chain over *corpus*.

    The corpus is split into consecutive folds of ``test_precent`` percent
    each; every fold serves once as the test set while the remainder trains
    a bigram -> unigram -> affix -> regexp -> default-'NN' chain.

    :param corpus: list of tagged sentences (lists of ``(word, tag)`` pairs).
    :param test_precent: fold size as a percentage of the corpus (e.g. 10).
    :returns: ``(summarize, mean)`` — a list of ``(fold_index, accuracy)``
        pairs and their average accuracy.
    :raises ValueError: if the fold size rounds down to zero sentences
        (the original code divided by zero here).
    """
    summarize = []
    corpus_len = len(corpus)
    cut = int((test_precent / 100.0) * corpus_len)
    if cut == 0:
        raise ValueError('test_precent yields an empty fold for this corpus')
    # Floor division: plain '/' returns a float on Python 3 and breaks range().
    folds = corpus_len // cut
    mean = 0.0
    for i in range(folds):
        test = corpus[i * cut:cut * (i + 1)]
        train = corpus[:i * cut] + corpus[cut * (i + 1):]
        nn_tagger = nltk.DefaultTagger('NN')
        regexp_tagger = nltk.RegexpTagger(
            [
                # Dot escaped: unescaped '.' let e.g. '1a5' match as a number.
                (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),  # articles
                (r'.*able$', 'JJ'),  # adjectives
                (r'.*ness$', 'NN'),  # nouns formed from adjectives
                (r'.*ly$', 'RB'),  # adverbs
                (r'.*s$', 'NNS'),  # plural nouns
                (r'.*ing$', 'VBG'),  # gerunds
                (r'.*ed$', 'VBD'),  # past tense verbs
                (r'.*', 'NN')  # nouns (default)
            ],
            backoff=nn_tagger)
        at2 = nltk.AffixTagger(train, backoff=regexp_tagger)
        ut3 = nltk.UnigramTagger(train, backoff=at2)
        ct2 = nltk.NgramTagger(2, train, backoff=ut3)
        accu = float(ct2.evaluate(test))
        summarize.append((i, accu))
        mean += accu
    return (summarize, mean / folds)
def _build_tagger():
    """Lazily initialize the module-level ``tagger``.

    If ``tagger`` is already set, do nothing. Otherwise load a pickled
    tagger from ``tagger_path`` when one exists; else train a 4-gram ->
    trigram -> bigram -> unigram -> default-'NOUN' backoff chain on the
    Floresta + Mac-Morpho corpora and persist it via ``object_io``.
    """
    global tagger
    # Identity comparison, not '!= None'; checked before any other work.
    if tagger is not None:
        return
    tagger_file = Path(tagger_path)  # renamed: 'file' shadowed the builtin
    if tagger_file.is_file():
        tagger = object_io.read_object(tagger_path)
    else:
        print('{} - Building train data...'.format(datetime.now()))
        dataset = nltk.corpus.floresta.tagged_sents() + \
            nltk.corpus.mac_morpho.tagged_sents()
        # Normalize every tag with the project's _simplify_tag helper.
        traindata = [[(w, _simplify_tag(t)) for (w, t) in sent]
                     for sent in dataset]
        print('{} - Training POS tagging model...'.format(datetime.now()))
        tagger = nltk.NgramTagger(
            4, traindata,
            backoff=nltk.TrigramTagger(
                traindata,
                backoff=nltk.BigramTagger(
                    traindata,
                    backoff=nltk.UnigramTagger(
                        traindata,
                        backoff=nltk.DefaultTagger('NOUN')))))
        print('{} - Saving tagger object...'.format(datetime.now()))
        object_io.save_object(tagger, tagger_path)
def main():
    """Train the bigram backoff chain on Brown 'news' (all but the first
    100 sentences) and print its confusion matrix over the held-out 100.
    """
    # Monkey-patch the project-defined ConfusionMatrix onto every tagger.
    nltk.TaggerI.ConfusionMatrix = ConfusionMatrix
    brown_news_tagged = brown.tagged_sents(categories='news')
    brown_train = brown_news_tagged[100:]
    brown_test = brown_news_tagged[:100]
    nn_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger(
        [
            # Dot escaped: unescaped '.' let e.g. '1a5' match as a number.
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN')  # nouns (default)
        ],
        backoff=nn_tagger)
    at2 = nltk.AffixTagger(brown_train, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(brown_train, backoff=at2)
    ct2 = nltk.NgramTagger(2, brown_train, backoff=ut3)
    # Parenthesized call: identical output on Python 2 and valid on Python 3.
    print(ct2.ConfusionMatrix(brown_test))
def getTrainedTagger():
    """Return a bigram tagger trained on a lower-cased, simplified-tag
    Brown corpus, backed off through unigram, affix, regexp and a default
    'NN' tagger.
    """
    tagged = brown.tagged_sents(simplify_tags=True)
    # Lower-case every word while keeping its tag.
    lowered = [[(word.lower(), tag) for word, tag in sentence]
               for sentence in tagged]
    # Regexp fallback rules; the final catch-all tags anything left as NN.
    patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN')  # nouns (default)
    ]
    fallback = nltk.RegexpTagger(patterns, backoff=nltk.DefaultTagger('NN'))
    affix = nltk.AffixTagger(lowered, backoff=fallback)
    unigram = nltk.UnigramTagger(lowered, backoff=affix)
    return nltk.NgramTagger(2, lowered, backoff=unigram)
def question5():
    """Train and evaluate n-gram taggers of order 1..6 on Brown 'news'.

    Bug fix: the original looped ``i in range(6)`` and built
    ``NgramTagger(i)`` while printing the label ``i + 1`` — so the line
    labelled '1' actually reported a meaningless 0-gram tagger. Now the
    trained order and the printed label agree.
    """
    a = brown.tagged_sents(categories='news')
    split = int(len(a) * 0.9)  # 90% train / 10% test
    train = a[:split]
    test = a[split:]
    n = 6
    for i in range(1, n + 1):
        x = nltk.NgramTagger(i, train)
        print(str(i) + ': ' + str(x.evaluate(test)))
def trainTagger(pos_tagged):
    """Train a 4-gram -> trigram -> bigram -> unigram -> default backoff
    chain and report its accuracy on a held-out 10% split.

    Bug fix: the original computed ``train_sents``/``test_sents`` but then
    trained every tagger on the *whole* corpus, so the evaluation leaked
    the test sentences into training. Taggers now train on the 90% split.

    :param pos_tagged: list of tagged sentences.
    :returns: the trained 4-gram tagger (head of the backoff chain).
    """
    size = int(len(pos_tagged) * 0.9)
    train_sents = pos_tagged[:size]
    test_sents = pos_tagged[size:]
    tagger = nltk.UnigramTagger(train=train_sents, verbose=True,
                                backoff=nltk.DefaultTagger('None'))
    tagger = nltk.BigramTagger(train=train_sents, verbose=True,
                               backoff=tagger)
    tagger = nltk.NgramTagger(3, train=train_sents, verbose=True,
                              backoff=tagger)
    tagger = nltk.NgramTagger(4, train=train_sents, verbose=True,
                              backoff=tagger)
    print('(train={} , test={} , evaluate={})'.format(
        size, len(pos_tagged) - size, tagger.evaluate(test_sents)))
    return tagger
def getTagger(self):
    """Return a bigram tagger trained on the full Brown 'news' corpus,
    with a unigram -> affix -> regexp -> default-'NN' backoff chain.
    """
    corpus = brown.tagged_sents(categories='news')
    # Rule table for the regexp fallback; the last rule catches everything.
    patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN')  # nouns (default)
    ]
    fallback = nltk.RegexpTagger(patterns, backoff=nltk.DefaultTagger('NN'))
    affix = nltk.AffixTagger(corpus, backoff=fallback)
    unigram = nltk.UnigramTagger(corpus, backoff=affix)
    return nltk.NgramTagger(2, corpus, backoff=unigram)
def getTaggerAndTestSetInSimplifiedMode(taggerName):
    """Build the backoff chain on simplified Brown 'news' tags and return
    ``(tagger, test_sentences)`` for the tagger named *taggerName*.

    Recognized names: ``DefaultTagger``, ``RegExpTagger``, ``AffixTagger``,
    ``UnigramTagger``, ``BigramTagger``. Any other name returns ``None``,
    matching the original fall-through behaviour. The five-deep nested
    if/else ladder is replaced by a flat dispatch table.
    """
    brown_news_taggedS = brown.tagged_sents(categories='news',
                                            simplify_tags=True)
    brown_trainS = brown_news_taggedS[100:]
    brown_testS = brown_news_taggedS[:100]
    nn_taggerS = nltk.DefaultTagger('NN')
    regexp_taggerS = nltk.RegexpTagger(
        [
            # Dot escaped: unescaped '.' let e.g. '1a5' match as a number.
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN')  # nouns (default)
        ],
        backoff=nn_taggerS)
    at2S = nltk.AffixTagger(brown_trainS, backoff=regexp_taggerS)
    ut3S = nltk.UnigramTagger(brown_trainS, backoff=at2S)
    ct2S = nltk.NgramTagger(2, brown_trainS, backoff=ut3S)
    taggers = {
        "DefaultTagger": nn_taggerS,
        "RegExpTagger": regexp_taggerS,
        "AffixTagger": at2S,
        "UnigramTagger": ut3S,
        "BigramTagger": ct2S,
    }
    if taggerName in taggers:
        return taggers[taggerName], brown_testS
    # Unknown name: fall through and return None implicitly, as before.
def main():
    """Compare evaluate2 (known- vs unknown-word accuracy) across the
    backoff chain, using an 'UNKNOWN' default instead of 'NN'.
    """
    # Monkey-patch the project-defined evaluate2 onto every tagger.
    nltk.TaggerI.evaluate2 = evaluate2
    brown_news_tagged = brown.tagged_sents(categories='news')
    brown_train = brown_news_tagged[100:]
    brown_test = brown_news_tagged[:100]
    # No DefaultTagger backoff here: words missed by every rule get the
    # catch-all 'UNKNOWN' tag from the last pattern.
    regexp_tagger = nltk.RegexpTagger(
        [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'UNKNOWN')  # unknown (default)
        ],
        backoff=None)
    at2 = nltk.AffixTagger(brown_train, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(brown_train, backoff=at2)
    ct2 = nltk.NgramTagger(2, brown_train, backoff=ut3)
    # NOTE(review): evaluate2 appears to return a pair; only e[0] is
    # %-formatted into the string, e[1] is printed after it via the
    # Python 2 print-comma syntax — confirm that layout is intended.
    e = regexp_tagger.evaluate2(brown_test)
    print "evaluate2 regExp(default unknown) = accoracy unkown words: %f ,accuracy known words: " % e[0], e[1]
    e = at2.evaluate2(brown_test)
    print "evaluate2 affix(regExp(default unknown)) = accoracy unkown words: %f ,accuracy known words: " % e[0], e[1]
    e = ut3.evaluate2(brown_test)
    print "evaluate2 unigram(affix(regExp(default unknown))) = accoracy unkown words: %f ,accuracy known words: " % e[0], e[1]
    e = ct2.evaluate2(brown_test)
    print "evaluate2 bigram(unigram(affix(regExp(default unknown)))) = accoracy unkown words: %f ,accuracy known words: " % e[0], e[1]
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import brown

# 90/10 split of the Brown 'news' corpus into training / held-out sentences.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

# Train n-gram taggers of increasing order and score each on both splits;
# training accuracy rises with n while held-out accuracy collapses.
print('Ngram Tagger Ecaluate Score')
print(' train_sents test_sents')
for order in range(1, 7):
    model = nltk.NgramTagger(order, train_sents)
    train_acc = model.evaluate(train_sents)
    test_acc = model.evaluate(test_sents)
    print('i=%d %.4f %.4f' % (order, train_acc, test_acc))
def process_text(text):
    """Lowercase, tokenize, stopword-filter and stem one tweet.

    Returns a (possibly empty) list of stemmed tokens. Bug fix: the
    original assigned ``map(stemmer.stem, tokens)``, which on Python 3 is
    a lazy map object — always truthy, so the caller's ``if tokens:``
    guard never filtered empty tweets. A list comprehension restores the
    intended behaviour on both Python versions.
    """
    text = text.lower()
    # Tokenizing (tokenizer/stopwords are module-level, defined elsewhere)
    tokens = [
        token for token in tokenizer.tokenize(text)
        if token not in stopwords
    ]
    # Stemming
    tokens = [stemmer.stem(token) for token in tokens]
    # # Lemmatizing
    # tokens = map(lemmatizer.lemmatize, tokens)
    return tokens


if __name__ == '__main__':
    df = pd.read_csv('dataset.csv', nrows=80000, error_bad_lines=False)
    tagged_tokens = []
    for tweet, sentiment in df[['SentimentText', 'Sentiment']].values:
        tokens = process_text(tweet)
        if tokens:
            tagged_tokens.append([(token, sentiment) for token in tokens])
    # Floor division: '3 / 4' yields a float on Python 3, which is not a
    # valid slice index. 75% train / 25% test.
    split = len(tagged_tokens) * 3 // 4
    training_data = tagged_tokens[:split]
    test_data = tagged_tokens[split:]
    tagger = nltk.NgramTagger(2, train=training_data)
    # Parenthesized: identical on Python 2, valid on Python 3.
    print(tagger.evaluate(test_data))
def main(): #ploting the distribution graph # getDistSentByLength() ############################################################# #cycle of training-testing First case - Random split 90%-10%# ############################################################# train, test = stratifiedSamples([getAllTaggedCorpus()], 10) nn_tagger = nltk.DefaultTagger('NN') regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ],backoff=nn_tagger) at2 = nltk.AffixTagger(train, backoff=regexp_tagger) ut3 = nltk.UnigramTagger(train, backoff=at2) ct2 = nltk.NgramTagger(2, train, backoff=ut3) print "evaluate bigram(unigram(affix(regExp(default nn)))) Random Split= " ,ct2.evaluate(test) ############################################################################################### #cycle of training-testing second case - Stratified split 90%-10% according to sentence length# ############################################################################################### classes = divideToLengthClasses() train, test = stratifiedSamples(classes, 10) nn_tagger = nltk.DefaultTagger('NN') regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ],backoff=nn_tagger) at2 = nltk.AffixTagger(train, backoff=regexp_tagger) ut3 = nltk.UnigramTagger(train, backoff=at2) ct2 = nltk.NgramTagger(2, train, backoff=ut3) print "evaluate bigram(unigram(affix(regExp(default nn)))) Length 
split = " ,ct2.evaluate(test) ################################################################################################# #cycle of training-testing Third case - Stratified split 90%-10% according to the sentence genre# ################################################################################################# classes = divideToGenereClasses() train, test = stratifiedSamples(classes, 10) nn_tagger = nltk.DefaultTagger('NN') regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ],backoff=nn_tagger) at2 = nltk.AffixTagger(train, backoff=regexp_tagger) ut3 = nltk.UnigramTagger(train, backoff=at2) ct2 = nltk.NgramTagger(2, train, backoff=ut3) print "evaluate bigram(unigram(affix(regExp(default nn)))) Genere split = " ,ct2.evaluate(test)
    # Tail of the regex_patterns list (its opening bracket is above this
    # chunk): Portuguese contractions and prepositions tagged as ADP.
    (r"^[nN]est[ae]s?$", "ADP"),
    (r"^[nN]um$", "ADP"),
    (r"^[nN]ess[ae]s?$", "ADP"),
    (r"^[nN]aquel[ae]s?$", "ADP"),
    (r"^\xe0$", "ADP"),  # the accented preposition 'à'
]
# Backoff cascade: regex rules first, then 10-gram down through bigram and
# unigram, then a suffix (last 4 chars) tagger, finally default 'NOUN'.
# NOTE(review): orders above trigram rarely match and mostly add memory
# cost — confirm the 10..4 levels are worth keeping.
tagger = nltk.RegexpTagger(regex_patterns,
    backoff = nltk.NgramTagger(10, traindata,
    backoff = nltk.NgramTagger(9, traindata,
    backoff = nltk.NgramTagger(8, traindata,
    backoff = nltk.NgramTagger(7, traindata,
    backoff = nltk.NgramTagger(6, traindata,
    backoff = nltk.NgramTagger(5, traindata,
    backoff = nltk.NgramTagger(4, traindata,
    backoff = nltk.NgramTagger(3, traindata,
    backoff = nltk.NgramTagger(2, traindata,
    backoff=nltk.UnigramTagger(traindata,
    backoff=nltk.AffixTagger(traindata, affix_length=-4,
    backoff=nltk.DefaultTagger("NOUN")
    ))))))))))))
# Refine the cascade with transformation-based (Brill) learning, capped at
# 100 rules, using the standard fntbl37 template set.
templates = nltk.brill.fntbl37()
tagger = nltk.BrillTaggerTrainer(tagger, templates)
tagger = tagger.train(traindata, max_rules=100)
# Persist the trained tagger for later reuse.
with open("tagger_2.pkl", "wb") as f:
    pickle.dump(tagger, f)
# Concept of N-Gram tagging from nltk.corpus import brown import nltk brown_tagged_sents = brown.tagged_sents(categories='news') brown_sents = brown.sents(categories='news') tagger = nltk.NgramTagger(len(brown_tagged_sents), train=brown_tagged_sents) print( tagger.tag( nltk.word_tokenize('We are using the programming language Python'))) print(tagger.evaluate(brown_tagged_sents))
# Tag the (previously tokenized) Obama text with the corpus's single most
# common tag applied to every token.
default_tagger = DefaultTagger(most_common_tag)
def_tagged_barack = default_tagger.tag(tokenised_barack)
print(def_tagged_barack)
print(
    "____________________Lookup Taggers_____________________________________")
# Lookup taggers
# Ngram taggers: context dependent taggers
sent1 = "the quick brown fox jumps over the lazy dog"
training_tags = nltk.pos_tag(nltk.word_tokenize(sent1))
print(training_tags)
print(list(nltk.ngrams(nltk.word_tokenize(sent1), 2)))
# Now use these tags to train the n-gram tagger (n=2: one tag of context).
ngarm_tagger = nltk.NgramTagger(n=2, train=[training_tags])
print(ngarm_tagger)
sent2 = "the lazy dog was jumped over by the quick brown fox"
training_tags_sent2 = nltk.pos_tag(nltk.word_tokenize(sent2))
print(list(nltk.ngrams(nltk.word_tokenize(sent2), 2)))
#print(training_tags_sent2)
# Tag a different sentence: contexts unseen in sent1 come back as None.
sent2_taggers = ngarm_tagger.tag(nltk.word_tokenize(sent2))
print(sent2_taggers)
print("________________unigrams Taggers_____________________________________")
# Unigram tagger (training text continues past this chunk)
bush ="George Walker Bush (born July 6, 1946) is an American politician who served as the 43rd President of the United States" \
" from 2001 to 2009. He had previously served as the 46th Governor of Texas from 1995 to 2000.Bush was born in New Haven, " \
"Connecticut, and grew up in Texas. After graduating from Yale University in 1968 and Harvard Business School in 1975, he" \
print(unigram_tagger.tag(nltk.word_tokenize('I am studying NLP'))) #None for unseen word print(unigram_tagger.evaluate(brown_tagged_sents)) #Separating Training and Testing size = int(len(brown_tagged_sents) * 0.9) train = brown_tagged_sents[:size] test = brown_tagged_sents[size:] unigram_tagger = nltk.UnigramTagger(train) print(unigram_tagger.evaluate(test)) ######################################################## ############### NGram Tagger ############### ######################################################## #Judges the tag based on the other N-1 tags, analyzes word and context brown_tagged_sents = brown.tagged_sents(categories='news') brown_sents = brown.sents(categories='news') #Ngram tagger - expects a value of N - num of tokes to judge the tag ngram_tagger = nltk.NgramTagger(4, train=brown_tagged_sents) print(ngram_tagger.tag(nltk.word_tokenize('We are studying NLP')))
# Continue building the training data (the start of 'data' is above this
# chunk): lower-case each word and simplify its tag, skipping empty
# sentences, for the Floresta then Mac-Morpho Portuguese corpora.
data += [[(w.lower(), simplificarTag(t)) for (w, t) in sentenca]
         for sentenca in sentencas_floresta if sentenca]
sentencas_mac_morpho = nltk.corpus.mac_morpho.tagged_sents()
data += [[(w.lower(), simplificarTag(t)) for (w, t) in sentenca]
         for sentenca in sentencas_mac_morpho if sentenca]
# NOTE(review): train and test are the *same* data, so the printed
# accuracy measures memorization, not generalization — confirm intended.
base = data
teste = data
print('Treinando tagger. Isso pode demorar...')
# 4-gram -> trigram -> bigram -> unigram -> default-'n' backoff chain.
_tagger = nltk.NgramTagger(4, base,
                           backoff=nltk.TrigramTagger(
                               base,
                               backoff=nltk.BigramTagger(
                                   base,
                                   backoff=nltk.UnigramTagger(
                                       base,
                                       backoff=nltk.DefaultTagger('n')))))
print('Tagger treinado com sucesso! Precisão de %.1f!' %
      (_tagger.evaluate(teste) * 100))
# Persist the trained tagger (the except clause follows past this chunk).
try:
    print('Salvando tagger...')
    output = open(CAMINHO_DUMP, 'wb')
    dump(_tagger, output, -1)
    output.close()
def __init__(self, n, train_sents):
    """Train an n-gram tagger over (POS-tag, chunk-tag) sequences.

    Each training tree is flattened to CoNLL (word, pos, chunk) triples
    and the word is dropped, so the tagger learns chunk tags purely from
    POS-tag context.
    """
    tagged_corpus = []
    for sent in train_sents:
        triples = nltk.chunk.tree2conlltags(sent)
        tagged_corpus.append([(pos, chunk) for (_word, pos, chunk) in triples])
    self.tagger = nltk.NgramTagger(n, tagged_corpus)
        # Tail of convert_data (its def line and outer loop are above this
        # chunk): rebuild each sentence as a plain list of (word, tag) pairs.
        conv_sent = []
        for word, tag in sentence:
            conv_sent.append((word, tag))
        conv_data.append(conv_sent)
    return conv_data


# NOTE(review): '\C' and '\T' are not escape sequences, so this literal
# happens to work, but a raw string r'C:\CourseWork\Term5' would be safer.
os.chdir('C:\CourseWork\Term5')
train = json.load(open('train.txt'))
test = json.load(open('test.txt'))
train_data = convert_data(train)
test_data = convert_data(test)

# Backoff chain built bottom-up: each higher-order n-gram tagger falls
# back to the previous one, ending at the default 'NN' tagger.
default_tagger = nltk.DefaultTagger('NN')
unigram_tagger = nltk.NgramTagger(1, train_data, backoff=default_tagger)
unigram_tagger.evaluate(test_data)  # NOTE(review): result is discarded
bigram_tagger = nltk.NgramTagger(2, train_data, backoff=unigram_tagger)
bigram_tagger.evaluate(test_data)
trigram_tagger = nltk.NgramTagger(3, train_data, backoff=bigram_tagger)
trigram_tagger.evaluate(test_data)
fourgram_tagger = nltk.NgramTagger(4, train_data, backoff=trigram_tagger)
fourgram_tagger.evaluate(test_data)
fivegram_tagger = nltk.NgramTagger(5, train_data, backoff=fourgram_tagger)
fivegram_tagger.evaluate(test_data)
def main():
    """Evaluate every level of the backoff chain with both the standard
    accuracy and the project's MicroEvaluate, then per-tag precision and
    recall for the default tagger and the hardest tags.
    """
    # Monkey-patch the project-defined MicroEvaluate onto every tagger.
    nltk.TaggerI.MicroEvaluate = MicroEvaluate
    brown_news_tagged = brown.tagged_sents(categories='news')
    brown_train = brown_news_tagged[100:]
    brown_test = brown_news_tagged[:100]
    nn_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger(
        [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN')  # nouns (default)
        ],
        backoff=nn_tagger)
    at2 = nltk.AffixTagger(brown_train, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(brown_train, backoff=at2)
    ct2 = nltk.NgramTagger(2, brown_train, backoff=ut3)
    # Standard accuracy at every level of the chain.
    print "evaluate default nn = ", nn_tagger.evaluate(brown_test)
    print "evaluate regExp(default nn) = ", regexp_tagger.evaluate(brown_test)
    print "evaluate affix(regExp(default nn)) = ", at2.evaluate(brown_test)
    print "evaluate unigram(affix(regExp(default nn))) = ", ut3.evaluate(
        brown_test)
    print "evaluate bigram(unigram(affix(regExp(default nn)))) = ", ct2.evaluate(
        brown_test)
    print ""
    # Same chain, scored with the project's micro-averaged metric.
    print "micro-evaluate default nn = ", nn_tagger.MicroEvaluate(brown_test)
    print "micro-evaluate regExp(default nn) = ", regexp_tagger.MicroEvaluate(
        brown_test)
    print "micro-evaluate affix(regExp(default nn)) = ", at2.MicroEvaluate(
        brown_test)
    print "micro-evaluate unigram(affix(regExp(default nn))) = ", ut3.MicroEvaluate(
        brown_test)
    print "micro-evaluate bigram(unigram(affix(regExp(default nn)))) = ", ct2.MicroEvaluate(
        brown_test)
    print ""
    # Per-tag precision/recall of the default tagger: AT is never emitted
    # (precision/recall degenerate), NN is emitted for everything.
    print "default nn prec tag = AT => ", checkTaggerPrecForTag(
        nn_tagger, 'AT', brown_test)
    print "default nn recall tag = AT => ", checkTaggerRecallForTag(
        nn_tagger, 'AT', brown_test)
    print ""
    print "default nn prec tag = NN => ", checkTaggerPrecForTag(
        nn_tagger, 'NN', brown_test)
    print "default nn recall tag = NN => ", checkTaggerRecallForTag(
        nn_tagger, 'NN', brown_test)
    print ""
    # Hardest tags for the full bigram chain, in both tag sets.
    print "4 most difficult tags in simplified tagsSet - bigramTagger with all the backoffs:", checkSimplifiedDifficultTags(
        "BigramTagger", 4)
    print "4 most difficult tags in full tagsSet - bigramTagger with all the backoffs: ", checkFullDifficultTags(
        ct2, brown_test, 4)
    print ""
who served as the 44th President of the United States from January 20, 2009, to January 20, 2017. A member of the Democratic Party, he was the first African American to assume the presidency and previously served as a United States Senator from Illinois (2005–2008)."""

# Baseline: assign the corpus's most common tag to every token.
tokenized_barack = word_tokenize(barack)
default_tagger = DefaultTagger(most_common_tag)
def_tagged_barack = default_tagger.tag(tokenized_barack)
print(def_tagged_barack)

# Lookup Tagger
# Ngram tagger: n=2 means one previous tag of context per decision.
message = "the quick brown fox jumped over the lazy dog"
training_tag = pos_tag(word_tokenize(message))
print(training_tag)
# Training the ngram tagger on the single tagged sentence.
ngram_tagger = nltk.NgramTagger(n=2, train=[training_tag])
# Contexts unseen during training come back tagged None.
message2 = "the lazy dog jumped over the quick brown fox"
message2_tags = ngram_tagger.tag(word_tokenize(message2))
print(message2_tags)
print(list(nltk.ngrams(pos_tag(word_tokenize(message)), n=2)))

# Unigram tagger (the bush text continues past this chunk)
barack = """Barack Hussein Obama II born August 4, 1961) is an American politician who served as the 44th President of the United States from January 20, 2009, to January 20, 2017. A member of the Democratic Party, he was the first African American to assume the presidency and previously served as a United States Senator from Illinois (2005–2008)."""

bush = """George Walker Bush (born July 6, 1946) is an American politician who served as the 43rd President