def train_es_tagger(path):
    """Train a Spanish POS tagger on the cess_esp corpus and pickle it.

    Builds a Brill tagger on top of a bigram -> unigram -> affix ->
    default('NOUN') backoff chain, trained on cess_esp with the EAGLES
    tags mapped to the universal tagset.

    :param path: file path the trained tagger is pickled to.
    :return: the trained Brill tagger.
    """
    nltk.download('cess_esp')

    def convert_to_universal_tag(t):
        # cess_esp (EAGLES) tag -> universal tagset.
        # The lookup below happens AFTER t.lower(), so every key must be
        # lowercase. Fix: the punctuation keys ('Fa'..'Fz'), 'X'/'Y',
        # 'Zm'/'Zp' and the 'sn...-SUJ' function keys were originally
        # written with uppercase letters and therefore could never match.
        tagdict = {
            # punctuation
            'fa': '.', 'faa': '.', 'fat': '.', 'fc': '.', 'fd': '.',
            'fe': '.', 'fg': '.', 'fh': '.', 'fi': '.', 'fia': '.',
            'fit': '.', 'fp': '.', 'fpa': '.', 'fpt': '.', 'fs': '.',
            'fx': '.', 'fz': '.',
            # unknown / foreign words, numerals
            'x': 'X', 'y': 'X', 'zm': 'NUM', 'zp': 'NUM',
            # adjectives
            'ao': 'ADJ', 'ao0fp0': 'ADJ', 'ao0fs0': 'ADJ', 'ao0mp0': 'ADJ',
            'ao0ms0': 'ADJ', 'aq': 'ADJ', 'aq00000': 'ADJ', 'aq0cn0': 'ADJ',
            'aq0cp0': 'ADJ', 'aq0cs0': 'ADJ', 'aq0fp0': 'ADJ', 'aq0fpp': 'ADJ',
            'aq0fs0': 'ADJ', 'aq0fsp': 'ADJ', 'aq0mp0': 'ADJ', 'aq0mpp': 'ADJ',
            'aq0ms0': 'ADJ', 'aq0msp': 'ADJ',
            # conjunctions
            'cc': 'CONJ', 'cs': 'CONJ',
            # determiners
            'da': 'DET', 'da0fp0': 'DET', 'da0fs0': 'DET', 'da0mp0': 'DET',
            'da0ms0': 'DET', 'da0ns0': 'DET', 'dd': 'DET', 'dd0cp0': 'DET',
            'dd0cs0': 'DET', 'dd0fp0': 'DET', 'dd0fs0': 'DET', 'dd0mp0': 'DET',
            'dd0ms0': 'DET', 'de': 'DET', 'de0cn0': 'DET', 'di': 'DET',
            'di0cp0': 'DET', 'di0cs0': 'DET', 'di0fp0': 'DET', 'di0fs0': 'DET',
            'di0mp0': 'DET', 'di0ms0': 'DET', 'dn': 'DET', 'dn0cp0': 'DET',
            'dn0cs0': 'DET', 'dn0fp0': 'DET', 'dn0fs0': 'DET', 'dn0mp0': 'DET',
            'dn0ms0': 'DET', 'dp': 'DET', 'dp1cps': 'DET', 'dp1css': 'DET',
            'dp1fpp': 'DET', 'dp1fsp': 'DET', 'dp1mpp': 'DET', 'dp1msp': 'DET',
            'dp1mss': 'DET', 'dp2cps': 'DET', 'dp2css': 'DET', 'dp3cp0': 'DET',
            'dp3cs0': 'DET', 'dp3fs0': 'DET', 'dp3mp0': 'DET', 'dp3ms0': 'DET',
            'dt': 'DET', 'dt0cn0': 'DET', 'dt0fs0': 'DET', 'dt0ms0': 'DET',
            # interjections
            'i': 'X',
            # nouns (common and proper)
            'nc': 'NOUN', 'nc00000': 'NOUN', 'nccn000': 'NOUN',
            'nccp000': 'NOUN', 'nccs000': 'NOUN', 'ncfn000': 'NOUN',
            'ncfp000': 'NOUN', 'ncfs000': 'NOUN', 'ncmn000': 'NOUN',
            'ncmp000': 'NOUN', 'ncms000': 'NOUN', 'np': 'NOUN',
            'np00000': 'NOUN', 'np0000a': 'NOUN', 'np0000l': 'NOUN',
            'np0000o': 'NOUN', 'np0000p': 'NOUN',
            # pronouns
            'p0': 'PRON', 'p0000000': 'PRON', 'p010p000': 'PRON',
            'p010s000': 'PRON', 'p020s000': 'PRON', 'p0300000': 'PRON',
            'pd': 'PRON', 'pd0cp000': 'PRON', 'pd0cs000': 'PRON',
            'pd0fp000': 'PRON', 'pd0fs000': 'PRON', 'pd0mp000': 'PRON',
            'pd0ms000': 'PRON', 'pd0ns000': 'PRON', 'pe': 'PRON',
            'pe000000': 'PRON', 'pi': 'PRON', 'pi0cp000': 'PRON',
            'pi0cs000': 'PRON', 'pi0fp000': 'PRON', 'pi0fs000': 'PRON',
            'pi0mp000': 'PRON', 'pi0ms000': 'PRON', 'pn': 'PRON',
            'pn0cp000': 'PRON', 'pn0fp000': 'PRON', 'pn0fs000': 'PRON',
            'pn0mp000': 'PRON', 'pn0ms000': 'PRON', 'pp': 'PRON',
            'pp1cp000': 'PRON', 'pp1cs000': 'PRON', 'pp1csn00': 'PRON',
            'pp1cso00': 'PRON', 'pp1mp000': 'PRON', 'pp2cp000': 'PRON',
            'pp2cp00p': 'PRON', 'pp2cs000': 'PRON', 'pp2cs00p': 'PRON',
            'pp2csn00': 'PRON', 'pp2cso00': 'PRON', 'pp3cn000': 'PRON',
            'pp3cna00': 'PRON', 'pp3cno00': 'PRON', 'pp3cpa00': 'PRON',
            'pp3cpd00': 'PRON', 'pp3csa00': 'PRON', 'pp3csd00': 'PRON',
            'pp3fp000': 'PRON', 'pp3fpa00': 'PRON', 'pp3fs000': 'PRON',
            'pp3fsa00': 'PRON', 'pp3mp000': 'PRON', 'pp3mpa00': 'PRON',
            'pp3ms000': 'PRON', 'pp3msa00': 'PRON', 'pp3ns000': 'PRON',
            'pr': 'PRON', 'pr000000': 'PRON', 'pr0cn000': 'PRON',
            'pr0cp000': 'PRON', 'pr0cs000': 'PRON', 'pr0fp000': 'PRON',
            'pr0fs000': 'PRON', 'pr0mp000': 'PRON', 'pr0ms000': 'PRON',
            'pt': 'PRON', 'pt000000': 'PRON', 'pt0cp000': 'PRON',
            'pt0cs000': 'PRON', 'pt0mp000': 'PRON', 'pt0ms000': 'PRON',
            'px': 'PRON', 'px1fp0p0': 'PRON', 'px1fs0p0': 'PRON',
            'px1mp0p0': 'PRON', 'px1ms0p0': 'PRON', 'px2fs0s0': 'PRON',
            'px3fs000': 'PRON', 'px3mp000': 'PRON', 'px3ms000': 'PRON',
            'px3ns000': 'PRON',
            # adverbs
            'rg': 'ADV', 'rn': 'ADV',
            # adpositions (incl. syntactic-function variants, now lowercase)
            'sn': 'ADP', 'sn-suj': 'ADP', 'sn.co-suj': 'ADP', 'sn.e': 'ADP',
            'sn.e-atr': 'ADP', 'sn.e-cd': 'ADP', 'sn.e-suj': 'ADP',
            'sn.e.1n-suj': 'ADP', 'sp': 'ADP', 'spcms': 'ADP', 'sps00': 'ADP',
            # verbs: auxiliary ('va...')
            'va': 'VERB', 'vag0000': 'VERB', 'vaic1p0': 'VERB',
            'vaic3p0': 'VERB', 'vaic3s0': 'VERB', 'vaif1p0': 'VERB',
            'vaif2s0': 'VERB', 'vaif3p0': 'VERB', 'vaif3s0': 'VERB',
            'vaii1p0': 'VERB', 'vaii1s0': 'VERB', 'vaii2s0': 'VERB',
            'vaii3p0': 'VERB', 'vaii3s0': 'VERB', 'vaip1p0': 'VERB',
            'vaip1s0': 'VERB', 'vaip2p0': 'VERB', 'vaip2s0': 'VERB',
            'vaip3p0': 'VERB', 'vaip3s0': 'VERB', 'vais3s0': 'VERB',
            'vam02s0': 'VERB', 'vam03s0': 'VERB', 'van0000': 'VERB',
            'vap00sm': 'VERB', 'vasi1p0': 'VERB', 'vasi1s0': 'VERB',
            'vasi3p0': 'VERB', 'vasi3s0': 'VERB', 'vasp1s0': 'VERB',
            'vasp3p0': 'VERB', 'vasp3s0': 'VERB',
            # verbs: main ('vm...')
            'vm': 'VERB', 'vmg0000': 'VERB', 'vmic1p0': 'VERB',
            'vmic1s0': 'VERB', 'vmic2s0': 'VERB', 'vmic3p0': 'VERB',
            'vmic3s0': 'VERB', 'vmif1p0': 'VERB', 'vmif1s0': 'VERB',
            'vmif2s0': 'VERB', 'vmif3p0': 'VERB', 'vmif3s0': 'VERB',
            'vmii1p0': 'VERB', 'vmii1s0': 'VERB', 'vmii2p0': 'VERB',
            'vmii2s0': 'VERB', 'vmii3p0': 'VERB', 'vmii3s0': 'VERB',
            'vmip1p0': 'VERB', 'vmip1s0': 'VERB', 'vmip2p0': 'VERB',
            'vmip2s0': 'VERB', 'vmip3p0': 'VERB', 'vmip3s0': 'VERB',
            'vmis1p0': 'VERB', 'vmis1s0': 'VERB', 'vmis2s0': 'VERB',
            'vmis3p0': 'VERB', 'vmis3s0': 'VERB', 'vmm01p0': 'VERB',
            'vmm02s0': 'VERB', 'vmm03p0': 'VERB', 'vmm03s0': 'VERB',
            'vmn0000': 'VERB', 'vmp00pf': 'VERB', 'vmp00pm': 'VERB',
            'vmp00sf': 'VERB', 'vmp00sm': 'VERB', 'vmsi1p0': 'VERB',
            'vmsi1s0': 'VERB', 'vmsi3p0': 'VERB', 'vmsi3s0': 'VERB',
            'vmsp1p0': 'VERB', 'vmsp1s0': 'VERB', 'vmsp2p0': 'VERB',
            'vmsp2s0': 'VERB', 'vmsp3p0': 'VERB', 'vmsp3s0': 'VERB',
            # verbs: 'ser' ('vs...')
            'vs': 'VERB', 'vsg0000': 'VERB', 'vsic1s0': 'VERB',
            'vsic2s0': 'VERB', 'vsic3p0': 'VERB', 'vsic3s0': 'VERB',
            'vsif1s0': 'VERB', 'vsif3p0': 'VERB', 'vsif3s0': 'VERB',
            'vsii1p0': 'VERB', 'vsii1s0': 'VERB', 'vsii3p0': 'VERB',
            'vsii3s0': 'VERB', 'vsip1p0': 'VERB', 'vsip1s0': 'VERB',
            'vsip2s0': 'VERB', 'vsip3p0': 'VERB', 'vsip3s0': 'VERB',
            'vsis1s0': 'VERB', 'vsis3p0': 'VERB', 'vsis3s0': 'VERB',
            'vsm03s0': 'VERB', 'vsn0000': 'VERB', 'vsp00sm': 'VERB',
            'vssf3s0': 'VERB', 'vssi3p0': 'VERB', 'vssi3s0': 'VERB',
            'vssp1s0': 'VERB', 'vssp2s0': 'VERB', 'vssp3p0': 'VERB',
            'vssp3s0': 'VERB',
            # remaining single-letter classes
            'w': 'NOUN', 'z': 'NUM',
        }
        t = t.lower()
        # Unknown tags: collapse pure-punctuation tags to '.' and pass
        # everything else through unchanged.
        return tagdict.get(t, "." if all(tt in punctuation for tt in t) else t)

    # Retag the whole corpus with universal tags, then shuffle sentences.
    cess = [
        [(w, convert_to_universal_tag(t)) for (w, t) in sent]
        for sent in nltk.corpus.cess_esp.tagged_sents()
    ]
    shuffle(cess)

    # Backoff chain: bigram -> unigram -> affix -> default('NOUN').
    def_tagger = nltk.DefaultTagger('NOUN')
    affix_tagger = nltk.AffixTagger(cess, backoff=def_tagger)
    unitagger = nltk.UnigramTagger(cess, backoff=affix_tagger)
    tagger = nltk.BigramTagger(cess, backoff=unitagger)

    # Brill transformation-based refinement with the fntbl37 templates.
    tagger = nltk.BrillTaggerTrainer(tagger, nltk.brill.fntbl37())
    tagger = tagger.train(cess, max_rules=100)

    with open(path, "wb") as f:
        pickle.dump(tagger, f)
    return tagger
# NOTE(review): fragment — the opening of the `traindata`/`traindata2` list
# comprehensions (iterating a `dataset`/`dataset2` defined before this chunk)
# is missing from view; the stray bracket text below is their tail.
for (w, t) in sent] for sent in dataset2]
shuffle(traindata)
shuffle(traindata2)
# Regex overrides mapping Portuguese contracted prepositions
# (no/na, do/da, pelo/pela, neste/nesta, num, nesse/nessa, naquele/naquela,
# and '\xe0' = "à") to ADP.
regex_patterns = [
    (r"^[nN][ao]s?$", "ADP"),
    (r"^[dD][ao]s?$", "ADP"),
    (r"^[pP]el[ao]s?$", "ADP"),
    (r"^[nN]est[ae]s?$", "ADP"),
    (r"^[nN]um$", "ADP"),
    (r"^[nN]ess[ae]s?$", "ADP"),
    (r"^[nN]aquel[ae]s?$", "ADP"),
    (r"^\xe0$", "ADP"),
]
# Backoff chain: bigram(traindata) -> regexp -> unigram(traindata2)
# -> affix(traindata2) -> default('NOUN').
tagger = nltk.BigramTagger(traindata, backoff=nltk.RegexpTagger(
    regex_patterns, backoff=nltk.UnigramTagger(
        traindata2, backoff=nltk.AffixTagger(
            traindata2, backoff=nltk.DefaultTagger('NOUN')))))
# Brill transformation-based refinement, then pickle the final tagger.
templates = nltk.brill.fntbl37()
tagger = nltk.BrillTaggerTrainer(tagger, templates)
tagger = tagger.train(traindata, max_rules=100)
with open("tagger.pkl", "wb") as f:
    pickle.dump(tagger, f)
# Build (form, tag) pairs from a colon-separated dictionary file
# ("forma:lema:etiqueta" per line) and train a dictionary-backed tagger
# cascade. NOTE(review): `diccionario`, `tagged_words`, `tagged_sents` and
# `tagged_sents_per_unigrams` are defined before this chunk — confirm they
# start out as an open file / empty lists.
cont = 0
for linia in diccionario:
    # Commented-out development cap (stop after ~10k lines):
    #cont+=1
    #if cont==10000:
    #    break
    linia = linia.rstrip()
    camps = linia.split(":")
    if len(camps) >= 3:  # skip malformed lines
        forma = camps[0]
        lema = camps[1]  # lemma is parsed but not used below
        etiqueta = camps[2]
        tupla = (forma, etiqueta)
        tagged_words.append(tupla)
# The whole dictionary becomes a single training "sentence" — only unigram
# statistics are meaningful for it.
tagged_sents_per_unigrams.append(tagged_words)
# Backoff chain: trigram -> bigram -> unigram(corpus) ->
# unigram(dictionary) -> affix(3-char suffix) -> default("NP00000").
default_tagger = nltk.DefaultTagger("NP00000")
affix_tagger = nltk.AffixTagger(tagged_sents_per_unigrams, affix_length=-3,
                                min_stem_length=2, backoff=default_tagger)
unigram_tagger_diccionari = nltk.UnigramTagger(tagged_sents_per_unigrams,
                                               backoff=affix_tagger)
unigram_tagger = nltk.UnigramTagger(tagged_sents,
                                    backoff=unigram_tagger_diccionari)
bigram_tagger = nltk.BigramTagger(tagged_sents, backoff=unigram_tagger)
trigram_tagger = nltk.TrigramTagger(tagged_sents, backoff=bigram_tagger)
# Persist with the highest pickle protocol (-1).
sortida = open('etiquetador-spa.pkl', 'wb')
pickle.dump(trigram_tagger, sortida, -1)
sortida.close()
import codecs
import nltk

# Build a single training "sentence" of (form, tag) pairs from a Catalan
# dictionary file (space-separated "forma lema etiqueta" per line, UTF-8)
# and train a suffix-based AffixTagger from it.
tagged_words = []
tagged_sents = []
cont = 0
# Fix: use a context manager so the file handle is always closed
# (the original never closed `entrada`).
with codecs.open("diccionari-cat.txt", "r", encoding="utf-8") as entrada:
    for linia in entrada:
        cont += 1
        if cont == 10000:
            # Development cap: stop after ~10k lines (the 10,000th line
            # itself is not processed, as in the original).
            break
        linia = linia.rstrip()
        camps = linia.split(" ")
        # Robustness: skip malformed lines instead of raising IndexError
        # (same guard the Spanish-dictionary variant uses).
        if len(camps) < 3:
            continue
        forma = camps[0]
        etiqueta = camps[2]  # camps[1] is the lemma, unused here
        tagged_words.append((forma, etiqueta))
# The whole dictionary becomes one "sentence" — only unigram/affix
# statistics are meaningful for it.
tagged_sents.append(tagged_words)
# Suffix tagger: last 3 characters, stems of at least 2 characters.
affix_tagger = nltk.AffixTagger(tagged_sents, affix_length=-3,
                                min_stem_length=2)
# Python 2 snippet: times the training of successively higher-order taggers
# on `conll_train` and pickles each one. NOTE(review): `t1`,
# `unigram_tagger_2`, `conll_train`, `templates` and the imports
# (time, cPickle, nltk, FastBrillTaggerTrainer) are defined before this
# chunk. The pickles use binary protocol 2 but are opened in text mode
# ('w') — presumably fine on Unix, would corrupt on Windows; confirm.
t2 = time.time()
print "tagger.pickle: unigram size %d, AO-I-PIKCELER %.1f" % (
    unigram_tagger_2.size(), t2 - t1)
# Bigram tagger backed by the unigram tagger.
bigram_tagger = nltk.BigramTagger(conll_train, backoff=unigram_tagger_2)
cPickle.dump(bigram_tagger, file('bigram.pickle', 'w'), 2)
t22 = time.time()
print "AO-I-PIKCELER: bigram size %d, AO-I-PIKCELER %.1f" % (
    bigram_tagger.size(), t22 - t2)
# Trigram tagger backed by the bigram tagger.
trigram_tagger = nltk.TrigramTagger(conll_train, backoff=bigram_tagger)
cPickle.dump(trigram_tagger, file('trigram.pickle', 'w'), 2)
t23 = time.time()
print "AO-I-PIKCELER: trigram size %d, AO-I-PIKCELER %.1f" % (
    trigram_tagger.size(), t23 - t22)
# Stand-alone affix (3-char suffix) tagger with a default-NN fallback.
default_tagger = nltk.DefaultTagger('NN')
affix_tagger = nltk.AffixTagger(conll_train, affix_length=-3,
                                min_stem_length=2, backoff=default_tagger)
cPickle.dump(affix_tagger, file('affix.pickle', 'w'), 2)
t24 = time.time()
print "AO-I-PIKCELER: affix size %d, AO-I-PIKCELER %.1f" % (
    affix_tagger.size(), t24 - t23)
# Brill training seeded from the unigram tagger (only 10 rules).
trainer = FastBrillTaggerTrainer(initial_tagger=unigram_tagger_2,
                                 templates=templates, trace=3,
                                 deterministic=True)
t3 = time.time()
print "AO-I-PIKCELER: trainer AO-I-PIKCELER %.1f" % (t3 - t2)
tagger = trainer.train(conll_train, max_rules=10)
# Both the trainer object and the resulting tagger are pickled.
cPickle.dump(trainer, file('trainer.pickle', 'w'), 2)
cPickle.dump(tagger, file('tagger.pickle', 'w'), 2)
# Exercise snippet on the Brown corpus. NOTE(review): `brown_news_tagged`,
# `size`, `brown_tagged_sents` and the imports (nltk, brown) are defined
# before this chunk.
train_sents = brown_news_tagged[:size]
test_sents = brown_news_tagged[size:]
# Backoff chain: bigram -> unigram -> default('NN').
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
# NOTE(review): the accuracy returned here is discarded (nothing prints it).
t2.evaluate(test_sents)
# Confusion matrix: predicted tags on 'editorial' vs the gold annotations.
test_tags = [
    tag for sent in brown.sents(categories='editorial')
    for (word, tag) in t2.tag(sent)
]
gold_tags = [tag for (word, tag) in brown.tagged_words(categories='editorial')]
print(nltk.ConfusionMatrix(gold_tags, test_tags))
# Prefix tagger experiment: first 2 characters, no minimum stem length.
affix_tagger = nltk.AffixTagger(brown_tagged_sents, affix_length=2,
                                min_stem_length=0)
# NOTE(review): the tagged output is discarded (not printed or stored).
affix_tagger.tag(
    nltk.word_tokenize(
        'The Road to Emmaus appearance is one of the early resurrection appearances of Jesus after his crucifixion and the discovery of the empty tomb. Both the Meeting on the road to Emmaus and the subsequent Supper at Emmaus, depicting the meal that Jesus had with two disciples after the encounter on the road, have been popular subjects in art.'
    ))
# Unrelated str.format(**mapping) demo.
date_dict = {'day': '22', 'month': 'April', 'month_num': '04', 'year': '2017'}
print('{day} of {month}, {year}.'.format(**date_dict))
print('{month_num}/{day}/{year}'.format(**date_dict))
# Frequency of (word, tag) pairs whose tag is NN or NNS in 'news'.
brown_tagged_words = brown.tagged_words(categories='news')
relevant_pairs = [
    pair for pair in brown_tagged_words if pair[1] in ['NN', 'NNS']
]
fdist = nltk.FreqDist(relevant_pairs)
# NOTE(review): fragment — the opening of `regex_patterns` (entries for
# no/na, do/da, pelo/pela, like the sibling snippet) is missing from this
# chunk; the lines below are the tail of that list literal.
    (r"^[nN]est[ae]s?$", "ADP"),
    (r"^[nN]um$", "ADP"),
    (r"^[nN]ess[ae]s?$", "ADP"),
    (r"^[nN]aquel[ae]s?$", "ADP"),
    (r"^\xe0$", "ADP"),
]
# Deep backoff cascade: regexp -> 10-gram -> 9-gram -> ... -> 2-gram ->
# unigram -> affix (last 4 chars) -> default("NOUN"), all trained on
# `traindata` (defined before this chunk).
tagger = nltk.RegexpTagger(regex_patterns,
    backoff = nltk.NgramTagger(10, traindata,
    backoff = nltk.NgramTagger(9, traindata,
    backoff = nltk.NgramTagger(8, traindata,
    backoff = nltk.NgramTagger(7, traindata,
    backoff = nltk.NgramTagger(6, traindata,
    backoff = nltk.NgramTagger(5, traindata,
    backoff = nltk.NgramTagger(4, traindata,
    backoff = nltk.NgramTagger(3, traindata,
    backoff = nltk.NgramTagger(2, traindata,
    backoff=nltk.UnigramTagger(traindata,
    backoff=nltk.AffixTagger(traindata, affix_length=-4,
    backoff=nltk.DefaultTagger("NOUN")
    ))))))))))))
# Brill transformation-based refinement, then pickle the final tagger.
templates = nltk.brill.fntbl37()
tagger = nltk.BrillTaggerTrainer(tagger, templates)
tagger = tagger.train(traindata, max_rules=100)
with open("tagger_2.pkl", "wb") as f:
    pickle.dump(tagger, f)
def main(): #ploting the distribution graph # getDistSentByLength() ############################################################# #cycle of training-testing First case - Random split 90%-10%# ############################################################# train, test = stratifiedSamples([getAllTaggedCorpus()], 10) nn_tagger = nltk.DefaultTagger('NN') regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ],backoff=nn_tagger) at2 = nltk.AffixTagger(train, backoff=regexp_tagger) ut3 = nltk.UnigramTagger(train, backoff=at2) ct2 = nltk.NgramTagger(2, train, backoff=ut3) print "evaluate bigram(unigram(affix(regExp(default nn)))) Random Split= " ,ct2.evaluate(test) ############################################################################################### #cycle of training-testing second case - Stratified split 90%-10% according to sentence length# ############################################################################################### classes = divideToLengthClasses() train, test = stratifiedSamples(classes, 10) nn_tagger = nltk.DefaultTagger('NN') regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ],backoff=nn_tagger) at2 = nltk.AffixTagger(train, backoff=regexp_tagger) ut3 = nltk.UnigramTagger(train, backoff=at2) ct2 = nltk.NgramTagger(2, train, backoff=ut3) print "evaluate bigram(unigram(affix(regExp(default nn)))) Length 
split = " ,ct2.evaluate(test) ################################################################################################# #cycle of training-testing Third case - Stratified split 90%-10% according to the sentence genre# ################################################################################################# classes = divideToGenereClasses() train, test = stratifiedSamples(classes, 10) nn_tagger = nltk.DefaultTagger('NN') regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ],backoff=nn_tagger) at2 = nltk.AffixTagger(train, backoff=regexp_tagger) ut3 = nltk.UnigramTagger(train, backoff=at2) ct2 = nltk.NgramTagger(2, train, backoff=ut3) print "evaluate bigram(unigram(affix(regExp(default nn)))) Genere split = " ,ct2.evaluate(test)
# ☼ Exercise: learn about the affix tagger (help(nltk.AffixTagger)).
# Train an affix tagger on Brown 'news' and run it over fresh Gutenberg
# text; experiment with affix_length and min_stem_length settings and
# discuss the findings.
import nltk
from nltk.corpus import brown, gutenberg

# Training material (tagged Brown news) and some unseen text to tag.
brown_sents = brown.sents(categories='news')
brown_tagged_sents = brown.tagged_sents(categories='news')
text = gutenberg.words('austen-persuasion.txt')

# affix_length=1 keys on a 1-character prefix; words whose stem would be
# shorter than min_stem_length=3 are left for the (absent) backoff.
affix_tagger = nltk.AffixTagger(train=brown_tagged_sents, affix_length=1,
                                min_stem_length=3)
print(affix_tagger.tag(text))
def train_pt_tagger(path):
    """Train a Portuguese POS tagger on floresta + mac_morpho and pickle it.

    Builds a Brill tagger on top of a bigram -> regexp -> unigram ->
    affix -> default('NOUN') backoff chain, with corpus tags mapped to
    the universal tagset, and pickles the result to `path`.
    """
    nltk.download('mac_morpho')
    nltk.download('floresta')

    def convert_to_universal_tag(t, reverse=False):
        # Floresta / Mac-Morpho tag -> universal tagset (keys are already
        # lowercase, matching the t.lower() lookup below).
        tagdict = {
            'n': "NOUN", 'num': "NUM", 'v-fin': "VERB", 'v-inf': "VERB",
            'v-ger': "VERB", 'v-pcp': "VERB", 'pron-det': "PRON",
            'pron-indp': "PRON", 'pron-pers': "PRON", 'art': "DET",
            'adv': "ADV", 'conj-s': "CONJ", 'conj-c': "CONJ",
            'conj-p': "CONJ", 'adj': "ADJ", 'ec': "PRT", 'pp': "ADP",
            'prp': "ADP", 'prop': "NOUN", 'pro-ks-rel': "PRON",
            'proadj': "PRON", 'prep': "ADP", 'nprop': "NOUN",
            'vaux': "VERB", 'propess': "PRON", 'v': "VERB", 'vp': "VERB",
            'in': "X", 'prp-': "ADP", 'adv-ks': "ADV", 'dad': "NUM",
            'prosub': "PRON", 'tel': "NUM", 'ap': "NUM", 'est': "NOUN",
            'cur': "X", 'pcp': "VERB", 'pro-ks': "PRON", 'hor': "NUM",
            'pden': "ADV", 'dat': "NUM", 'kc': "ADP", 'ks': "ADP",
            'adv-ks-rel': "ADV", 'npro': "NOUN",
        }
        # Numeric noun subtypes collapse straight to NUM.
        if t in ["N|AP", "N|DAD", "N|DAT", "N|HOR", "N|TEL"]:
            t = "NUM"
        if reverse:
            # Mac-Morpho style: keep the part before '|'.
            if "|" in t:
                t = t.split("|")[0]
        else:
            # Floresta style: strip contraction ('+'), keep the part after
            # '|', and drop any '#'-suffix.
            if "+" in t:
                t = t.split("+")[1]
            if "|" in t:
                t = t.split("|")[1]
            if "#" in t:
                t = t.split("#")[0]
        t = t.lower()
        # Unknown tags: pure-punctuation tags become '.', the rest pass
        # through unchanged.
        return tagdict.get(t, "."
                           if all(tt in punctuation for tt in t) else t)

    # Floresta sentences with universal tags, shuffled.
    floresta = [[(w, convert_to_universal_tag(t)) for (w, t) in sent]
                for sent in nltk.corpus.floresta.tagged_sents()]
    shuffle(floresta)
    # NOTE(review): `w[0]` keeps only the first element of each item in a
    # paragraph from tagged_paras() — this looks like it retains one
    # (word, tag) pair per sentence rather than flattening the paragraph;
    # confirm against nltk.corpus.mac_morpho.tagged_paras() structure.
    mac_morpho = [[w[0] for w in sent]
                  for sent in nltk.corpus.mac_morpho.tagged_paras()]
    mac_morpho = [[(w, convert_to_universal_tag(t, reverse=True))
                   for (w, t) in sent] for sent in mac_morpho]
    shuffle(mac_morpho)
    # Regex overrides mapping Portuguese contracted prepositions
    # (no/na, do/da, pelo/pela, ..., '\xe0' = "à") to ADP.
    regex_patterns = [
        (r"^[nN][ao]s?$", "ADP"),
        (r"^[dD][ao]s?$", "ADP"),
        (r"^[pP]el[ao]s?$", "ADP"),
        (r"^[nN]est[ae]s?$", "ADP"),
        (r"^[nN]um$", "ADP"),
        (r"^[nN]ess[ae]s?$", "ADP"),
        (r"^[nN]aquel[ae]s?$", "ADP"),
        (r"^\xe0$", "ADP"),
    ]
    # Backoff chain: bigram(floresta) -> regexp -> unigram(both corpora)
    # -> affix(both corpora) -> default('NOUN').
    def_tagger = nltk.DefaultTagger('NOUN')
    affix_tagger = nltk.AffixTagger(mac_morpho + floresta,
                                    backoff=def_tagger)
    unitagger = nltk.UnigramTagger(mac_morpho + floresta,
                                   backoff=affix_tagger)
    rx_tagger = nltk.RegexpTagger(regex_patterns, backoff=unitagger)
    tagger = nltk.BigramTagger(floresta, backoff=rx_tagger)
    # Brill transformation-based refinement with the fntbl37 templates.
    tagger = nltk.BrillTaggerTrainer(tagger, nltk.brill.fntbl37())
    tagger = tagger.train(floresta, max_rules=100)
    with open(path, "wb") as f:
        pickle.dump(tagger, f)
    return tagger
# -*- coding: utf-8 -*-
# 11, 13, 15, 17, 27, 34, 36, 39, 40, 43

##################
# 11 - Exercise: learn about the affix tagger (help(nltk.AffixTagger)).
# Train a suffix tagger on one Brown section, score it on another, and
# experiment with the affix length / minimum word length settings.
##################
import nltk
from nltk.corpus import brown

# Train on 'news', evaluate against gold-tagged 'hobbies'.
tagged = brown.tagged_sents(categories='news')
gold = brown.tagged_sents(categories='hobbies')
words = brown.words(categories='hobbies')

# Suffix tagger: last 3 characters, minimum stem length of 1.
suffix_tagger = nltk.AffixTagger(tagged, affix_length=-3, min_stem_length=1)
accuracy = suffix_tagger.evaluate(gold)
print(accuracy)