Example #1
import pickle
from random import shuffle
from string import punctuation

import nltk


def train_es_tagger(path):
    nltk.download('cess_esp')

    def convert_to_universal_tag(t):
        tagdict = {
            'Fa': '.',
            'Faa': '.',
            'Fat': '.',
            'Fc': '.',
            'Fd': '.',
            'Fe': '.',
            'Fg': '.',
            'Fh': '.',
            'Fi': '.',
            'Fia': '.',
            'Fit': '.',
            'Fp': '.',
            'Fpa': '.',
            'Fpt': '.',
            'Fs': '.',
            'Fx': '.',
            'Fz': '.',
            'X': 'X',
            'Y': 'X',
            'Zm': 'NUM',
            'Zp': 'NUM',
            'ao': 'ADJ',
            'ao0fp0': 'ADJ',
            'ao0fs0': 'ADJ',
            'ao0mp0': 'ADJ',
            'ao0ms0': 'ADJ',
            'aq': 'ADJ',
            'aq00000': 'ADJ',
            'aq0cn0': 'ADJ',
            'aq0cp0': 'ADJ',
            'aq0cs0': 'ADJ',
            'aq0fp0': 'ADJ',
            'aq0fpp': 'ADJ',
            'aq0fs0': 'ADJ',
            'aq0fsp': 'ADJ',
            'aq0mp0': 'ADJ',
            'aq0mpp': 'ADJ',
            'aq0ms0': 'ADJ',
            'aq0msp': 'ADJ',
            'cc': 'CONJ',
            'cs': 'CONJ',
            'da': 'DET',
            'da0fp0': 'DET',
            'da0fs0': 'DET',
            'da0mp0': 'DET',
            'da0ms0': 'DET',
            'da0ns0': 'DET',
            'dd': 'DET',
            'dd0cp0': 'DET',
            'dd0cs0': 'DET',
            'dd0fp0': 'DET',
            'dd0fs0': 'DET',
            'dd0mp0': 'DET',
            'dd0ms0': 'DET',
            'de': 'DET',
            'de0cn0': 'DET',
            'di': 'DET',
            'di0cp0': 'DET',
            'di0cs0': 'DET',
            'di0fp0': 'DET',
            'di0fs0': 'DET',
            'di0mp0': 'DET',
            'di0ms0': 'DET',
            'dn': 'DET',
            'dn0cp0': 'DET',
            'dn0cs0': 'DET',
            'dn0fp0': 'DET',
            'dn0fs0': 'DET',
            'dn0mp0': 'DET',
            'dn0ms0': 'DET',
            'dp': 'DET',
            'dp1cps': 'DET',
            'dp1css': 'DET',
            'dp1fpp': 'DET',
            'dp1fsp': 'DET',
            'dp1mpp': 'DET',
            'dp1msp': 'DET',
            'dp1mss': 'DET',
            'dp2cps': 'DET',
            'dp2css': 'DET',
            'dp3cp0': 'DET',
            'dp3cs0': 'DET',
            'dp3fs0': 'DET',
            'dp3mp0': 'DET',
            'dp3ms0': 'DET',
            'dt': 'DET',
            'dt0cn0': 'DET',
            'dt0fs0': 'DET',
            'dt0ms0': 'DET',
            'i': 'X',
            'nc': 'NOUN',
            'nc00000': 'NOUN',
            'nccn000': 'NOUN',
            'nccp000': 'NOUN',
            'nccs000': 'NOUN',
            'ncfn000': 'NOUN',
            'ncfp000': 'NOUN',
            'ncfs000': 'NOUN',
            'ncmn000': 'NOUN',
            'ncmp000': 'NOUN',
            'ncms000': 'NOUN',
            'np': 'NOUN',
            'np00000': 'NOUN',
            'np0000a': 'NOUN',
            'np0000l': 'NOUN',
            'np0000o': 'NOUN',
            'np0000p': 'NOUN',
            'p0': 'PRON',
            'p0000000': 'PRON',
            'p010p000': 'PRON',
            'p010s000': 'PRON',
            'p020s000': 'PRON',
            'p0300000': 'PRON',
            'pd': 'PRON',
            'pd0cp000': 'PRON',
            'pd0cs000': 'PRON',
            'pd0fp000': 'PRON',
            'pd0fs000': 'PRON',
            'pd0mp000': 'PRON',
            'pd0ms000': 'PRON',
            'pd0ns000': 'PRON',
            'pe': 'PRON',
            'pe000000': 'PRON',
            'pi': 'PRON',
            'pi0cp000': 'PRON',
            'pi0cs000': 'PRON',
            'pi0fp000': 'PRON',
            'pi0fs000': 'PRON',
            'pi0mp000': 'PRON',
            'pi0ms000': 'PRON',
            'pn': 'PRON',
            'pn0cp000': 'PRON',
            'pn0fp000': 'PRON',
            'pn0fs000': 'PRON',
            'pn0mp000': 'PRON',
            'pn0ms000': 'PRON',
            'pp': 'PRON',
            'pp1cp000': 'PRON',
            'pp1cs000': 'PRON',
            'pp1csn00': 'PRON',
            'pp1cso00': 'PRON',
            'pp1mp000': 'PRON',
            'pp2cp000': 'PRON',
            'pp2cp00p': 'PRON',
            'pp2cs000': 'PRON',
            'pp2cs00p': 'PRON',
            'pp2csn00': 'PRON',
            'pp2cso00': 'PRON',
            'pp3cn000': 'PRON',
            'pp3cna00': 'PRON',
            'pp3cno00': 'PRON',
            'pp3cpa00': 'PRON',
            'pp3cpd00': 'PRON',
            'pp3csa00': 'PRON',
            'pp3csd00': 'PRON',
            'pp3fp000': 'PRON',
            'pp3fpa00': 'PRON',
            'pp3fs000': 'PRON',
            'pp3fsa00': 'PRON',
            'pp3mp000': 'PRON',
            'pp3mpa00': 'PRON',
            'pp3ms000': 'PRON',
            'pp3msa00': 'PRON',
            'pp3ns000': 'PRON',
            'pr': 'PRON',
            'pr000000': 'PRON',
            'pr0cn000': 'PRON',
            'pr0cp000': 'PRON',
            'pr0cs000': 'PRON',
            'pr0fp000': 'PRON',
            'pr0fs000': 'PRON',
            'pr0mp000': 'PRON',
            'pr0ms000': 'PRON',
            'pt': 'PRON',
            'pt000000': 'PRON',
            'pt0cp000': 'PRON',
            'pt0cs000': 'PRON',
            'pt0mp000': 'PRON',
            'pt0ms000': 'PRON',
            'px': 'PRON',
            'px1fp0p0': 'PRON',
            'px1fs0p0': 'PRON',
            'px1mp0p0': 'PRON',
            'px1ms0p0': 'PRON',
            'px2fs0s0': 'PRON',
            'px3fs000': 'PRON',
            'px3mp000': 'PRON',
            'px3ms000': 'PRON',
            'px3ns000': 'PRON',
            'rg': 'ADV',
            'rn': 'ADV',
            'sn': 'ADP',
            'sn-SUJ': 'ADP',
            'sn.co-SUJ': 'ADP',
            'sn.e': 'ADP',
            'sn.e-ATR': 'ADP',
            'sn.e-CD': 'ADP',
            'sn.e-SUJ': 'ADP',
            'sn.e.1n-SUJ': 'ADP',
            'sp': 'ADP',
            'spcms': 'ADP',
            'sps00': 'ADP',
            'va': 'VERB',
            'vag0000': 'VERB',
            'vaic1p0': 'VERB',
            'vaic3p0': 'VERB',
            'vaic3s0': 'VERB',
            'vaif1p0': 'VERB',
            'vaif2s0': 'VERB',
            'vaif3p0': 'VERB',
            'vaif3s0': 'VERB',
            'vaii1p0': 'VERB',
            'vaii1s0': 'VERB',
            'vaii2s0': 'VERB',
            'vaii3p0': 'VERB',
            'vaii3s0': 'VERB',
            'vaip1p0': 'VERB',
            'vaip1s0': 'VERB',
            'vaip2p0': 'VERB',
            'vaip2s0': 'VERB',
            'vaip3p0': 'VERB',
            'vaip3s0': 'VERB',
            'vais3s0': 'VERB',
            'vam02s0': 'VERB',
            'vam03s0': 'VERB',
            'van0000': 'VERB',
            'vap00sm': 'VERB',
            'vasi1p0': 'VERB',
            'vasi1s0': 'VERB',
            'vasi3p0': 'VERB',
            'vasi3s0': 'VERB',
            'vasp1s0': 'VERB',
            'vasp3p0': 'VERB',
            'vasp3s0': 'VERB',
            'vm': 'VERB',
            'vmg0000': 'VERB',
            'vmic1p0': 'VERB',
            'vmic1s0': 'VERB',
            'vmic2s0': 'VERB',
            'vmic3p0': 'VERB',
            'vmic3s0': 'VERB',
            'vmif1p0': 'VERB',
            'vmif1s0': 'VERB',
            'vmif2s0': 'VERB',
            'vmif3p0': 'VERB',
            'vmif3s0': 'VERB',
            'vmii1p0': 'VERB',
            'vmii1s0': 'VERB',
            'vmii2p0': 'VERB',
            'vmii2s0': 'VERB',
            'vmii3p0': 'VERB',
            'vmii3s0': 'VERB',
            'vmip1p0': 'VERB',
            'vmip1s0': 'VERB',
            'vmip2p0': 'VERB',
            'vmip2s0': 'VERB',
            'vmip3p0': 'VERB',
            'vmip3s0': 'VERB',
            'vmis1p0': 'VERB',
            'vmis1s0': 'VERB',
            'vmis2s0': 'VERB',
            'vmis3p0': 'VERB',
            'vmis3s0': 'VERB',
            'vmm01p0': 'VERB',
            'vmm02s0': 'VERB',
            'vmm03p0': 'VERB',
            'vmm03s0': 'VERB',
            'vmn0000': 'VERB',
            'vmp00pf': 'VERB',
            'vmp00pm': 'VERB',
            'vmp00sf': 'VERB',
            'vmp00sm': 'VERB',
            'vmsi1p0': 'VERB',
            'vmsi1s0': 'VERB',
            'vmsi3p0': 'VERB',
            'vmsi3s0': 'VERB',
            'vmsp1p0': 'VERB',
            'vmsp1s0': 'VERB',
            'vmsp2p0': 'VERB',
            'vmsp2s0': 'VERB',
            'vmsp3p0': 'VERB',
            'vmsp3s0': 'VERB',
            'vs': 'VERB',
            'vsg0000': 'VERB',
            'vsic1s0': 'VERB',
            'vsic2s0': 'VERB',
            'vsic3p0': 'VERB',
            'vsic3s0': 'VERB',
            'vsif1s0': 'VERB',
            'vsif3p0': 'VERB',
            'vsif3s0': 'VERB',
            'vsii1p0': 'VERB',
            'vsii1s0': 'VERB',
            'vsii3p0': 'VERB',
            'vsii3s0': 'VERB',
            'vsip1p0': 'VERB',
            'vsip1s0': 'VERB',
            'vsip2s0': 'VERB',
            'vsip3p0': 'VERB',
            'vsip3s0': 'VERB',
            'vsis1s0': 'VERB',
            'vsis3p0': 'VERB',
            'vsis3s0': 'VERB',
            'vsm03s0': 'VERB',
            'vsn0000': 'VERB',
            'vsp00sm': 'VERB',
            'vssf3s0': 'VERB',
            'vssi3p0': 'VERB',
            'vssi3s0': 'VERB',
            'vssp1s0': 'VERB',
            'vssp2s0': 'VERB',
            'vssp3p0': 'VERB',
            'vssp3s0': 'VERB',
            'w': 'NOUN',
            'z': 'NUM'
        }
        t = t.lower()
        return tagdict.get(t, "." if all(tt in punctuation for tt in t) else t)

    cess = [[(w, convert_to_universal_tag(t)) for (w, t) in sent]
            for sent in nltk.corpus.cess_esp.tagged_sents()]
    shuffle(cess)
    def_tagger = nltk.DefaultTagger('NOUN')
    affix_tagger = nltk.AffixTagger(cess, backoff=def_tagger)
    unitagger = nltk.UnigramTagger(cess, backoff=affix_tagger)
    tagger = nltk.BigramTagger(cess, backoff=unitagger)
    tagger = nltk.BrillTaggerTrainer(tagger, nltk.brill.fntbl37())
    tagger = tagger.train(cess, max_rules=100)

    with open(path, "wb") as f:
        pickle.dump(tagger, f)

    return tagger
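A minimal usage sketch (not part of the original snippet; the path and sentence are illustrative):

tagger = train_es_tagger("es_tagger.pkl")
print(tagger.tag("El perro duerme en la casa".split()))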
Example #2
traindata2 = [[(w, convert_to_universal_tag(t))
               for (w, t) in sent] for sent in dataset2]

shuffle(traindata)
shuffle(traindata2)

regex_patterns = [
    (r"^[nN][ao]s?$", "ADP"),
    (r"^[dD][ao]s?$", "ADP"),
    (r"^[pP]el[ao]s?$", "ADP"),
    (r"^[nN]est[ae]s?$", "ADP"),
    (r"^[nN]um$", "ADP"),
    (r"^[nN]ess[ae]s?$", "ADP"),
    (r"^[nN]aquel[ae]s?$", "ADP"),
    (r"^\xe0$", "ADP"),
]

tagger = nltk.BigramTagger(traindata,
                           backoff=nltk.RegexpTagger(
                               regex_patterns,
                               backoff=nltk.UnigramTagger(
                                   traindata2,
                                   backoff=nltk.AffixTagger(
                                       traindata2,
                                       backoff=nltk.DefaultTagger('NOUN')))))
templates = nltk.brill.fntbl37()
tagger = nltk.BrillTaggerTrainer(tagger, templates)
tagger = tagger.train(traindata, max_rules=100)

with open("tagger.pkl", "wb") as f:
    pickle.dump(tagger, f)
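A hedged evaluation sketch (not in the original); it assumes a slice of traindata was held out before training:

held_out = traindata[int(len(traindata) * 0.9):]
print(tagger.evaluate(held_out))  # evaluate() is the older NLTK name; newer releases call it accuracy()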
Example #3
cont = 0
for linia in diccionario:
    #cont+=1
    #if cont==10000:
    #    break
    linia = linia.rstrip()
    camps = linia.split(":")
    if len(camps) >= 3:
        forma = camps[0]
        lema = camps[1]
        etiqueta = camps[2]
        tupla = (forma, etiqueta)
        tagged_words.append(tupla)
tagged_sents_per_unigrams.append(tagged_words)

default_tagger = nltk.DefaultTagger("NP00000")
affix_tagger = nltk.AffixTagger(tagged_sents_per_unigrams,
                                affix_length=-3,
                                min_stem_length=2,
                                backoff=default_tagger)
unigram_tagger_diccionari = nltk.UnigramTagger(tagged_sents_per_unigrams,
                                               backoff=affix_tagger)
unigram_tagger = nltk.UnigramTagger(tagged_sents,
                                    backoff=unigram_tagger_diccionari)
bigram_tagger = nltk.BigramTagger(tagged_sents, backoff=unigram_tagger)
trigram_tagger = nltk.TrigramTagger(tagged_sents, backoff=bigram_tagger)

with open('etiquetador-spa.pkl', 'wb') as sortida:
    pickle.dump(trigram_tagger, sortida, -1)
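Loading the pickled tagger back (a sketch, not in the original; the sentence is illustrative):

with open('etiquetador-spa.pkl', 'rb') as entrada:
    etiquetador = pickle.load(entrada)
print(etiquetador.tag("El gato come pescado".split()))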
Example #4
import codecs
import nltk

entrada = codecs.open("diccionari-cat.txt", "r", encoding="utf-8")

tagged_words = []
tagged_sents = []
cont = 0
for linia in entrada:
    cont += 1
    if cont == 10000:
        break
    linia = linia.rstrip()
    camps = linia.split(" ")
    forma = camps[0]
    lema = camps[1]
    etiqueta = camps[2]
    tupla = (forma, etiqueta)
    tagged_words.append(tupla)
tagged_sents.append(tagged_words)

affix_tagger = nltk.AffixTagger(tagged_sents, affix_length=-3, min_stem_length=2)
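A quick check (not in the original): the tagger keys only on the last three characters, so words with suffixes seen in the dictionary still get a guess, while unseen suffixes come back as None since no backoff is set (the words are illustrative):

print(affix_tagger.tag(["cantaria", "jugador", "felicitat"]))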
Example #5
t2 = time.time()
print("tagger.pickle: unigram size %d, elapsed %.1f" % (
    unigram_tagger_2.size(), t2 - t1))
bigram_tagger = nltk.BigramTagger(conll_train, backoff=unigram_tagger_2)
with open('bigram.pickle', 'wb') as f:
    pickle.dump(bigram_tagger, f, 2)
t22 = time.time()
print("bigram.pickle: bigram size %d, elapsed %.1f" % (
    bigram_tagger.size(), t22 - t2))
trigram_tagger = nltk.TrigramTagger(conll_train, backoff=bigram_tagger)
with open('trigram.pickle', 'wb') as f:
    pickle.dump(trigram_tagger, f, 2)
t23 = time.time()
print("trigram.pickle: trigram size %d, elapsed %.1f" % (
    trigram_tagger.size(), t23 - t22))
default_tagger = nltk.DefaultTagger('NN')
affix_tagger = nltk.AffixTagger(conll_train,
                                affix_length=-3,
                                min_stem_length=2,
                                backoff=default_tagger)
with open('affix.pickle', 'wb') as f:
    pickle.dump(affix_tagger, f, 2)
t24 = time.time()
print("affix.pickle: affix size %d, elapsed %.1f" % (
    affix_tagger.size(), t24 - t23))

# FastBrillTaggerTrainer comes from older NLTK releases (nltk.tag.brill);
# newer releases call it BrillTaggerTrainer.
trainer = FastBrillTaggerTrainer(initial_tagger=unigram_tagger_2,
                                 templates=templates,
                                 trace=3,
                                 deterministic=True)
t3 = time.time()
print("trainer setup: elapsed %.1f" % (t3 - t2))
tagger = trainer.train(conll_train, max_rules=10)
with open('trainer.pickle', 'wb') as f:
    pickle.dump(trainer, f, 2)
with open('tagger.pickle', 'wb') as f:
    pickle.dump(tagger, f, 2)
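Reading one of the pickles back (a sketch, not in the original; it assumes conll_train is English CoNLL data, as the variable name suggests):

with open('tagger.pickle', 'rb') as f:
    tagger = pickle.load(f)
print(tagger.tag(['The', 'cat', 'sat', 'on', 'the', 'mat']))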
Example #6
import nltk
from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_news_tagged = brown_tagged_sents
size = int(len(brown_news_tagged) * 0.9)
train_sents = brown_news_tagged[:size]
test_sents = brown_news_tagged[size:]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
print(t2.evaluate(test_sents))

test_tags = [
    tag for sent in brown.sents(categories='editorial')
    for (word, tag) in t2.tag(sent)
]
gold_tags = [tag for (word, tag) in brown.tagged_words(categories='editorial')]
print(nltk.ConfusionMatrix(gold_tags, test_tags))

affix_tagger = nltk.AffixTagger(brown_tagged_sents,
                                affix_length=2,
                                min_stem_length=0)
print(affix_tagger.tag(
    nltk.word_tokenize(
        'The Road to Emmaus appearance is one of the early resurrection appearances of Jesus after his crucifixion and the discovery of the empty tomb. Both the Meeting on the road to Emmaus and the subsequent Supper at Emmaus, depicting the meal that Jesus had with two disciples after the encounter on the road, have been popular subjects in art.'
    )))

date_dict = {'day': '22', 'month': 'April', 'month_num': '04', 'year': '2017'}
print('{day} of {month}, {year}.'.format(**date_dict))
print('{month_num}/{day}/{year}'.format(**date_dict))

brown_tagged_words = brown.tagged_words(categories='news')
relevant_pairs = [
    pair for pair in brown_tagged_words if pair[1] in ['NN', 'NNS']
]
fdist = nltk.FreqDist(relevant_pairs)
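To inspect the resulting distribution (an illustrative follow-up, not in the original):

print(fdist.most_common(10))  # the ten most frequent (word, tag) noun pairs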
Example #7
    (r"^[nN]est[ae]s?$", "ADP"),
    (r"^[nN]um$", "ADP"),
    (r"^[nN]ess[ae]s?$", "ADP"),
    (r"^[nN]aquel[ae]s?$", "ADP"),
    (r"^\xe0$", "ADP"),
]


tagger = nltk.RegexpTagger(regex_patterns,
        backoff=nltk.NgramTagger(10, traindata,
        backoff=nltk.NgramTagger(9, traindata,
        backoff=nltk.NgramTagger(8, traindata,
        backoff=nltk.NgramTagger(7, traindata,
        backoff=nltk.NgramTagger(6, traindata,
        backoff=nltk.NgramTagger(5, traindata,
        backoff=nltk.NgramTagger(4, traindata,
        backoff=nltk.NgramTagger(3, traindata,
        backoff=nltk.NgramTagger(2, traindata,
        backoff=nltk.UnigramTagger(traindata,
        backoff=nltk.AffixTagger(traindata, affix_length=-4,
        backoff=nltk.DefaultTagger("NOUN")
        ))))))))))))

templates = nltk.brill.fntbl37()
tagger = nltk.BrillTaggerTrainer(tagger, templates)
tagger = tagger.train(traindata, max_rules=100)

with open("tagger_2.pkl", "wb") as f:
    pickle.dump(tagger, f)
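A possible refactor (not from the original source): the hand-nested chain above can be built with a loop, producing the same backoff order:

backoff = nltk.DefaultTagger("NOUN")
backoff = nltk.AffixTagger(traindata, affix_length=-4, backoff=backoff)
backoff = nltk.UnigramTagger(traindata, backoff=backoff)
for n in range(2, 11):  # bigrams up to 10-grams
    backoff = nltk.NgramTagger(n, traindata, backoff=backoff)
chained_tagger = nltk.RegexpTagger(regex_patterns, backoff=backoff)

In practice, contexts longer than two or three tokens rarely recur in held-out text, so the higher-order taggers almost always defer to their backoff.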

Example #8
def main():
    # plotting the distribution graph
    # getDistSentByLength()
    #############################################################
    # cycle of training-testing, first case: random split 90%-10%
    #############################################################
    train, test = stratifiedSamples([getAllTaggedCorpus()], 10)

    nn_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                                       (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                                       (r'.*able$', 'JJ'),                # adjectives
                                       (r'.*ness$', 'NN'),                # nouns formed from adjectives
                                       (r'.*ly$', 'RB'),                  # adverbs
                                       (r'.*s$', 'NNS'),                  # plural nouns
                                       (r'.*ing$', 'VBG'),                # gerunds
                                       (r'.*ed$', 'VBD'),                 # past tense verbs
                                       (r'.*', 'NN')                      # nouns (default)
                                       ], backoff=nn_tagger)
    at2 = nltk.AffixTagger(train, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(train, backoff=at2)
    ct2 = nltk.NgramTagger(2, train, backoff=ut3)
    print "evaluate bigram(unigram(affix(regExp(default nn)))) Random Split= " ,ct2.evaluate(test)
    
    ###############################################################################################
    #cycle of training-testing second case - Stratified split 90%-10% according to sentence length#
    ###############################################################################################
    classes = divideToLengthClasses()
    train, test = stratifiedSamples(classes, 10)

    nn_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                                       (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                                       (r'.*able$', 'JJ'),                # adjectives
                                       (r'.*ness$', 'NN'),                # nouns formed from adjectives
                                       (r'.*ly$', 'RB'),                  # adverbs
                                       (r'.*s$', 'NNS'),                  # plural nouns
                                       (r'.*ing$', 'VBG'),                # gerunds
                                       (r'.*ed$', 'VBD'),                 # past tense verbs
                                       (r'.*', 'NN')                      # nouns (default)
                                       ], backoff=nn_tagger)
    at2 = nltk.AffixTagger(train, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(train, backoff=at2)
    ct2 = nltk.NgramTagger(2, train, backoff=ut3)
    print "evaluate bigram(unigram(affix(regExp(default nn)))) Length split = " ,ct2.evaluate(test)
    
    #################################################################################################
    #cycle of training-testing Third case - Stratified split 90%-10% according to the sentence genre#
    #################################################################################################
    classes = divideToGenereClasses()
    train, test = stratifiedSamples(classes, 10)

    nn_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                                       (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                                       (r'.*able$', 'JJ'),                # adjectives
                                       (r'.*ness$', 'NN'),                # nouns formed from adjectives
                                       (r'.*ly$', 'RB'),                  # adverbs
                                       (r'.*s$', 'NNS'),                  # plural nouns
                                       (r'.*ing$', 'VBG'),                # gerunds
                                       (r'.*ed$', 'VBD'),                 # past tense verbs
                                       (r'.*', 'NN')                      # nouns (default)
                                       ], backoff=nn_tagger)
    at2 = nltk.AffixTagger(train, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(train, backoff=at2)
    ct2 = nltk.NgramTagger(2, train, backoff=ut3)
    print "evaluate bigram(unigram(affix(regExp(default nn)))) Genere split = " ,ct2.evaluate(test)
Example #9
# ☼ Learn about the affix tagger (type help(nltk.AffixTagger)). Train an affix tagger and run it on some new text. Experiment with different settings for the affix length and the minimum word length. Discuss your findings.

import nltk
from nltk.corpus import brown
from nltk.corpus import gutenberg

text = gutenberg.words('austen-persuasion.txt')

brown_sents = brown.sents(categories='news')
brown_tagged_sents = brown.tagged_sents(categories='news')
affix_tagger = nltk.AffixTagger(train=brown_tagged_sents,
                                affix_length=1,
                                min_stem_length=3)
print(affix_tagger.tag(text))
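One way to run the experiment the exercise asks for (a sketch, not part of the original; the parameter grid is illustrative) is to hold out a test split and sweep both settings:

size = int(len(brown_tagged_sents) * 0.9)
train, test = brown_tagged_sents[:size], brown_tagged_sents[size:]
for affix_length in (-4, -3, -2, 2, 3):  # negative = suffix, positive = prefix
    for min_stem_length in (0, 2, 4):
        t = nltk.AffixTagger(train, affix_length=affix_length,
                             min_stem_length=min_stem_length)
        print(affix_length, min_stem_length, t.evaluate(test))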
Example #10
import pickle
from random import shuffle
from string import punctuation

import nltk


def train_pt_tagger(path):
    nltk.download('mac_morpho')
    nltk.download('floresta')

    def convert_to_universal_tag(t, reverse=False):
        tagdict = {
            'n': "NOUN",
            'num': "NUM",
            'v-fin': "VERB",
            'v-inf': "VERB",
            'v-ger': "VERB",
            'v-pcp': "VERB",
            'pron-det': "PRON",
            'pron-indp': "PRON",
            'pron-pers': "PRON",
            'art': "DET",
            'adv': "ADV",
            'conj-s': "CONJ",
            'conj-c': "CONJ",
            'conj-p': "CONJ",
            'adj': "ADJ",
            'ec': "PRT",
            'pp': "ADP",
            'prp': "ADP",
            'prop': "NOUN",
            'pro-ks-rel': "PRON",
            'proadj': "PRON",
            'prep': "ADP",
            'nprop': "NOUN",
            'vaux': "VERB",
            'propess': "PRON",
            'v': "VERB",
            'vp': "VERB",
            'in': "X",
            'prp-': "ADP",
            'adv-ks': "ADV",
            'dad': "NUM",
            'prosub': "PRON",
            'tel': "NUM",
            'ap': "NUM",
            'est': "NOUN",
            'cur': "X",
            'pcp': "VERB",
            'pro-ks': "PRON",
            'hor': "NUM",
            'pden': "ADV",
            'dat': "NUM",
            'kc': "ADP",
            'ks': "ADP",
            'adv-ks-rel': "ADV",
            'npro': "NOUN",
        }
        if t in ["N|AP", "N|DAD", "N|DAT", "N|HOR", "N|TEL"]:
            t = "NUM"
        if reverse:
            if "|" in t: t = t.split("|")[0]
        else:
            if "+" in t: t = t.split("+")[1]
            if "|" in t: t = t.split("|")[1]
            if "#" in t: t = t.split("#")[0]
        t = t.lower()
        return tagdict.get(t, "." if all(tt in punctuation for tt in t) else t)

    floresta = [[(w, convert_to_universal_tag(t)) for (w, t) in sent]
                for sent in nltk.corpus.floresta.tagged_sents()]
    shuffle(floresta)

    mac_morpho = [[w[0] for w in sent]
                  for sent in nltk.corpus.mac_morpho.tagged_paras()]
    mac_morpho = [[(w, convert_to_universal_tag(t, reverse=True))
                   for (w, t) in sent] for sent in mac_morpho]
    shuffle(mac_morpho)

    regex_patterns = [
        (r"^[nN][ao]s?$", "ADP"),
        (r"^[dD][ao]s?$", "ADP"),
        (r"^[pP]el[ao]s?$", "ADP"),
        (r"^[nN]est[ae]s?$", "ADP"),
        (r"^[nN]um$", "ADP"),
        (r"^[nN]ess[ae]s?$", "ADP"),
        (r"^[nN]aquel[ae]s?$", "ADP"),
        (r"^\xe0$", "ADP"),
    ]

    def_tagger = nltk.DefaultTagger('NOUN')
    affix_tagger = nltk.AffixTagger(mac_morpho + floresta, backoff=def_tagger)
    unitagger = nltk.UnigramTagger(mac_morpho + floresta, backoff=affix_tagger)
    rx_tagger = nltk.RegexpTagger(regex_patterns, backoff=unitagger)
    tagger = nltk.BigramTagger(floresta, backoff=rx_tagger)
    tagger = nltk.BrillTaggerTrainer(tagger, nltk.brill.fntbl37())
    tagger = tagger.train(floresta, max_rules=100)

    with open(path, "wb") as f:
        pickle.dump(tagger, f)

    return tagger
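A minimal usage sketch (not part of the original snippet; the path and sentence are illustrative):

tagger = train_pt_tagger("pt_tagger.pkl")
print(tagger.tag("O rato roeu a roupa do rei de Roma".split()))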
Example #11
# -*- coding: utf-8 -*-
# 11, 13, 15, 17, 27, 34, 36, 39, 40, 43

##################
# 11 - Learn about the affix tagger (type help(nltk.AffixTagger)). Train an affix tagger and run it on some new text.
# Experiment with different settings for the affix length and the minimum word length. Discuss your findings.
##################
import nltk
from nltk.corpus import brown

tagged = brown.tagged_sents(categories='news')
gold = brown.tagged_sents(categories='hobbies')
words = brown.words(categories='hobbies')

AffTagger = nltk.AffixTagger(tagged, affix_length=-3, min_stem_length=1)
print(AffTagger.evaluate(gold))
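For comparison (not part of the exercise), a plain unigram baseline trained and scored on the same cross-category split:

uni_tagger = nltk.UnigramTagger(tagged)
print(uni_tagger.evaluate(gold))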