Example #1
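    # Fragment of a class __init__: the enclosing class, the project's HMM
    # wrapper, and `from nltk.tag.brill import Template, Pos, Word` are
    # defined elsewhere in the source file.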
    def __init__(self, args, model_name, load_model=False):
        super().__init__(args, model_name, load_model)
        if not load_model:
            base_tagger = HMM(args, "hmm", load_model=True)
            if not base_tagger.saved_model_exists():
                raise FileNotFoundError(f"Brill base tagger '{base_tagger.model_name}' missing!")

            features = [
                Template(Pos([-1])),
                Template(Pos([1])),
                Template(Pos([-2])),
                Template(Pos([2])),
                Template(Pos([-2, -1])),
                Template(Pos([1, 2])),
                Template(Pos([-3, -2, -1])),
                Template(Pos([1, 2, 3])),
                Template(Pos([-1]), Pos([1])),
                Template(Word([-1])),
                Template(Word([1])),
                Template(Word([-2])),
                Template(Word([2])),
                Template(Word([-2, -1])),
                Template(Word([1, 2])),
                Template(Word([-3, -2, -1])),
                Template(Word([1, 2, 3])),
                Template(Word([-1]), Word([1])),
            ]
            self.model = nltk.BrillTaggerTrainer(base_tagger.model, features)
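`BrillTaggerTrainer` is only the trainer; a separate `train()` call over tagged sentences is what produces the usable `BrillTagger`. A minimal sketch, assuming `train_sents` holds this project's tagged training data (a name not in the original):

    # Hypothetical follow-up: train_sents = [[('word', 'TAG'), ...], ...]
    brill_tagger = self.model.train(train_sents, max_rules=200)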
Example #2
def create_tagger():
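    # `retrieve_traindata` and `patterns` (regex tagging patterns used below)
    # are assumed to be defined elsewhere in the source module.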
    traindata = retrieve_traindata()

    tagger1 = nltk.DefaultTagger('NOUN')
    tagger2 = nltk.AffixTagger(traindata, backoff=tagger1)
    tagger3 = nltk.UnigramTagger(traindata, backoff=tagger2)
    tagger4 = nltk.RegexpTagger(patterns, backoff=tagger3)
    tagger5 = nltk.BigramTagger(traindata, backoff=tagger4)

    templates = nltk.tag.brill.fntbl37()
    tagger6 = nltk.BrillTaggerTrainer(tagger5, templates)
    tagger6 = tagger6.train(traindata)

    return tagger6
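A brief usage sketch for the returned tagger (the sample tokens are illustrative only; the output tags depend on the training data):

tagger = create_tagger()
print(tagger.tag(["The", "house", "is", "blue"]))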
Example #3
def train_pt_tagger(path):
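    # Assumed imports for this snippet: `import nltk`, `import pickle`,
    # `from random import shuffle`, `from string import punctuation`.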
    nltk.download('mac_morpho')
    nltk.download('floresta')

    def convert_to_universal_tag(t, reverse=False):
        tagdict = {
            'n': "NOUN",
            'num': "NUM",
            'v-fin': "VERB",
            'v-inf': "VERB",
            'v-ger': "VERB",
            'v-pcp': "VERB",
            'pron-det': "PRON",
            'pron-indp': "PRON",
            'pron-pers': "PRON",
            'art': "DET",
            'adv': "ADV",
            'conj-s': "CONJ",
            'conj-c': "CONJ",
            'conj-p': "CONJ",
            'adj': "ADJ",
            'ec': "PRT",
            'pp': "ADP",
            'prp': "ADP",
            'prop': "NOUN",
            'pro-ks-rel': "PRON",
            'proadj': "PRON",
            'prep': "ADP",
            'nprop': "NOUN",
            'vaux': "VERB",
            'propess': "PRON",
            'v': "VERB",
            'vp': "VERB",
            'in': "X",
            'prp-': "ADP",
            'adv-ks': "ADV",
            'dad': "NUM",
            'prosub': "PRON",
            'tel': "NUM",
            'ap': "NUM",
            'est': "NOUN",
            'cur': "X",
            'pcp': "VERB",
            'pro-ks': "PRON",
            'hor': "NUM",
            'pden': "ADV",
            'dat': "NUM",
            'kc': "ADP",
            'ks': "ADP",
            'adv-ks-rel': "ADV",
            'npro': "NOUN",
        }
        if t in ["N|AP", "N|DAD", "N|DAT", "N|HOR", "N|TEL"]:
            t = "NUM"
        if reverse:
            if "|" in t: t = t.split("|")[0]
        else:
            if "+" in t: t = t.split("+")[1]
            if "|" in t: t = t.split("|")[1]
            if "#" in t: t = t.split("#")[0]
        t = t.lower()
        return tagdict.get(t, "." if all(tt in punctuation for tt in t) else t)

    floresta = [[(w, convert_to_universal_tag(t)) for (w, t) in sent]
                for sent in nltk.corpus.floresta.tagged_sents()]
    shuffle(floresta)

    mac_morpho = [[w[0] for w in sent]
                  for sent in nltk.corpus.mac_morpho.tagged_paras()]
    mac_morpho = [[(w, convert_to_universal_tag(t, reverse=True))
                   for (w, t) in sent] for sent in mac_morpho]
    shuffle(mac_morpho)

    regex_patterns = [
        (r"^[nN][ao]s?$", "ADP"),
        (r"^[dD][ao]s?$", "ADP"),
        (r"^[pP]el[ao]s?$", "ADP"),
        (r"^[nN]est[ae]s?$", "ADP"),
        (r"^[nN]um$", "ADP"),
        (r"^[nN]ess[ae]s?$", "ADP"),
        (r"^[nN]aquel[ae]s?$", "ADP"),
        (r"^\xe0$", "ADP"),
    ]

    def_tagger = nltk.DefaultTagger('NOUN')
    affix_tagger = nltk.AffixTagger(mac_morpho + floresta, backoff=def_tagger)
    unitagger = nltk.UnigramTagger(mac_morpho + floresta, backoff=affix_tagger)
    rx_tagger = nltk.RegexpTagger(regex_patterns, backoff=unitagger)
    tagger = nltk.BigramTagger(floresta, backoff=rx_tagger)
    tagger = nltk.BrillTaggerTrainer(tagger, nltk.tag.brill.fntbl37())
    tagger = tagger.train(floresta, max_rules=100)

    with open(path, "wb") as f:
        pickle.dump(tagger, f)

    return tagger
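Loading and reusing the pickled tagger later might look like this (the path and sentence are illustrative only):

import pickle

with open("pt_tagger.pkl", "rb") as f:  # hypothetical path
    pt_tagger = pickle.load(f)
print(pt_tagger.tag("O gato preto dorme".split()))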
Example #4
# The snippet opens mid-file: `traindata`, `dataset2`, and
# convert_to_universal_tag() are built earlier, as in Example 3. The first
# visible line presumably completes:
traindata2 = [[(w, convert_to_universal_tag(t, reverse=True))
               for (w, t) in sent] for sent in dataset2]

shuffle(traindata)
shuffle(traindata2)

regex_patterns = [
    (r"^[nN][ao]s?$", "ADP"),
    (r"^[dD][ao]s?$", "ADP"),
    (r"^[pP]el[ao]s?$", "ADP"),
    (r"^[nN]est[ae]s?$", "ADP"),
    (r"^[nN]um$", "ADP"),
    (r"^[nN]ess[ae]s?$", "ADP"),
    (r"^[nN]aquel[ae]s?$", "ADP"),
    (r"^\xe0$", "ADP"),
]

tagger = nltk.BigramTagger(traindata,
                           backoff=nltk.RegexpTagger(
                               regex_patterns,
                               backoff=nltk.UnigramTagger(
                                   traindata2,
                                   backoff=nltk.AffixTagger(
                                       traindata2,
                                       backoff=nltk.DefaultTagger('NOUN')))))
templates = nltk.tag.brill.fntbl37()
tagger = nltk.BrillTaggerTrainer(tagger, templates)
tagger = tagger.train(traindata, max_rules=100)

with open("tagger.pkl", "wb") as f:
    pickle.dump(tagger, f)
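To reuse the serialized tagger, unpickle it and call `tag()` (the sentence is illustrative only):

with open("tagger.pkl", "rb") as f:
    tagger = pickle.load(f)
print(tagger.tag(["Ela", "mora", "em", "São", "Paulo"]))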
Example #5
def train_es_tagger(path):
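    # Assumed imports, as in the Portuguese example: `import nltk`,
    # `import pickle`, `from random import shuffle`,
    # `from string import punctuation`.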
    nltk.download('cess_esp')

    def convert_to_universal_tag(t):
        # NOTE: keys must be lowercase, since `t` is lowercased before lookup.
        tagdict = {
            'fa': '.',
            'faa': '.',
            'fat': '.',
            'fc': '.',
            'fd': '.',
            'fe': '.',
            'fg': '.',
            'fh': '.',
            'fi': '.',
            'fia': '.',
            'fit': '.',
            'fp': '.',
            'fpa': '.',
            'fpt': '.',
            'fs': '.',
            'fx': '.',
            'fz': '.',
            'x': 'X',
            'y': 'X',
            'zm': 'NUM',
            'zp': 'NUM',
            'ao': 'ADJ',
            'ao0fp0': 'ADJ',
            'ao0fs0': 'ADJ',
            'ao0mp0': 'ADJ',
            'ao0ms0': 'ADJ',
            'aq': 'ADJ',
            'aq00000': 'ADJ',
            'aq0cn0': 'ADJ',
            'aq0cp0': 'ADJ',
            'aq0cs0': 'ADJ',
            'aq0fp0': 'ADJ',
            'aq0fpp': 'ADJ',
            'aq0fs0': 'ADJ',
            'aq0fsp': 'ADJ',
            'aq0mp0': 'ADJ',
            'aq0mpp': 'ADJ',
            'aq0ms0': 'ADJ',
            'aq0msp': 'ADJ',
            'cc': 'CONJ',
            'cs': 'CONJ',
            'da': 'DET',
            'da0fp0': 'DET',
            'da0fs0': 'DET',
            'da0mp0': 'DET',
            'da0ms0': 'DET',
            'da0ns0': 'DET',
            'dd': 'DET',
            'dd0cp0': 'DET',
            'dd0cs0': 'DET',
            'dd0fp0': 'DET',
            'dd0fs0': 'DET',
            'dd0mp0': 'DET',
            'dd0ms0': 'DET',
            'de': 'DET',
            'de0cn0': 'DET',
            'di': 'DET',
            'di0cp0': 'DET',
            'di0cs0': 'DET',
            'di0fp0': 'DET',
            'di0fs0': 'DET',
            'di0mp0': 'DET',
            'di0ms0': 'DET',
            'dn': 'DET',
            'dn0cp0': 'DET',
            'dn0cs0': 'DET',
            'dn0fp0': 'DET',
            'dn0fs0': 'DET',
            'dn0mp0': 'DET',
            'dn0ms0': 'DET',
            'dp': 'DET',
            'dp1cps': 'DET',
            'dp1css': 'DET',
            'dp1fpp': 'DET',
            'dp1fsp': 'DET',
            'dp1mpp': 'DET',
            'dp1msp': 'DET',
            'dp1mss': 'DET',
            'dp2cps': 'DET',
            'dp2css': 'DET',
            'dp3cp0': 'DET',
            'dp3cs0': 'DET',
            'dp3fs0': 'DET',
            'dp3mp0': 'DET',
            'dp3ms0': 'DET',
            'dt': 'DET',
            'dt0cn0': 'DET',
            'dt0fs0': 'DET',
            'dt0ms0': 'DET',
            'i': 'X',
            'nc': 'NOUN',
            'nc00000': 'NOUN',
            'nccn000': 'NOUN',
            'nccp000': 'NOUN',
            'nccs000': 'NOUN',
            'ncfn000': 'NOUN',
            'ncfp000': 'NOUN',
            'ncfs000': 'NOUN',
            'ncmn000': 'NOUN',
            'ncmp000': 'NOUN',
            'ncms000': 'NOUN',
            'np': 'NOUN',
            'np00000': 'NOUN',
            'np0000a': 'NOUN',
            'np0000l': 'NOUN',
            'np0000o': 'NOUN',
            'np0000p': 'NOUN',
            'p0': 'PRON',
            'p0000000': 'PRON',
            'p010p000': 'PRON',
            'p010s000': 'PRON',
            'p020s000': 'PRON',
            'p0300000': 'PRON',
            'pd': 'PRON',
            'pd0cp000': 'PRON',
            'pd0cs000': 'PRON',
            'pd0fp000': 'PRON',
            'pd0fs000': 'PRON',
            'pd0mp000': 'PRON',
            'pd0ms000': 'PRON',
            'pd0ns000': 'PRON',
            'pe': 'PRON',
            'pe000000': 'PRON',
            'pi': 'PRON',
            'pi0cp000': 'PRON',
            'pi0cs000': 'PRON',
            'pi0fp000': 'PRON',
            'pi0fs000': 'PRON',
            'pi0mp000': 'PRON',
            'pi0ms000': 'PRON',
            'pn': 'PRON',
            'pn0cp000': 'PRON',
            'pn0fp000': 'PRON',
            'pn0fs000': 'PRON',
            'pn0mp000': 'PRON',
            'pn0ms000': 'PRON',
            'pp': 'PRON',
            'pp1cp000': 'PRON',
            'pp1cs000': 'PRON',
            'pp1csn00': 'PRON',
            'pp1cso00': 'PRON',
            'pp1mp000': 'PRON',
            'pp2cp000': 'PRON',
            'pp2cp00p': 'PRON',
            'pp2cs000': 'PRON',
            'pp2cs00p': 'PRON',
            'pp2csn00': 'PRON',
            'pp2cso00': 'PRON',
            'pp3cn000': 'PRON',
            'pp3cna00': 'PRON',
            'pp3cno00': 'PRON',
            'pp3cpa00': 'PRON',
            'pp3cpd00': 'PRON',
            'pp3csa00': 'PRON',
            'pp3csd00': 'PRON',
            'pp3fp000': 'PRON',
            'pp3fpa00': 'PRON',
            'pp3fs000': 'PRON',
            'pp3fsa00': 'PRON',
            'pp3mp000': 'PRON',
            'pp3mpa00': 'PRON',
            'pp3ms000': 'PRON',
            'pp3msa00': 'PRON',
            'pp3ns000': 'PRON',
            'pr': 'PRON',
            'pr000000': 'PRON',
            'pr0cn000': 'PRON',
            'pr0cp000': 'PRON',
            'pr0cs000': 'PRON',
            'pr0fp000': 'PRON',
            'pr0fs000': 'PRON',
            'pr0mp000': 'PRON',
            'pr0ms000': 'PRON',
            'pt': 'PRON',
            'pt000000': 'PRON',
            'pt0cp000': 'PRON',
            'pt0cs000': 'PRON',
            'pt0mp000': 'PRON',
            'pt0ms000': 'PRON',
            'px': 'PRON',
            'px1fp0p0': 'PRON',
            'px1fs0p0': 'PRON',
            'px1mp0p0': 'PRON',
            'px1ms0p0': 'PRON',
            'px2fs0s0': 'PRON',
            'px3fs000': 'PRON',
            'px3mp000': 'PRON',
            'px3ms000': 'PRON',
            'px3ns000': 'PRON',
            'rg': 'ADV',
            'rn': 'ADV',
            'sn': 'ADP',
            'sn-suj': 'ADP',
            'sn.co-suj': 'ADP',
            'sn.e': 'ADP',
            'sn.e-atr': 'ADP',
            'sn.e-cd': 'ADP',
            'sn.e-suj': 'ADP',
            'sn.e.1n-suj': 'ADP',
            'sp': 'ADP',
            'spcms': 'ADP',
            'sps00': 'ADP',
            'va': 'VERB',
            'vag0000': 'VERB',
            'vaic1p0': 'VERB',
            'vaic3p0': 'VERB',
            'vaic3s0': 'VERB',
            'vaif1p0': 'VERB',
            'vaif2s0': 'VERB',
            'vaif3p0': 'VERB',
            'vaif3s0': 'VERB',
            'vaii1p0': 'VERB',
            'vaii1s0': 'VERB',
            'vaii2s0': 'VERB',
            'vaii3p0': 'VERB',
            'vaii3s0': 'VERB',
            'vaip1p0': 'VERB',
            'vaip1s0': 'VERB',
            'vaip2p0': 'VERB',
            'vaip2s0': 'VERB',
            'vaip3p0': 'VERB',
            'vaip3s0': 'VERB',
            'vais3s0': 'VERB',
            'vam02s0': 'VERB',
            'vam03s0': 'VERB',
            'van0000': 'VERB',
            'vap00sm': 'VERB',
            'vasi1p0': 'VERB',
            'vasi1s0': 'VERB',
            'vasi3p0': 'VERB',
            'vasi3s0': 'VERB',
            'vasp1s0': 'VERB',
            'vasp3p0': 'VERB',
            'vasp3s0': 'VERB',
            'vm': 'VERB',
            'vmg0000': 'VERB',
            'vmic1p0': 'VERB',
            'vmic1s0': 'VERB',
            'vmic2s0': 'VERB',
            'vmic3p0': 'VERB',
            'vmic3s0': 'VERB',
            'vmif1p0': 'VERB',
            'vmif1s0': 'VERB',
            'vmif2s0': 'VERB',
            'vmif3p0': 'VERB',
            'vmif3s0': 'VERB',
            'vmii1p0': 'VERB',
            'vmii1s0': 'VERB',
            'vmii2p0': 'VERB',
            'vmii2s0': 'VERB',
            'vmii3p0': 'VERB',
            'vmii3s0': 'VERB',
            'vmip1p0': 'VERB',
            'vmip1s0': 'VERB',
            'vmip2p0': 'VERB',
            'vmip2s0': 'VERB',
            'vmip3p0': 'VERB',
            'vmip3s0': 'VERB',
            'vmis1p0': 'VERB',
            'vmis1s0': 'VERB',
            'vmis2s0': 'VERB',
            'vmis3p0': 'VERB',
            'vmis3s0': 'VERB',
            'vmm01p0': 'VERB',
            'vmm02s0': 'VERB',
            'vmm03p0': 'VERB',
            'vmm03s0': 'VERB',
            'vmn0000': 'VERB',
            'vmp00pf': 'VERB',
            'vmp00pm': 'VERB',
            'vmp00sf': 'VERB',
            'vmp00sm': 'VERB',
            'vmsi1p0': 'VERB',
            'vmsi1s0': 'VERB',
            'vmsi3p0': 'VERB',
            'vmsi3s0': 'VERB',
            'vmsp1p0': 'VERB',
            'vmsp1s0': 'VERB',
            'vmsp2p0': 'VERB',
            'vmsp2s0': 'VERB',
            'vmsp3p0': 'VERB',
            'vmsp3s0': 'VERB',
            'vs': 'VERB',
            'vsg0000': 'VERB',
            'vsic1s0': 'VERB',
            'vsic2s0': 'VERB',
            'vsic3p0': 'VERB',
            'vsic3s0': 'VERB',
            'vsif1s0': 'VERB',
            'vsif3p0': 'VERB',
            'vsif3s0': 'VERB',
            'vsii1p0': 'VERB',
            'vsii1s0': 'VERB',
            'vsii3p0': 'VERB',
            'vsii3s0': 'VERB',
            'vsip1p0': 'VERB',
            'vsip1s0': 'VERB',
            'vsip2s0': 'VERB',
            'vsip3p0': 'VERB',
            'vsip3s0': 'VERB',
            'vsis1s0': 'VERB',
            'vsis3p0': 'VERB',
            'vsis3s0': 'VERB',
            'vsm03s0': 'VERB',
            'vsn0000': 'VERB',
            'vsp00sm': 'VERB',
            'vssf3s0': 'VERB',
            'vssi3p0': 'VERB',
            'vssi3s0': 'VERB',
            'vssp1s0': 'VERB',
            'vssp2s0': 'VERB',
            'vssp3p0': 'VERB',
            'vssp3s0': 'VERB',
            'w': 'NOUN',
            'z': 'NUM'
        }
        t = t.lower()
        return tagdict.get(t, "." if all(tt in punctuation for tt in t) else t)

    cess = [[(w, convert_to_universal_tag(t)) for (w, t) in sent]
            for sent in nltk.corpus.cess_esp.tagged_sents()]
    shuffle(cess)
    def_tagger = nltk.DefaultTagger('NOUN')
    affix_tagger = nltk.AffixTagger(cess, backoff=def_tagger)
    unitagger = nltk.UnigramTagger(cess, backoff=affix_tagger)
    tagger = nltk.BigramTagger(cess, backoff=unitagger)
    tagger = nltk.BrillTaggerTrainer(tagger, nltk.tag.brill.fntbl37())
    tagger = tagger.train(cess, max_rules=100)

    with open(path, "wb") as f:
        pickle.dump(tagger, f)

    return tagger
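A matching usage sketch for the Spanish tagger (path and sentence are illustrative only):

es_tagger = train_es_tagger("es_tagger.pkl")  # hypothetical path
print(es_tagger.tag("El perro duerme en la casa".split()))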
Example #6
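# Fragment: `regexp_tagger`, `train_data`, `dev_data`, `test_data` and
# MAX_RULES are defined earlier in the source file; Template, Pos and Word
# come from nltk.tag.brill.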
templates = [
    # Template(Word([0]), Word([1])),            # current word + next word          (<0%)
    Template(Pos([-1]), Pos([0])),                # previous POS + current POS        (0%)
    # Template(Pos([-2])),                        # prev-prev POS tag                 (<0%)
    Template(Pos([-3, -2, -1])),                  # previous POS tags (disjunctive)   (<1%)
    Template(Pos([-2]), Pos([1])),                # previous POS + next POS           (<1%)
    Template(Pos([1])),                           # next POS tag                      (<0%)
    Template(Word([-2, -1])),                     # previous two words (disjunctive)  (<0.1%)
    Template(Word([0])),                          # current word                      (<3%)
    Template(Word([0]), Word([-1]), Pos([-1])),   # current + prev word + prev POS    (0%)
]

# Train an error-driven, transformation-based tagger
tt = nltk.BrillTaggerTrainer(regexp_tagger, templates, trace=3)
brill_tagger = tt.train(train_data, max_rules=MAX_RULES)

## Part 3: Evaluation
print("\nRegexp_tagger accuracy with dev_data: {}".format(
    regexp_tagger.evaluate(dev_data)))
print("Regexp_tagger accuracy with train_data: {}".format(
    regexp_tagger.evaluate(train_data)))
print("Regexp_tagger accuracy with test_data: {}".format(
    regexp_tagger.evaluate(test_data)))
print("-" * 80)
print("\nBrill_tagger accuracy with dev_data: {}".format(
    brill_tagger.evaluate(dev_data)))
print("Brill_tagger accuracy with train_data: {}".format(
    brill_tagger.evaluate(train_data)))
print("Brill with REGEX tagger accuracy with test_data: {}\n".format(