def get_tagger():
    """Build and return a Brill tagger trained on a slice of tagged sentences.

    The initial tagger is a backoff chain: a bigram tagger trained on the
    first 800 entries of the module-level ``brown_tagged_sents``, backing
    off to a regexp tagger built from the module-level ``patterns``, which
    in turn backs off to a default tagger that labels every token ``'NN'``.
    A Brill trainer then learns transformation rules on top of that chain
    using the same training slice.

    Returns:
        The tagger returned by ``FastBrillTaggerTrainer.train`` (called with
        ``max_rules=100`` and ``min_score=3``).
    """
    d_tagger = nltk.DefaultTagger('NN')
    re_tagger = nltk.RegexpTagger(patterns, backoff=d_tagger)

    print("Loading tagged data... ")
    tagged_data = brown_tagged_sents
    # Train on 80% of the first 1000 sentences (i.e. the first 800).  The
    # remaining 200 used to be sliced into held-out lists here, but nothing
    # in this function ever read them, so they are no longer built.
    cutoff = int(1000 * .8)
    training_data = tagged_data[:cutoff]
    print("Done loading.")

    bigram_tagger = tag.BigramTagger(training_data, backoff=re_tagger)

    # One template per (rule class, window) pair: symmetric windows over
    # neighbouring tags and words, plus one template per rule class that
    # looks at the immediately preceding and following token.
    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)),
    ]
    # NOTE(review): the positional 0 is presumably the trainer's trace level
    # (other call sites pass ``trace=`` by keyword) -- confirm against the
    # installed NLTK version.
    trainer = brill.FastBrillTaggerTrainer(bigram_tagger, templates, 0)
    brill_tagger = trainer.train(training_data, max_rules=100, min_score=3)

    return brill_tagger
Beispiel #2
0
    def train(self, sentence_list):
        """Fit a Brill tagger on ``sentence_list`` and store it in ``self.tagger``.

        The initial tagger is a backoff chain built entirely from
        ``sentence_list``: trigram -> bigram -> unigram -> affix -> a default
        tagger that assigns ``'NN'``.  The Brill trainer is then run on the
        same sentences with ``max_rules=100`` and ``min_score=3``.
        """
        # Build the chain from the most general tagger outwards; each new
        # tagger backs off to the one built just before it.
        initial_tagger = DefaultTagger('NN')
        for tagger_cls in (AffixTagger, UnigramTagger, BigramTagger,
                           TrigramTagger):
            initial_tagger = tagger_cls(sentence_list, backoff=initial_tagger)

        rule_classes = (brill.ProximateTagsRule, brill.ProximateWordsRule)
        symmetric_spans = ((1, 1), (2, 2), (1, 2), (1, 3))

        # Symmetric templates: all tag-rule spans first, then all word-rule
        # spans, followed by one asymmetric template per rule class.
        templates = [
            brill.SymmetricProximateTokensTemplate(rule_cls, span)
            for rule_cls in rule_classes
            for span in symmetric_spans
        ]
        templates.extend(
            brill.ProximateTokensTemplate(rule_cls, (-1, -1), (1, 1))
            for rule_cls in rule_classes
        )

        brill_trainer = brill.FastBrillTaggerTrainer(initial_tagger, templates)
        self.tagger = brill_trainer.train(sentence_list,
                                          max_rules=100,
                                          min_score=3)
Beispiel #3
0
def train_brill_tagger(initial_tagger, train_sents, end, trace=0, **kwargs):
    """Train a Brill tagger using symmetric templates over the span ``(1, end)``.

    Two templates are created, one for ``ProximateTagsRule`` and one for
    ``ProximateWordsRule``, each with the single boundary ``(1, end)``.

    Args:
        initial_tagger: Tagger handed to ``FastBrillTaggerTrainer`` as the
            starting point.
        train_sents: Sentences passed to ``trainer.train``.
        end: Upper bound of the template span.
        trace: Forwarded to the trainer's ``trace`` argument.
        **kwargs: Forwarded unchanged to ``trainer.train``.

    Returns:
        Whatever ``trainer.train`` returns.
    """
    span = (1, end)
    templates = [
        brill.SymmetricProximateTokensTemplate(rule_cls, span)
        for rule_cls in (brill.ProximateTagsRule, brill.ProximateWordsRule)
    ]

    brill_trainer = brill.FastBrillTaggerTrainer(
        initial_tagger, templates, deterministic=True, trace=trace)
    return brill_trainer.train(train_sents, **kwargs)
Beispiel #4
0
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger with a fixed set of four templates.

    For each of ``ProximateTagsRule`` and ``ProximateWordsRule`` this builds
    one symmetric template that receives all four boundaries
    ``(1, 1), (2, 2), (1, 2), (1, 3)`` at once, and one plain template with
    the boundaries ``(-1, -1), (1, 1)``.

    Args:
        initial_tagger: Tagger handed to ``FastBrillTaggerTrainer`` as the
            starting point.
        train_sents: Sentences passed to ``trainer.train``.
        **kwargs: Forwarded unchanged to ``trainer.train``.

    Returns:
        Whatever ``trainer.train`` returns.
    """
    rule_classes = (brill.ProximateTagsRule, brill.ProximateWordsRule)

    # Each symmetric template takes every boundary as a separate argument.
    symmetric = [
        brill.SymmetricProximateTokensTemplate(
            rule_cls, (1, 1), (2, 2), (1, 2), (1, 3))
        for rule_cls in rule_classes
    ]
    asymmetric = [
        brill.ProximateTokensTemplate(rule_cls, (-1, -1), (1, 1))
        for rule_cls in rule_classes
    ]

    brill_trainer = brill.FastBrillTaggerTrainer(
        initial_tagger, symmetric + asymmetric, deterministic=True)
    return brill_trainer.train(train_sents, **kwargs)
Beispiel #5
0
def treina(expressao_regular,
           etiquetador=INICIAL,
           destino="BRUBT.pkl",
           raiz=".",
           codificacao="utf-8",
           max_rules=100,
           min_score=3):
    """Train a Brill tagger on a tagged corpus and serialize it to ``destino``.

    After training, the tagger is applied to the module-level ``SENTENCA``
    and the result is printed, then the tagger is dumped to disk.

    Args:
        expressao_regular: Passed as the second positional argument of
            ``TaggedCorpusReader`` (the file selection pattern).
        etiquetador: Passed to ``abre_etiquetador`` to obtain the initial
            tagger.
        destino: Path of the output file; it is opened in binary write mode.
        raiz: Passed as the first positional argument of
            ``TaggedCorpusReader`` (the corpus root).
        codificacao: Encoding handed to ``TaggedCorpusReader``.
        max_rules: Forwarded to ``trainer.train``.
        min_score: Forwarded to ``trainer.train``.
    """
    inicial = abre_etiquetador(etiquetador)
    corpus = TaggedCorpusReader(raiz, expressao_regular, encoding=codificacao)
    train_sents = corpus.tagged_sents()
    trainer = brill.FastBrillTaggerTrainer(inicial, TEMPLATES)
    brubt = trainer.train(train_sents,
                          max_rules=max_rules,
                          min_score=min_score)
    print('Etiquetagem da sentença-exemplo "%s"\n' % EXEMPLO,
          brubt.tag(SENTENCA))
    # Use a context manager so the file handle is closed even if dumping
    # the tagger raises.
    # NOTE(review): ``dump`` is presumably ``pickle.dump`` (-1 would then
    # select the highest protocol) -- confirm against the file's imports.
    with open(destino, "wb") as f:
        dump(brubt, f, -1)
Beispiel #6
0
def get_brill(train_sents=None):
    """Train and return a Brill tagger.

    Args:
        train_sents: Tagged sentences to train on.  When ``None``, the
            concatenation of ``nltk.corpus.conll2000.tagged_sents()`` and
            ``nltk.corpus.brown.tagged_sents()`` is used.

    Returns:
        The tagger returned by ``FastBrillTaggerTrainer.train`` (called with
        ``max_rules=100`` and ``min_score=3``).
    """
    import nltk
    import nltk.tag
    from nltk.tag import brill

    if train_sents is None:
        conll_sents = nltk.corpus.conll2000.tagged_sents()
        brown_sents = nltk.corpus.brown.tagged_sents()
        train_sents = conll_sents + brown_sents

    # Initial tagger: the module-level ``backoff_tagger`` helper is given the
    # n-gram tagger classes and a regexp tagger as the final backoff.
    initial_tagger = backoff_tagger(
        train_sents,
        [
            nltk.tag.AffixTagger,
            nltk.tag.UnigramTagger,
            nltk.tag.BigramTagger,
            nltk.tag.TrigramTagger,
        ],
        backoff=nltk.tag.RegexpTagger(word_patterns),
    )

    rule_classes = (brill.ProximateTagsRule, brill.ProximateWordsRule)
    symmetric_spans = ((1, 1), (2, 2), (1, 2), (1, 3))

    # Symmetric templates (tag rules first, then word rules), followed by one
    # asymmetric template per rule class.
    templates = [
        brill.SymmetricProximateTokensTemplate(rule_cls, span)
        for rule_cls in rule_classes
        for span in symmetric_spans
    ]
    templates.extend(
        brill.ProximateTokensTemplate(rule_cls, (-1, -1), (1, 1))
        for rule_cls in rule_classes
    )

    brill_trainer = brill.FastBrillTaggerTrainer(initial_tagger, templates)
    return brill_trainer.train(train_sents, max_rules=100, min_score=3)
Beispiel #7
0
        backoff = tagger_classes[0](tagged_sents)
        del tagger_classes[0]

    for cls in tagger_classes:
        tagger = cls(tagged_sents, backoff=backoff)
        backoff = tagger

    return backoff


# Initial tagger: ``backoff_tagger`` is given the n-gram tagger classes to
# train on ``brownc_sents``.
raubt_tagger = backoff_tagger(
    brownc_sents,
    [
        nltk.tag.AffixTagger,
        nltk.tag.UnigramTagger,
        nltk.tag.BigramTagger,
        nltk.tag.TrigramTagger,
    ],
)

# Symmetric templates (tag rules first, then word rules) followed by one
# asymmetric template per rule class.  Generator expressions are used so
# that no loop variables are left behind in the module namespace.
templates = list(
    brill.SymmetricProximateTokensTemplate(rule_cls, span)
    for rule_cls in (brill.ProximateTagsRule, brill.ProximateWordsRule)
    for span in ((1, 1), (2, 2), (1, 2), (1, 3))
) + list(
    brill.ProximateTokensTemplate(rule_cls, (-1, -1), (1, 1))
    for rule_cls in (brill.ProximateTagsRule, brill.ProximateWordsRule)
)

# Learn the Brill transformation rules on the same sentences.
trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
braubt_tagger = trainer.train(brownc_sents, max_rules=100, min_score=3)
    (r'.*ic$', 'JJ'),
    (r'.*est$', 'JJ'),
    (r'^a$', 'PREP'),
]

# Initial tagger built by ``AO_fBackoffTagger`` from ``conll_train`` and the
# listed n-gram tagger classes, with a regexp tagger over
# ``AO_sWordPatterns`` as the final backoff.
# NOTE(review): presumably the classes are chained as successive backoffs --
# confirm against the definition of ``AO_fBackoffTagger``.
AO_fRAUBTtagger = AO_fBackoffTagger(
    conll_train, [
        nltk.tag.AffixTagger, nltk.tag.UnigramTagger, nltk.tag.BigramTagger,
        nltk.tag.TrigramTagger
    ],
    backoff=nltk.tag.RegexpTagger(AO_sWordPatterns))

# One template per (rule class, window) pair: symmetric windows over
# neighbouring tags and words, plus one template per rule class that looks
# at the immediately preceding and following token.
templates = [
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
    brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
    brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1))
]

trainer = brill.FastBrillTaggerTrainer(AO_fRAUBTtagger, templates)

brill_tagger = trainer.train(conll_train, max_rules=100, min_score=3)

# Parenthesised so the line is valid on Python 3 as well; with a single
# string argument Python 2 prints exactly the same text.
print("AO-I-BRLTND Brill tagger trained on conll2000 corpora.")