Ejemplo n.º 1
0
    def train(self, sentence_list):
        """Train this tagger on *sentence_list* (tagged sentences).

        Builds a sequential backoff chain (default 'NN' -> affix ->
        unigram -> bigram -> trigram), then refines it with a fast Brill
        trainer and stores the result in ``self.tagger``.
        """
        # Each tagger in the chain falls back to the previously built one.
        backoff = DefaultTagger('NN')
        for tagger_cls in (AffixTagger, UnigramTagger, BigramTagger,
                           TrigramTagger):
            backoff = tagger_cls(sentence_list, backoff=backoff)

        # Symmetric templates over tag and word contexts at several spans
        # (same rules and order as listing them out one by one).
        spans = [(1, 1), (2, 2), (1, 2), (1, 3)]
        templates = [
            brill.SymmetricProximateTokensTemplate(rule, span)
            for rule in (brill.ProximateTagsRule, brill.ProximateWordsRule)
            for span in spans
        ]
        # Templates that condition on one token behind AND one ahead at once.
        templates.append(
            brill.ProximateTokensTemplate(brill.ProximateTagsRule,
                                          (-1, -1), (1, 1)))
        templates.append(
            brill.ProximateTokensTemplate(brill.ProximateWordsRule,
                                          (-1, -1), (1, 1)))

        trainer = brill.FastBrillTaggerTrainer(backoff, templates)
        self.tagger = trainer.train(sentence_list, max_rules=100, min_score=3)
    def test_affix_tagger(self):
        """Round-trip an AffixTagger through encode/decode and verify the
        restored tagger matches the original."""
        original = AffixTagger(self.corpus, backoff=self.default_tagger)

        restored = self.decoder.decode(self.encoder.encode(original))

        self.assertEqual(repr(original), repr(restored))
        self.assertEqual(repr(original.backoff), repr(restored.backoff))
        # Internal training state must survive the round trip as well.
        for attr in ('_affix_length', '_min_word_length', '_context_to_tag'):
            self.assertEqual(getattr(original, attr), getattr(restored, attr))
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Grab the tagged sentences once and slice overlapping train/test sets
# (sentences 2000-6999 appear in both).
sents = treebank.tagged_sents()
testing = sents[2000:]
training = sents[:7000]

# Suffix tagger: the last three characters of each word form its context.
suffixtag = AffixTagger(training, affix_length=-3)
print(suffixtag.evaluate(testing))
Ejemplo n.º 4
0
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Slice overlapping train/test sets from the treebank corpus.
sents = treebank.tagged_sents()
testing = sents[2000:]
training = sents[:7000]

# AffixTagger with all defaults, evaluated on the held-out slice.
affixtag = AffixTagger(training)
print(affixtag.evaluate(testing))
Ejemplo n.º 5
0
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Slice overlapping train/test sets from the treebank corpus.
sents = treebank.tagged_sents()
testing = sents[2000:]
training = sents[:7000]

# A positive affix_length selects word *prefixes* (here, 4 characters).
prefixtag = AffixTagger(training, affix_length=4)
print(prefixtag.evaluate(testing))
Ejemplo n.º 6
0
from nltk.tag import AffixTagger
from tag_util import train_sents, test_sents

# Train an AffixTagger with all defaults and report its accuracy on the
# held-out sentences supplied by tag_util.
tagger = AffixTagger(train_sents)
print(tagger.evaluate(test_sents))
Ejemplo n.º 7
0
train_sents = update_tags(train_sents)
val_sents = update_tags(val_sents)
test_sents = update_tags(test_sents)
"""
# =============================================================================
# finalise a sequential tagger
# =============================================================================
"""
""" 1. run tagger with different corpus size (50% and 100%) """
# backoff tagger
tag1_eval = dict()
# train with backoff and Brill
tic()
# Backoff chain: default tag, suffix taggers of length 1..5, then
# unigram (with cutoff), bigram and trigram stages on top.
tag1_tagger = DefaultTagger('NO')
for length in range(-1, -6, -1):
    tag1_tagger = AffixTagger(train_sents, affix_length=length,
                              backoff=tag1_tagger)
tag1_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag1_tagger)
tag1_tagger = BigramTagger(train_sents, backoff=tag1_tagger)
tag1_tagger = TrigramTagger(train_sents, backoff=tag1_tagger)
tag1b_tagger = train_brill_tagger(tag1_tagger, train_sents, True,
                                  max_rules=100)
tag1_eval['train_time'] = toc()
# test
tic()
tag1_eval['test_accuracy'] = tag1b_tagger.evaluate(val_sents)
Ejemplo n.º 8
0
 def train(self, sentence_list):
     """Train a trigram tagger with a backoff chain ending in DefaultTagger('NN')."""
     chain = DefaultTagger('NN')
     # Stack affix, unigram and bigram stages, each backing off to the last.
     for tagger_cls in (AffixTagger, UnigramTagger, BigramTagger):
         chain = tagger_cls(sentence_list, backoff=chain)
     self.tagger = TrigramTagger(sentence_list, backoff=chain)
Ejemplo n.º 9
0
    return backoff


# Testing the function with all 4 taggers
# (make_backoffs presumably chains the listed tagger classes on top of
#  df_tagger -- both are defined elsewhere in this file; confirm there.)
bc_tagger = make_backoffs(train_sents,
                          [UnigramTagger, BigramTagger, TrigramTagger],
                          backoff=df_tagger)
accuracy = bc_tagger.evaluate(test_sents)
print(f"Accuracy of the backoff chain tagger: {accuracy}\n")

# Saving pickle
# Persist the trained chain so later runs can load it instead of retraining.
with open('pickles/pos-taggers/backoff_chain_tagger.pickle', 'wb') as file:
    pickle.dump(bc_tagger, file)

# Affix tagger: context is either the prefix or the suffix
# NOTE(review): constructed with defaults -- assumes NLTK's default
# 3-character suffix context; confirm against the AffixTagger docs.
af_tagger = AffixTagger(train_sents)
accuracy = af_tagger.evaluate(test_sents)
print(f"Accuracy of the affix tagger: {accuracy}\n")


# Brill Tagging
def train_brill_tagger(initial_tagger, training, **kwargs):
    """
        Function to train a brill tagger. Uses rules to correct the results of a tagger
    """
    # NOTE(review): this definition is truncated in this chunk -- the
    # ``templates`` list below is never closed and the trainer call that
    # presumably follows is not visible here.
    templates = [
        brill.Template(brill.Pos([-1])),  # tag of the preceding token
        brill.Template(brill.Pos([1])),  # tag of the following token
        brill.Template(brill.Pos([-2])),  # tag two tokens back
        brill.Template(brill.Pos([2])),  # tag two tokens ahead
        brill.Template(brill.Pos([-2, -1])),  # tags of both preceding tokens
Ejemplo n.º 10
0
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Overlapping splits: sentences 2000+ for testing, first 7000 for training.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]

# Positive affix_length means the context is a word *prefix* (4 chars here).
prefixtag = AffixTagger(training, affix_length=4)
print(prefixtag.evaluate(testing))
Ejemplo n.º 11
0
def train_speech_recognition():
    """Train the speech-recognition POS tagger and cache it.

    Reads ``corpus_path`` (a UTF-8 file of ``word<TAB>pos`` lines, with
    blank lines separating sentences), splits the sentences 90/10 into
    train/test sets, and builds a tagger chain:

        DefaultTagger -> AffixTagger(suffix lengths 1..4) -> UnigramTagger

    The final tagger is stored in the module-level ``unigram_tagger`` and
    ``cache_model()`` is called afterwards.  Relies on module-level
    ``corpus_path``, ``max_sent``, ``np``, ``codecs`` and the NLTK taggers.
    """
    global unigram_tagger

    # BUGFIX: the original format string was missing the closing quote
    # around the path ('Loading "{0}...').
    print('Loading "{0}"...'.format(corpus_path))
    corpus = []   # list of sentences; each sentence is [(word, pos), ...]
    ntoken = 0    # total tokens across good sentences
    n_bad = 0     # sentences discarded because of malformed lines
    with codecs.open(corpus_path, 'r', 'utf-8') as rdr:
        sent = []
        good = True
        for line0 in rdr:
            line = line0.strip()
            if not line:
                # A blank line terminates the current sentence.
                if good:
                    corpus.append(sent)
                    ntoken += len(sent)
                    if len(corpus) >= max_sent:
                        break
                else:
                    n_bad += 1
                good = True
                sent = []
            else:
                tx = line.split('\t')
                if len(tx) < 2:
                    # A malformed line poisons the whole sentence.
                    good = False
                else:
                    sent.append((tx[0].lower(), tx[1].lower()))
        else:
            # BUGFIX: keep the trailing sentence when the file does not
            # end with a blank line (it was silently dropped before).
            if good and sent and len(corpus) < max_sent:
                corpus.append(sent)
                ntoken += len(sent)

    print('done, {0} good sentences, {1} ntoken'.format(len(corpus), ntoken))
    # ----------------------------------------------------------------------

    # Random 90/10 train/test split over sentence indices.
    n_patterns = len(corpus)
    n_test = int(n_patterns * 0.1)
    n_train = n_patterns - n_test
    print('n_test={0} n_train={1}'.format(n_test, n_train))
    data_indices = list(range(n_patterns))
    np.random.shuffle(data_indices)
    test_corpus = [corpus[i] for i in data_indices[:n_test]]
    train_corpus = [corpus[i] for i in data_indices[n_test:]]

    # ----------------------------------------------------------------------

    # Unknown words default to u'СУЩЕСТВИТЕЛЬНОЕ' (Russian for "noun").
    tagger = DefaultTagger(u'СУЩЕСТВИТЕЛЬНОЕ')

    # Suffix taggers of increasing length (1..4 characters), each backing
    # off to the previous stage.  NOTE: the original also evaluated every
    # intermediate tagger and discarded the result; that dead work is
    # removed here (the commented-out debug prints referenced it).
    for affix_length in (-1, -2, -3, -4):
        tagger = AffixTagger(train_corpus,
                             affix_length=affix_length,
                             backoff=tagger)

    # Final stage: unigram lookup backed by the whole affix chain.
    unigram_tagger = UnigramTagger(train_corpus, backoff=tagger)
    # print(unigram_tagger.tag(word_tokenize("погода на завтра в Одессе")))
    acc = unigram_tagger.evaluate(test_corpus)  # kept for debug prints
    # print( 'UnigramTagger+AffixTagger(4,3,2,1) accuracy={0}\n'.format(acc) )
    cache_model()
Ejemplo n.º 12
0
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Overlapping train/test slices of the treebank corpus.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]

# Start from a 4-character prefix tagger, then stack a 3-character prefix
# tagger and 3- and 4-character suffix taggers on top, printing the
# accuracy after each addition (same chain as writing each stage out).
suffixtagger4 = AffixTagger(training, affix_length=4)
for length in (3, -3, -4):
    suffixtagger4 = AffixTagger(training, affix_length=length,
                                backoff=suffixtagger4)
    print(suffixtagger4.evaluate(testing))
Ejemplo n.º 13
0
 def __init__(self, train=None, model=None, affix_length=-3,
              min_stem_length=2, backoff=None, cutoff=0, verbose=False,
              H_param=0):
     """Initialise the tagger, forwarding standard AffixTagger arguments.

     ``H_param`` is specific to this subclass and is stored on the
     instance; its semantics are not visible in this chunk --
     NOTE(review): confirm against the rest of the class.
     """
     self.H_param = H_param
     # Explicit base-class call (not super()), arguments passed
     # positionally in AffixTagger's parameter order.
     AffixTagger.__init__(self, train, model, affix_length,
                          min_stem_length, backoff, cutoff, verbose)
Ejemplo n.º 14
0
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Overlapping train/test slices of the treebank corpus.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]

# Chain a 3-char prefix tagger and 3-/4-char suffix taggers on top of a
# 4-char prefix tagger, printing accuracy as each stage is added.
suffixtagger4 = AffixTagger(training, affix_length=4)
for length in (3, -3, -4):
    suffixtagger4 = AffixTagger(training, affix_length=length,
                                backoff=suffixtagger4)
    print(suffixtagger4.evaluate(testing))

Ejemplo n.º 15
0
def indivAffix(bambara, affix_length, backoff):
    """Train an AffixTagger on *bambara*'s training sentences, print its
    accuracy on the test sentences, and return the trained tagger."""
    tagger = AffixTagger(bambara.train_sents, min_stem_length=0,
                         affix_length=affix_length, backoff=backoff)
    print("Affix accuracy: ", tagger.evaluate(bambara.test_sents))
    return tagger