def train(self, sentence_list):
    """Train a Brill tagger seeded by a cascade of backoff taggers.

    The cascade (most general first) is DefaultTagger('NN') ->
    AffixTagger -> UnigramTagger -> BigramTagger -> TrigramTagger, each
    trained on *sentence_list* and falling back to its predecessor.
    A FastBrillTaggerTrainer then learns up to 100 transformation
    rules (minimum score 3) on top of the trigram tagger and the
    result is stored on ``self.tagger``.
    """
    # Build the backoff chain; every tagger falls back to the one before it.
    fallback = DefaultTagger('NN')
    for tagger_cls in (AffixTagger, UnigramTagger, BigramTagger, TrigramTagger):
        fallback = tagger_cls(sentence_list, backoff=fallback)

    # Symmetric templates for tag- and word-context rules at four window
    # sizes, followed by two asymmetric (one-left, one-right) templates.
    windows = ((1, 1), (2, 2), (1, 2), (1, 3))
    templates = [
        brill.SymmetricProximateTokensTemplate(rule, bounds)
        for rule in (brill.ProximateTagsRule, brill.ProximateWordsRule)
        for bounds in windows
    ]
    templates.append(
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)))
    templates.append(
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)))

    trainer = brill.FastBrillTaggerTrainer(fallback, templates)
    self.tagger = trainer.train(sentence_list, max_rules=100, min_score=3)
def test_affix_tagger(self):
    """Round-trip an AffixTagger through the codec and compare state."""
    original = AffixTagger(self.corpus, backoff=self.default_tagger)
    decoded = self.decoder.decode(self.encoder.encode(original))

    # The reprs cover the tagger itself and its backoff chain.
    self.assertEqual(repr(original), repr(decoded))
    self.assertEqual(repr(original.backoff), repr(decoded.backoff))

    # Internal model state must survive encoding unchanged.
    for attr in ('_affix_length', '_min_word_length', '_context_to_tag'):
        self.assertEqual(getattr(original, attr), getattr(decoded, attr))
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Held-out evaluation split and training split from the treebank corpus.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]

# A negative affix_length makes this a suffix tagger (last 3 characters).
suffix_tagger = AffixTagger(training, affix_length=-3)
print(suffix_tagger.evaluate(testing))
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Evaluation and training splits of the treebank tagged sentences.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]

# Default AffixTagger settings (NLTK's default is a 3-character suffix model).
affix_tagger = AffixTagger(training)
print(affix_tagger.evaluate(testing))
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Split the treebank into an evaluation tail and a training head.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]

# A positive affix_length keys tags on word prefixes (first 4 characters).
prefix_tagger = AffixTagger(training, affix_length=4)
print(prefix_tagger.evaluate(testing))
from nltk.tag import AffixTagger
from tag_util import train_sents, test_sents

# Train an affix tagger with default settings and report its accuracy
# on the held-out sentences supplied by tag_util.
affix_tagger = AffixTagger(train_sents)
print(affix_tagger.evaluate(test_sents))
# Re-tag every split with update_tags (defined elsewhere in this file;
# presumably it normalises the tagset — TODO confirm against its definition).
train_sents = update_tags(train_sents)
val_sents = update_tags(val_sents)
test_sents = update_tags(test_sents)
"""
# =============================================================================
# finalise a sequential tagger
# =============================================================================
"""
"""
1. run tagger with different corpus size (50% and 100%)
"""
# backoff tagger
tag1_eval = dict()  # metrics for this tagger chain (train_time, test_accuracy)
# train with backoff and Brill
tic()  # start wall-clock timer (tic/toc are defined elsewhere in this file)
# Backoff cascade, assembled most-general-first: an unconditional 'NO' tag,
# then suffix taggers of length 1..5, then a unigram tagger (cutoff=3 skips
# rare words), then bigram and trigram context taggers.
tag1_tagger = DefaultTagger('NO')
tag1_tagger = AffixTagger(train_sents, affix_length=-1, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-2, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-3, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-4, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-5, backoff=tag1_tagger)
tag1_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag1_tagger)
tag1_tagger = BigramTagger(train_sents, backoff=tag1_tagger)
tag1_tagger = TrigramTagger(train_sents, backoff=tag1_tagger)
# Layer a Brill transformation-rule tagger (at most 100 rules) on top of
# the cascade; train_brill_tagger is defined elsewhere in this file.
tag1b_tagger = train_brill_tagger(tag1_tagger, train_sents, True, max_rules=100)
tag1_eval['train_time'] = toc()
# test
tic()
# Accuracy is measured on the validation split, not the test split.
tag1_eval['test_accuracy'] = tag1b_tagger.evaluate(val_sents)
def train(self, sentence_list):
    """Train a trigram tagger backed by a cascade of simpler taggers.

    The backoff order (most general first) is DefaultTagger('NN') ->
    AffixTagger -> UnigramTagger -> BigramTagger -> TrigramTagger; the
    resulting trigram tagger is stored on ``self.tagger``.
    """
    fallback = DefaultTagger('NN')
    # Each tagger is trained on the same sentences and falls back to
    # the previously built, more general tagger.
    for tagger_cls in (AffixTagger, UnigramTagger, BigramTagger):
        fallback = tagger_cls(sentence_list, backoff=fallback)
    self.tagger = TrigramTagger(sentence_list, backoff=fallback)
    return backoff  # NOTE(review): tail of a function whose header lies above this chunk

# Testing the function with all 4 taggers
bc_tagger = make_backoffs(train_sents,
                          [UnigramTagger, BigramTagger, TrigramTagger],
                          backoff=df_tagger)
accuracy = bc_tagger.evaluate(test_sents)
print(f"Accuracy of the backoff chain tagger: {accuracy}\n")
# Saving pickle
with open('pickles/pos-taggers/backoff_chain_tagger.pickle', 'wb') as file:
    pickle.dump(bc_tagger, file)
# Affix tagger: context is either the prefix or the suffix
af_tagger = AffixTagger(train_sents)
accuracy = af_tagger.evaluate(test_sents)
print(f"Accuracy of the affix tagger: {accuracy}\n")

# Brill Tagging
def train_brill_tagger(initial_tagger, training, **kwargs):
    """
    Function to train a brill tagger. Uses rules to correct the results of a tagger
    """
    # Transformation templates keyed on neighbouring POS tags at various
    # offsets. NOTE(review): this list continues past the end of this chunk.
    templates = [
        brill.Template(brill.Pos([-1])),
        brill.Template(brill.Pos([1])),
        brill.Template(brill.Pos([-2])),
        brill.Template(brill.Pos([2])),
        brill.Template(brill.Pos([-2, -1])),
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

sents = treebank.tagged_sents()
testing = sents[2000:]    # evaluation split
training = sents[:7000]   # training split

# affix_length=4 trains on 4-character word prefixes.
prefix4_tagger = AffixTagger(training, affix_length=4)
print(prefix4_tagger.evaluate(testing))
def train_speech_recognition():
    """Load a tab-separated word/POS corpus, train a suffix-backoff
    unigram tagger, and cache the resulting model.

    Reads module-level globals ``corpus_path`` and ``max_sent``, writes
    the global ``unigram_tagger``, and finishes by calling the
    module-level ``cache_model()``.
    """
    global unigram_tagger
    print('Loading "{0}...'.format(corpus_path))
    corpus = []
    ntoken = 0
    n_bad = 0
    # Parse the corpus: sentences are separated by blank lines; each token
    # line is "word<TAB>pos". A sentence containing any malformed line is
    # dropped and counted in n_bad.
    with codecs.open(corpus_path, 'r', 'utf-8') as rdr:
        sent = []
        good = True
        for line0 in rdr:
            line = line0.strip()
            if len(line) == 0:
                # Blank line: end of sentence.
                if good:
                    corpus.append(sent)
                    ntoken += len(sent)
                    if len(corpus) >= max_sent:
                        break  # cap the corpus size
                else:
                    n_bad += 1
                good = True
                sent = []
            else:
                tx = line.split('\t')
                if len(tx) < 2:
                    good = False  # malformed line poisons the whole sentence
                else:
                    word = tx[0].lower()
                    pos = tx[1].lower()
                    sent.append((word, pos))
    print('done, {0} good sentences, {1} ntoken'.format(len(corpus), ntoken))
    # ----------------------------------------------------------------------
    # Random 90/10 train/test split over sentence indices.
    n_patterns = len(corpus)
    n_test = int(n_patterns * 0.1)
    n_train = n_patterns - n_test
    print('n_test={0} n_train={1}'.format(n_test, n_train))
    data_indeces = [x for x in range(n_patterns)]
    np.random.shuffle(data_indeces)
    test_indeces = data_indeces[:n_test]
    train_indeces = data_indeces[n_test:]
    train_corpus = [corpus[i] for i in train_indeces]
    test_corpus = [corpus[i] for i in test_indeces]
    # ----------------------------------------------------------------------
    # Fallback of last resort: tag everything as a noun (Russian tag label).
    default_tagger = DefaultTagger(u'СУЩЕСТВРТЕЛЬНОЕ')
    # ----------------------------------------------------------------------
    # Suffix-tagger cascade: 1-char suffix backed by the default tagger,
    # then 2-, 3- and 4-char suffix taggers, each backing off to the previous.
    # print( 'Training AffixTagger on 1-suffixes...' )
    suffix1_tagger = AffixTagger(train_corpus, affix_length=-1,
                                 backoff=default_tagger)
    # print( 'Testing...' )
    acc = suffix1_tagger.evaluate(test_corpus)
    # print( 'AffixTagger(1) accuracy={0}\n'.format(acc) )
    # ----------------------------------------------------------------------
    # print( 'Training AffixTagger on 2-suffixes...' )
    suffix2_tagger = AffixTagger(train_corpus, affix_length=-2,
                                 backoff=suffix1_tagger)
    # print( 'Testing...' )
    acc = suffix2_tagger.evaluate(test_corpus)
    # print( 'AffixTagger(2,1) accuracy={0}\n'.format(acc) )
    # ----------------------------------------------------------------------
    # print( 'Training AffixTagger on 3-suffixes...' )
    suffix3_tagger = AffixTagger(train_corpus, affix_length=-3,
                                 backoff=suffix2_tagger)
    # print( 'Testing...' )
    acc = suffix3_tagger.evaluate(test_corpus)
    # print( 'AffixTagger(3,2,1) accuracy={0}\n'.format(acc) )
    # ----------------------------------------------------------------------
    # print( 'Training AffixTagger on 4,3,2-suffixes...' )
    suffix4_tagger = AffixTagger(train_corpus, affix_length=-4,
                                 backoff=suffix3_tagger)
    # print( 'Testing...' )
    acc = suffix4_tagger.evaluate(test_corpus)
    # print( 'AffixTagger(4,3,2) accuracy={0}\n'.format(acc) )
    # ----------------------------------------------------------------------
    # Final model: unigram lookup backed by the full suffix cascade.
    # print( 'Testing UnigramTagger + AffixTagger(4,3,2,1)...' )
    unigram_tagger = UnigramTagger(train_corpus, backoff=suffix4_tagger)
    # print(unigram_tagger.tag(word_tokenize("погода на завтра в Одессе")))
    acc = unigram_tagger.evaluate(test_corpus)
    # print( 'UnigramTagger+AffixTagger(4,3,2,1) accuracy={0}\n'.format(acc) )
    # Persist the trained model (cache_model is defined elsewhere in this file).
    cache_model()
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

test_data = treebank.tagged_sents()[2000:]
train_data = treebank.tagged_sents()[:7000]

# Build a backoff chain of affix taggers and print the accuracy after
# each extension: 4-char prefix <- 3-char prefix <- 3-char suffix <- 4-char suffix.
prefix4 = AffixTagger(train_data, affix_length=4)
prefix3 = AffixTagger(train_data, affix_length=3, backoff=prefix4)
print(prefix3.evaluate(test_data))

suffix3 = AffixTagger(train_data, affix_length=-3, backoff=prefix3)
print(suffix3.evaluate(test_data))

suffix4 = AffixTagger(train_data, affix_length=-4, backoff=suffix3)
print(suffix4.evaluate(test_data))
def __init__(self, train=None, model=None, affix_length=-3,
             min_stem_length=2, backoff=None, cutoff=0, verbose=False,
             H_param=0):
    """Initialise the tagger, storing ``H_param`` before delegating the
    remaining arguments unchanged to ``AffixTagger.__init__``.
    """
    # Extra hyper-parameter specific to this subclass.
    self.H_param = H_param
    # Forward everything else to the parent class by keyword.
    AffixTagger.__init__(self, train=train, model=model,
                         affix_length=affix_length,
                         min_stem_length=min_stem_length,
                         backoff=backoff, cutoff=cutoff, verbose=verbose)
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

sents = treebank.tagged_sents()
testing = sents[2000:]
training = sents[:7000]

# Chain four affix taggers, each falling back to the previous one,
# and evaluate as suffix taggers are added on top of the prefix pair.
pre4 = AffixTagger(training, affix_length=4)
pre3 = AffixTagger(training, affix_length=3, backoff=pre4)
print(pre3.evaluate(testing))
suf3 = AffixTagger(training, affix_length=-3, backoff=pre3)
print(suf3.evaluate(testing))
suf4 = AffixTagger(training, affix_length=-4, backoff=suf3)
print(suf4.evaluate(testing))
def indivAffix(bambara, affix_length, backoff):
    """Train one AffixTagger on ``bambara.train_sents`` and return it.

    ``min_stem_length=0`` disables the minimum-stem restriction so every
    word contributes its affix. Accuracy on ``bambara.test_sents`` is
    printed as a side effect.
    """
    tagger = AffixTagger(bambara.train_sents, min_stem_length=0,
                         affix_length=affix_length, backoff=backoff)
    print("Affix accuracy: ", tagger.evaluate(bambara.test_sents))
    return tagger