"""Train and evaluate a suffix-based AffixTagger on the Penn Treebank sample."""
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Read the tagged corpus once; the original called tagged_sents() twice.
tagged = treebank.tagged_sents()

# NOTE(review): the original slices (training=[:7000], testing=[2000:])
# overlapped, so the tagger was evaluated partly on its own training data,
# inflating accuracy. Disjoint slices give an honest held-out evaluation
# (the NLTK treebank sample has ~3900 sentences, so [:7000] was "everything").
training = tagged[:2000]
testing = tagged[2000:]

# affix_length=-3: the context is each word's last three characters (a suffix).
suffixtag = AffixTagger(training, affix_length=-3)
print(suffixtag.evaluate(testing))
"""Train and evaluate a prefix-based AffixTagger on the Penn Treebank sample."""
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Read the tagged corpus once; the original called tagged_sents() twice.
tagged = treebank.tagged_sents()

# NOTE(review): the original slices (training=[:7000], testing=[2000:])
# overlapped, so accuracy was measured partly on training data. Disjoint
# slices give an honest held-out evaluation.
training = tagged[:2000]
testing = tagged[2000:]

# A positive affix_length (+4) makes the context each word's first four
# characters, i.e. a prefix tagger.
prefixtag = AffixTagger(training, affix_length=4)
print(prefixtag.evaluate(testing))
"""Train and evaluate a default AffixTagger on the Penn Treebank sample."""
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Read the tagged corpus once; the original called tagged_sents() twice.
tagged = treebank.tagged_sents()

# NOTE(review): the original slices (training=[:7000], testing=[2000:])
# overlapped, so the tagger was scored partly on sentences it was trained on.
# Disjoint slices give an honest held-out evaluation.
training = tagged[:2000]
testing = tagged[2000:]

# With no explicit affix_length, AffixTagger defaults to a 3-character suffix.
affixtag = AffixTagger(training)
print(affixtag.evaluate(testing))
"""Train and evaluate a 4-character-prefix AffixTagger on the Penn Treebank sample."""
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Read the tagged corpus once instead of calling tagged_sents() twice.
tagged = treebank.tagged_sents()

# NOTE(review): training=[:7000] overlapped testing=[2000:], so part of the
# test set was also training data and the reported accuracy was inflated.
# Use disjoint slices for a genuine held-out score.
training = tagged[:2000]
testing = tagged[2000:]

# affix_length=4 (positive): context is each word's first four characters.
prefixtag = AffixTagger(training, affix_length=4)
print(prefixtag.evaluate(testing))
from nltk.tag import AffixTagger
from tag_util import train_sents, test_sents

# Train an affix tagger on the shared training sentences (default context:
# a 3-character suffix) and report its accuracy on the held-out sentences.
tagger = AffixTagger(train_sents)
accuracy = tagger.evaluate(test_sents)
print(accuracy)
# Testing the function with all 4 taggers bc_tagger = make_backoffs(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=df_tagger) accuracy = bc_tagger.evaluate(test_sents) print(f"Accuracy of the backoff chain tagger: {accuracy}\n") # Saving pickle with open('pickles/pos-taggers/backoff_chain_tagger.pickle', 'wb') as file: pickle.dump(bc_tagger, file) # Affix tagger: context is either the prefix or the suffix af_tagger = AffixTagger(train_sents) accuracy = af_tagger.evaluate(test_sents) print(f"Accuracy of the affix tagger: {accuracy}\n") # Brill Tagging def train_brill_tagger(initial_tagger, training, **kwargs): """ Function to train a brill tagger. Uses rules to correct the results of a tagger """ templates = [ brill.Template(brill.Pos([-1])), brill.Template(brill.Pos([1])), brill.Template(brill.Pos([-2])), brill.Template(brill.Pos([2])), brill.Template(brill.Pos([-2, -1])), brill.Template(brill.Pos([1, 2])),
def train_speech_recognition():
    """Train a POS-tagging backoff chain (suffix 1..4 + unigram) from a TSV corpus.

    Reads `corpus_path` (one "word<TAB>pos" token per line, blank line between
    sentences), splits the sentences 90/10 into train/test at random, trains a
    chain of suffix AffixTaggers backed off to a DefaultTagger, tops it with a
    UnigramTagger, publishes the result in the module-global `unigram_tagger`,
    and finally calls `cache_model()`.

    NOTE(review): depends on module-level names not visible in this chunk:
    `corpus_path`, `max_sent`, `codecs`, `np`, `DefaultTagger`, `AffixTagger`,
    `UnigramTagger`, `cache_model` — confirm they are defined/imported above.
    """
    global unigram_tagger
    # NOTE(review): the format string has an unbalanced quote ('Loading "{0}...');
    # cosmetic only — probably meant 'Loading "{0}"...'.
    print('Loading "{0}...'.format(corpus_path))
    corpus = []   # accepted sentences; each is a list of (word, pos) pairs
    ntoken = 0    # total tokens across accepted sentences
    n_bad = 0     # sentences dropped because some line lacked a TAB-separated tag
    with codecs.open(corpus_path, 'r', 'utf-8') as rdr:
        sent = []
        good = True
        for line0 in rdr:
            line = line0.strip()
            if len(line) == 0:
                # Blank line terminates the current sentence.
                if good:
                    corpus.append(sent)
                    ntoken += len(sent)
                    if len(corpus) >= max_sent:
                        break  # cap the corpus at max_sent sentences
                else:
                    n_bad += 1
                good = True
                sent = []
            else:
                tx = line.split('\t')
                if len(tx) < 2:
                    # Malformed token line: discard the whole sentence.
                    good = False
                else:
                    word = tx[0].lower()
                    pos = tx[1].lower()
                    sent.append((word, pos))
    print('done, {0} good sentences, {1} ntoken'.format(len(corpus), ntoken))
    # ----------------------------------------------------------------------
    # Random 90/10 train/test split over sentence indices.
    n_patterns = len(corpus)
    n_test = int(n_patterns * 0.1)
    n_train = n_patterns - n_test
    print('n_test={0} n_train={1}'.format(n_test, n_train))
    data_indeces = [x for x in range(n_patterns)]
    np.random.shuffle(data_indeces)
    test_indeces = data_indeces[:n_test]
    train_indeces = data_indeces[n_test:]
    train_corpus = [corpus[i] for i in train_indeces]
    test_corpus = [corpus[i] for i in test_indeces]
    # ----------------------------------------------------------------------
    # Fallback tag for unknown words (Russian for "noun").
    # NOTE(review): the literal looks misspelled ('СУЩЕСТВРТЕЛЬНОЕ' vs
    # 'СУЩЕСТВИТЕЛЬНОЕ') but it is a runtime tag value, so it is left
    # untouched here — verify against the tagset used by the corpus.
    default_tagger = DefaultTagger(u'СУЩЕСТВРТЕЛЬНОЕ')
    # ----------------------------------------------------------------------
    # Backoff chain: each longer-suffix tagger defers to the shorter one.
    # print( 'Training AffixTagger on 1-suffixes...' )
    suffix1_tagger = AffixTagger(train_corpus, affix_length=-1, backoff=default_tagger)
    # print( 'Testing...' )
    acc = suffix1_tagger.evaluate(test_corpus)
    # print( 'AffixTagger(1) accuracy={0}\n'.format(acc) )
    # ----------------------------------------------------------------------
    # print( 'Training AffixTagger on 2-suffixes...' )
    suffix2_tagger = AffixTagger(train_corpus, affix_length=-2, backoff=suffix1_tagger)
    # print( 'Testing...' )
    acc = suffix2_tagger.evaluate(test_corpus)
    # print( 'AffixTagger(2,1) accuracy={0}\n'.format(acc) )
    # ----------------------------------------------------------------------
    # print( 'Training AffixTagger on 3-suffixes...' )
    suffix3_tagger = AffixTagger(train_corpus, affix_length=-3, backoff=suffix2_tagger)
    # print( 'Testing...' )
    acc = suffix3_tagger.evaluate(test_corpus)
    # print( 'AffixTagger(3,2,1) accuracy={0}\n'.format(acc) )
    # ----------------------------------------------------------------------
    # print( 'Training AffixTagger on 4,3,2-suffixes...' )
    suffix4_tagger = AffixTagger(train_corpus, affix_length=-4, backoff=suffix3_tagger)
    # print( 'Testing...' )
    acc = suffix4_tagger.evaluate(test_corpus)
    # print( 'AffixTagger(4,3,2) accuracy={0}\n'.format(acc) )
    # ----------------------------------------------------------------------
    # Unigram lookup on top of the whole suffix chain; the result is published
    # through the module-global `unigram_tagger` declared above.
    # print( 'Testing UnigramTagger + AffixTagger(4,3,2,1)...' )
    unigram_tagger = UnigramTagger(train_corpus, backoff=suffix4_tagger)
    # print(unigram_tagger.tag(word_tokenize("погода на завтра в Одессе")))
    acc = unigram_tagger.evaluate(test_corpus)
    # print( 'UnigramTagger+AffixTagger(4,3,2,1) accuracy={0}\n'.format(acc) )
    cache_model()
"""Chain prefix and suffix AffixTaggers via backoff and evaluate each stage."""
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Read the tagged corpus once; the original called tagged_sents() twice.
tagged = treebank.tagged_sents()

# NOTE(review): the original slices (training=[:7000], testing=[2000:])
# overlapped, so every stage was evaluated partly on its own training data.
# Disjoint slices give honest held-out scores.
training = tagged[:2000]
testing = tagged[2000:]

# Backoff chain, evaluated as it grows:
# 4-char prefix -> 3-char prefix -> 3-char suffix -> 4-char suffix.
prefixtagger = AffixTagger(training, affix_length=4)
prefixtagger3 = AffixTagger(training, affix_length=3, backoff=prefixtagger)
print(prefixtagger3.evaluate(testing))

suffixtagger3 = AffixTagger(training, affix_length=-3, backoff=prefixtagger3)
print(suffixtagger3.evaluate(testing))

suffixtagger4 = AffixTagger(training, affix_length=-4, backoff=suffixtagger3)
print(suffixtagger4.evaluate(testing))
"""Build a prefix/suffix AffixTagger backoff chain and score each stage."""
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank

# Load the corpus once instead of calling tagged_sents() twice.
tagged = treebank.tagged_sents()

# NOTE(review): training=[:7000] overlapped testing=[2000:], so the reported
# accuracies were inflated by evaluating on training sentences. Disjoint
# slices fix the methodology.
training = tagged[:2000]
testing = tagged[2000:]

# Each tagger backs off to the previous one:
# prefix(4) <- prefix(3) <- suffix(3) <- suffix(4).
prefixtagger = AffixTagger(training, affix_length=4)
prefixtagger3 = AffixTagger(training, affix_length=3, backoff=prefixtagger)
print(prefixtagger3.evaluate(testing))

suffixtagger3 = AffixTagger(training, affix_length=-3, backoff=prefixtagger3)
print(suffixtagger3.evaluate(testing))

suffixtagger4 = AffixTagger(training, affix_length=-4, backoff=suffixtagger3)
print(suffixtagger4.evaluate(testing))
def indivAffix(bambara, affix_length, backoff):
    """Train one AffixTagger on the Bambara corpus and report its accuracy.

    Builds an AffixTagger over ``bambara.train_sents`` with the requested
    affix length (negative = suffix, positive = prefix) and the supplied
    backoff tagger, prints its accuracy on ``bambara.test_sents``, and
    returns the trained tagger.
    """
    trained = AffixTagger(
        bambara.train_sents,
        min_stem_length=0,          # accept even very short stems
        affix_length=affix_length,
        backoff=backoff,
    )
    score = trained.evaluate(bambara.test_sents)
    print("Affix accuracy: ", score)
    return trained