import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank
# Hold out every sentence from index 2000 onward for testing and train only
# on the sentences before it.  The previous slices ([2000:] for testing and
# [:7000] for training) overlapped, so the tagger was scored on sentences it
# had already seen during training and the reported accuracy was inflated.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:2000]
# A negative affix_length selects a suffix: here the last 3 characters.
suffixtag = AffixTagger(training, affix_length=-3)
print(suffixtag.evaluate(testing))
Ejemplo n.º 2
0
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank
# Hold out every sentence from index 2000 onward for testing and train only
# on the sentences before it.  The previous slices ([2000:] for testing and
# [:7000] for training) overlapped, so the tagger was scored on sentences it
# had already seen during training and the reported accuracy was inflated.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:2000]
# A positive affix_length selects a prefix: here the first 4 characters.
prefixtag = AffixTagger(training, affix_length=4)
print(prefixtag.evaluate(testing))
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank
# Hold out every sentence from index 2000 onward for testing and train only
# on the sentences before it.  The previous slices ([2000:] for testing and
# [:7000] for training) overlapped, so the tagger was scored on sentences it
# had already seen during training and the reported accuracy was inflated.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:2000]
# No affix_length is passed, so AffixTagger falls back to its library
# default (a 3-character suffix in NLTK).
affixtag = AffixTagger(training)
print(affixtag.evaluate(testing))
Ejemplo n.º 4
0
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank
# Hold out every sentence from index 2000 onward for testing and train only
# on the sentences before it.  The previous slices ([2000:] for testing and
# [:7000] for training) overlapped, so the tagger was scored on sentences it
# had already seen during training and the reported accuracy was inflated.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:2000]
# A positive affix_length selects a prefix: here the first 4 characters.
prefixtag = AffixTagger(training, affix_length=4)
print(prefixtag.evaluate(testing))
Ejemplo n.º 5
0
from nltk.tag import AffixTagger
from tag_util import train_sents, test_sents

# Train an affix tagger with the library's default settings on the
# train_sents imported from tag_util, then print the accuracy that
# evaluate() reports for it on test_sents.
tagger = AffixTagger(train_sents)
print(tagger.evaluate(test_sents))
Ejemplo n.º 6
0

# Build a tagger from the Unigram, Bigram and Trigram tagger classes with
# df_tagger supplied as the backoff (four taggers in total), then score it
# on test_sents.
# NOTE(review): make_backoffs and df_tagger are defined outside this view;
# presumably make_backoffs chains the classes in list order, each backing
# off to the previous one -- confirm against its definition.
bc_tagger = make_backoffs(train_sents,
                          [UnigramTagger, BigramTagger, TrigramTagger],
                          backoff=df_tagger)
accuracy = bc_tagger.evaluate(test_sents)
print(f"Accuracy of the backoff chain tagger: {accuracy}\n")

# Persist the trained backoff-chain tagger as a binary pickle file.
# The target directory is not created here, so it must already exist.
with open('pickles/pos-taggers/backoff_chain_tagger.pickle', 'wb') as file:
    pickle.dump(bc_tagger, file)

# Affix tagger: context is either the prefix or the suffix of the word.
# Trained with default settings and no backoff; `accuracy` is rebound to
# this tagger's score.
af_tagger = AffixTagger(train_sents)
accuracy = af_tagger.evaluate(test_sents)
print(f"Accuracy of the affix tagger: {accuracy}\n")


# Brill Tagging
def train_brill_tagger(initial_tagger, training, **kwargs):
    """
        Function to train a brill tagger. Uses rules to correct the results of a tagger
    """
    templates = [
        brill.Template(brill.Pos([-1])),
        brill.Template(brill.Pos([1])),
        brill.Template(brill.Pos([-2])),
        brill.Template(brill.Pos([2])),
        brill.Template(brill.Pos([-2, -1])),
        brill.Template(brill.Pos([1, 2])),
Ejemplo n.º 7
0
def _read_tagged_sentences(path, limit):
    """Read blank-line-separated sentences of ``word<TAB>tag`` lines.

    Every non-blank line contributes one lower-cased ``(word, tag)`` pair to
    the current sentence; fields after the second one are ignored.  A
    sentence that contains any line with fewer than two tab-separated
    fields is discarded as a whole.  Reading stops as soon as ``limit``
    sentences have been collected.

    :param path: path of the UTF-8 encoded corpus file.
    :param limit: number of sentences after which reading stops.
    :return: list of sentences, each a list of ``(word, tag)`` tuples.
    """
    sentences = []
    current = []
    well_formed = True

    with codecs.open(path, 'r', 'utf-8') as rdr:
        for raw_line in rdr:
            line = raw_line.strip()
            if line:
                fields = line.split('\t')
                if len(fields) < 2:
                    # One malformed line poisons the whole sentence.
                    well_formed = False
                else:
                    current.append((fields[0].lower(), fields[1].lower()))
                continue

            # Blank line: sentence boundary.  Runs of blank lines leave
            # ``current`` empty and must not produce empty sentences.
            if well_formed and current:
                sentences.append(current)
                if len(sentences) >= limit:
                    return sentences
            current = []
            well_formed = True

    # The file may end without a trailing blank line; keep the sentence
    # that was still being accumulated instead of silently dropping it.
    if well_formed and current:
        sentences.append(current)
    return sentences


def train_speech_recognition():
    """Train the module-level ``unigram_tagger`` from the tagged corpus.

    Reads at most ``max_sent`` sentences from ``corpus_path`` (both are
    module-level names defined outside this function), shuffles them with
    ``np.random.shuffle`` and trains on 90% of them; the first
    ``int(n * 0.1)`` shuffled sentences are held out and not trained on.

    The resulting model is a ``UnigramTagger`` that backs off through
    suffix taggers of length 4, 3, 2 and 1 to a ``DefaultTagger``.  It is
    stored in the global ``unigram_tagger`` and ``cache_model()`` is called
    afterwards (defined elsewhere; presumably it persists the model --
    confirm against its definition).

    Accuracy on the held-out sentences used to be computed after every
    stage and then thrown away; those evaluation passes were dropped since
    nothing consumed their result.  To measure accuracy, evaluate a tagger
    on ``[corpus[i] for i in order[:n_test]]``.
    """
    global unigram_tagger

    print('Loading "{0}...'.format(corpus_path))
    corpus = _read_tagged_sentences(corpus_path, max_sent)
    ntoken = sum(len(sentence) for sentence in corpus)
    print('done, {0} good sentences, {1} ntoken'.format(len(corpus), ntoken))

    n_patterns = len(corpus)
    n_test = int(n_patterns * 0.1)
    n_train = n_patterns - n_test
    print('n_test={0} n_train={1}'.format(n_test, n_train))

    # Shuffle sentence indices; the first n_test of them form the held-out
    # part, the remainder is the training set.
    order = list(range(n_patterns))
    np.random.shuffle(order)
    train_corpus = [corpus[i] for i in order[n_test:]]

    # Last resort of the backoff chain: tag everything with this constant
    # tag (the literal is the Russian word for "noun").
    tagger = DefaultTagger(u'СУЩЕСТВИТЕЛЬНОЕ')

    # Stack suffix taggers of growing length so that the longest suffix is
    # consulted first and each one backs off to the next shorter one.
    for suffix_len in range(1, 5):
        tagger = AffixTagger(train_corpus,
                             affix_length=-suffix_len,
                             backoff=tagger)

    # Whole-word lookup first, suffix chain for words it cannot tag.
    unigram_tagger = UnigramTagger(train_corpus, backoff=tagger)
    cache_model()
Ejemplo n.º 8
0
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank
# Hold out every sentence from index 2000 onward for testing and train only
# on the sentences before it.  The previous slices ([2000:] for testing and
# [:7000] for training) overlapped, so the taggers were scored on sentences
# they had already seen during training and the accuracies were inflated.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:2000]
# Build a backoff chain one link at a time, printing the accuracy after
# each added tagger.  Lookup order of the final tagger:
# 4-char suffix -> 3-char suffix -> 3-char prefix -> 4-char prefix.
prefixtagger = AffixTagger(training, affix_length=4)
prefixtagger3 = AffixTagger(training, affix_length=3, backoff=prefixtagger)
print(prefixtagger3.evaluate(testing))
suffixtagger3 = AffixTagger(training, affix_length=-3, backoff=prefixtagger3)
print(suffixtagger3.evaluate(testing))
suffixtagger4 = AffixTagger(training, affix_length=-4, backoff=suffixtagger3)
print(suffixtagger4.evaluate(testing))
Ejemplo n.º 9
0
import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank
# Hold out every sentence from index 2000 onward for testing and train only
# on the sentences before it.  The previous slices ([2000:] for testing and
# [:7000] for training) overlapped, so the taggers were scored on sentences
# they had already seen during training and the accuracies were inflated.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:2000]
# Build a backoff chain one link at a time, printing the accuracy after
# each added tagger.  Lookup order of the final tagger:
# 4-char suffix -> 3-char suffix -> 3-char prefix -> 4-char prefix.
prefixtagger = AffixTagger(training, affix_length=4)
prefixtagger3 = AffixTagger(training, affix_length=3, backoff=prefixtagger)
print(prefixtagger3.evaluate(testing))
suffixtagger3 = AffixTagger(training, affix_length=-3, backoff=prefixtagger3)
print(suffixtagger3.evaluate(testing))
suffixtagger4 = AffixTagger(training, affix_length=-4, backoff=suffixtagger3)
print(suffixtagger4.evaluate(testing))

Ejemplo n.º 10
0
def indivAffix(bambara, affix_length, backoff):
    """Train one AffixTagger, print its accuracy and return it.

    :param bambara: object exposing ``train_sents`` (used for training) and
        ``test_sents`` (used for the printed accuracy).
    :param affix_length: forwarded to ``AffixTagger`` unchanged.
    :param backoff: tagger forwarded to ``AffixTagger`` as its backoff.
    :return: the trained ``AffixTagger``.
    """
    # min_stem_length is pinned to 0 rather than left at the library default.
    tagger = AffixTagger(
        bambara.train_sents,
        affix_length=affix_length,
        min_stem_length=0,
        backoff=backoff,
    )
    accuracy = tagger.evaluate(bambara.test_sents)
    print("Affix accuracy: ", accuracy)
    return tagger