Example #1
def traintest_bigram_trigram_tagger(self):
    from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
    from nltk.corpus import treebank
    test_sents = treebank.tagged_sents()[3000:]
    train_sents = treebank.tagged_sents()[:3000]

    print('training bigram tagger')
    bitagger = BigramTagger(train_sents)
    print('evaluating bitagger')
    print(bitagger.evaluate(test_sents))

    print('training trigram tagger')
    tritagger = TrigramTagger(train_sents)
    print('evaluating tritagger')
    print(tritagger.evaluate(test_sents))
    print('tagging')
Example #2
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger


def lexical(tokens):
    print("\n")
    print("Step 2: Lexical Analysis\n")
    print("Essentially a dictionary lookup that obtains the properties of each word")
    print("Part-of-speech tagging")
    print("The tagset is:\n")

    tag = DefaultTagger('NN')
    tagg = UnigramTagger(train_sent, backoff=tag)
    tagger = BigramTagger(train_sent, backoff=tagg)

    tagtokens = tagger.tag(tokens)
    for token, tag in tagtokens:
        print(token + "->" + tag)
    print("\n")
    print("The accuracy of the trained POS tagger is:")
    print(tagger.evaluate(test_sents))

    return tagtokens
Example #3
unigramTagger = UnigramTagger(training, cutoff=2)  # cutoff=2: contexts seen no more than twice are discarded
# training happens in the constructor, same as calling tagger.train(training)

print('Unigram tagger accuracy:')
print(unigramTagger.evaluate(testing))

#-----------------------------------------------------

print('Bigram tagger accuracy:')

from nltk.tag import BigramTagger

bigramTagger = BigramTagger(training)

print(bigramTagger.evaluate(testing))

#-----------------------------------------------------
print('Trigram tagger accuracy:')

from nltk.tag import TrigramTagger
trigramTagger = TrigramTagger(training)

print(trigramTagger.evaluate(testing))
#-----------------------------------------------------

#Brill Tagger
from nltk.tag import brill, brill_trainer
# make sure you've got some train_sents!
#brill_tagger = train_brill_tagger(unigramTagger, training)
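train_brill_tagger is referenced above but never defined in this snippet; a minimal sketch of such a helper using NLTK's stock brill24 templates (the function body and the max_rules value are assumptions, not from the source):

def train_brill_tagger(initial_tagger, train_sents, max_rules=100):
    # candidate transformation templates: NLTK's standard 24-template set
    templates = brill.brill24()
    trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, trace=0)
    # learn up to max_rules error-correcting rules from the training data
    return trainer.train(train_sents, max_rules=max_rules)

# brill_tagger = train_brill_tagger(unigramTagger, training)
# print(brill_tagger.evaluate(testing))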
Example #4
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # make into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(sentence_count / 10)

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    for counter, part in enumerate(ten_parts):
        # use this chunk as the test set for this fold
        test_set = part

        # filter out this fold's test chunk from the training data
        training_set_lists = [x for x in ten_parts if x is not part]
        
        # flatten the remaining chunks into a single training list ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        # evaluate unigram tagger
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)
        
        # make bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        # evaluate bigram tagger
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)
        
        # make trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        # evaluate trigram tagger
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)
        
        # make 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        # evaluate the backoff chain
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)
        
        # make tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        # evaluate tnt tagger
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []
    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict
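A hedged usage sketch; the training-set path below is a placeholder, not from the source:

if __name__ == '__main__':
    # 'training_set.pos' is a placeholder path to a POS-tagged corpus file
    scores = cltk_pos_cv('training_set.pos', '~/cltk_data/user_data')
    print(scores['unigram']['mean'], scores['unigram']['sd'])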
Example #5
# print "Data is splitted!"

# Regular expression tagger
nn_cd_tagger = RegexpTagger([(r'^-?[0-9]+(\.[0-9]+)?$', 'PUNC'),
                             (r'.*', 'NOUN_NOM')])

# Unigram tagger
unigram_tagger = UnigramTagger(training_data, backoff=nn_cd_tagger)
print "Unigram accuracy: "
print unigram_tagger.evaluate(evaulation_data)

# Bigram tagger
bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)
print "Bigram accuracy: "
print bigram_tagger.evaluate(evaulation_data)

# Trigram tagger
trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger)
print "Trigram accuracy: "
print trigram_tagger.evaluate(evaulation_data)

# Brill tagger templates
templates = [
    Template(brill.Pos([1, 1])),
    Template(brill.Pos([2, 2])),
    Template(brill.Pos([1, 2])),
    Template(brill.Pos([1, 3])),
    Template(brill.Word([1, 1])),
    Template(brill.Word([2, 2])),
    Template(brill.Word([1, 2])),
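]

# The template list is cut off in the source; assuming it simply closes above,
# this sketch feeds it to NLTK's Brill trainer (max_rules=100 is an assumption).
from nltk.tag import brill_trainer

trainer = brill_trainer.BrillTaggerTrainer(trigram_tagger, templates, trace=0)
brill_tagger = trainer.train(training_data, max_rules=100)
print("Brill accuracy: ")
print(brill_tagger.evaluate(evaulation_data))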
Example #6
print("------------Trigram Tagger------------")
print(trigramTagger.tag(sent))

print("------------Brill Tagger------------")
print(brillTagger.tag(sent))

print("------------Accuracy: Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Unigram Tagger Trained with cutoff = 3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff = 3)
print(unigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Bigram Tagger Trained------------")
print(bigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Trigram Tagger Trained------------")
print(trigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Unigram Tagger with backoff enabled. Backoff Chain: UnigramTagger -> DefaultTagger------------")
unigramTagger = UnigramTagger(brown_train_sents, backoff=defaultTagger)
print(unigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Tagger with backoff enabled. Backoff Chain: TrigramTagger -> BigramTagger -> UnigramTagger -> DefaultTagger------------")
print(initialTagger.evaluate(brown_test_sents))

print("------------Accuracy: Brill Tagger------------")
print(brillTagger.evaluate(brown_test_sents))
print(brillTagger.rules())
Example #7
import nltk
from nltk.tag import BigramTagger, TrigramTagger
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]  # note: this slice overlaps the test set, inflating accuracy
bigramtag = BigramTagger(training)
print(bigramtag.evaluate(testing))
trigramtag = TrigramTagger(training)
print(trigramtag.evaluate(testing))
Example #8
from nltk.tag import BigramTagger as BigT
from nltk.tag import TrigramTagger as TriT

biTagger = BigT(train_sents)
print(biTagger.evaluate(test_sents))

triTagger = TriT(train_sents)
print(triTagger.evaluate(test_sents))

Example #9
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank
from tag_util import backoff_tagger

train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

bitagger = BigramTagger(train_sents)
print(bitagger.evaluate(test_sents))

tritagger = TrigramTagger(train_sents)
print(tritagger.evaluate(test_sents))

default_tagger = DefaultTagger('NN')
combined_tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=default_tagger)
print(combined_tagger.evaluate(test_sents))
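backoff_tagger is imported from the local tag_util module, which is not shown here; a minimal sketch consistent with how it is called (the real helper may differ):

def backoff_tagger(train_sents, tagger_classes, backoff=None):
    # train each tagger class in order, chaining the previous tagger as backoff
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff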

# # train
# default_tagger = DefaultTagger('NN')
#
# train_sents = treebank.tagged_sents()[:3000]
# tagger = UnigramTagger(train_sents, backoff=default_tagger)
#
# # test
# test_sents = treebank.tagged_sents()[3000:]
# print(tagger.evaluate(test_sents))
#
# # save to pickle
# import pickle
# with open('unitagger.pkl', 'wb') as output:
#     pickle.dump(tagger, output)
Example #10
print(rt.evaluate(test_data))

from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# testing performance of unigram tagger
print(ut.evaluate(test_data))
print(ut.tag(tokens))

# testing performance of bigram tagger
print(bt.evaluate(test_data))
print(bt.tag(tokens))

# testing performance of trigram tagger
print(tt.evaluate(test_data))
print(tt.tag(tokens))


def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff


ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)

print(ct.evaluate(test_data))
Example #11
from nltk.tag import UnigramTagger
from nltk.tag import TrigramTagger
from nltk.tag import BigramTagger
from nltk.tag import DefaultTagger

# we are dividing the data into a test and train to evaluate our taggers.
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
# unigram_tagger = UnigramTagger(train_data, backoff=regexp_tagger)
print(unigram_tagger.evaluate(test_data))
# 0.8361407355726104

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))
# 0.8452108043456593

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))
# 0.843317053722715

# Named entity recognition (NER) tagger
import nltk
from nltk import ne_chunk
from nltk import word_tokenize
sent = "Mark is studying at Stanford University in California"
print(ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False))
print(ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=True))
Example #12
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger
# we are dividing the data into a test and train to evaluate our taggers.
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

# Unigram picks the most likely tag for each word
#https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.UnigramTagger
unigram_tagger = UnigramTagger(train_data,backoff=default_tagger)
print("Unigram Tagger: {}".format(unigram_tagger.evaluate(test_data)))
# Bigram tags based on the current word and the previous one
#https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.BigramTagger
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print("Bigram Tagger: {}".format(bigram_tagger.evaluate(test_data)))
# Trigram uses the current word plus the two before it
#https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.TrigramTagger
trigram_tagger = TrigramTagger(train_data,backoff=bigram_tagger)
print("Trigram Tagger: {}".format(trigram_tagger.evaluate(test_data)))

''' What we did here was create three N-Gram taggers with a training set
    from the Brown corpus, which was already tagged.

    They were also chained together so that when one tagger does not know what
    to do, it falls back to its (N-1)-gram tagger, down to the default tagger
    that labels everything as NN.


    #######################
    ###  Regexp Tagger  ###
    #######################
Example #13
import nltk
from nltk.tag import BigramTagger
from nltk.corpus import treebank
training_1 = treebank.tagged_sents()[:7000]
bigramtagger = BigramTagger(training_1)
print(treebank.sents()[0])
print(bigramtagger.tag(treebank.sents()[0]))
testing_1 = treebank.tagged_sents()[2000:]  # note: overlaps the training slice, inflating accuracy
print(bigramtagger.evaluate(testing_1))
# print "Data is splitted!" 


# Regular expression tagger
nn_cd_tagger = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'PUNC'), (r'.*', 'NOUN_NOM')])

# Unigram tagger
unigram_tagger = UnigramTagger(training_data, backoff=nn_cd_tagger)
print "Unigram accuracy: "
print unigram_tagger.evaluate(evaulation_data)

# Bigram tagger 
bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)
print "Bigram accuracy: "
print bigram_tagger.evaluate(evaulation_data)

# Trigram tagger 
trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger)
print "Trigram accuracy: "
print trigram_tagger.evaluate(evaulation_data)

# Brill tagger templates
templates = [
    Template(brill.Pos([1, 1])),
    Template(brill.Pos([2, 2])),
    Template(brill.Pos([1, 2])),
    Template(brill.Pos([1, 3])),
    Template(brill.Word([1, 1])),
    Template(brill.Word([2, 2])),
    Template(brill.Word([1, 2])),
Example #15
def indivBigram(bambara, backoff):
    bigram = BigramTagger(bambara.train_sents, backoff=backoff)
    print("Bigram accuracy: ", bigram.evaluate(bambara.test_sents))
    return bigram

>>> default_tagger = nltk.DefaultTagger('NN')
>>> print(default_tagger.evaluate(brown_tagged_sents))

# N-gram taggers

>>> from nltk.tag import UnigramTagger
>>> from nltk.tag import DefaultTagger
>>> from nltk.tag import BigramTagger
>>> from nltk.tag import TrigramTagger
# we are dividing the data into test and train sets to evaluate our taggers.
>>> train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
>>> test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]
>>> unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
>>> print(unigram_tagger.evaluate(test_data))
>>> bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
>>> print(bigram_tagger.evaluate(test_data))
>>> trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
>>> print(trigram_tagger.evaluate(test_data))

# Regex tagger

>>> from nltk.tag.sequential import RegexpTagger
>>> regexp_tagger = RegexpTagger(
         [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
          (r'(The|the|A|a|An|an)$', 'AT'),   # articles
          (r'.*able$', 'JJ'),                # adjectives
          (r'.*ness$', 'NN'),                # nouns formed from adjectives
          (r'.*ly$', 'RB'),                  # adverbs
          (r'.*s$', 'NNS'),                  # plural nouns
          (r'.*ing$', 'VBG'),                # gerunds
          (r'.*ed$', 'VBD'),                 # past tense verbs
Example #17
accuracy = ugb_tagger.evaluate(test_sents)
print(f"Accuracy of backoff: {accuracy}\n")

# Saving pickle and testing it.
with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'wb') as file:
    pickle.dump(ugb_tagger, file)

with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'rb') as file:
    pk_tagger = pickle.load(file)

accuracy = pk_tagger.evaluate(test_sents)
print(f"Accuracy of pickled backoff: {accuracy}\n")

# Testing bigram and trigram taggers
bg_tagger = BigramTagger(train_sents)
accuracy = bg_tagger.evaluate(test_sents)
print(f"Accuracy of bigram: {accuracy}\n")

tg_tagger = TrigramTagger(train_sents)
accuracy = tg_tagger.evaluate(test_sents)
print(f"Accuracy of trigram: {accuracy}\n")


def make_backoffs(training, tagger_classes, backoff=None):
    """
        Function for training and make chains of backoff tagger
    """
    # Make a tagger using the previous one as a backoff
    for cls in tagger_classes:
        backoff = cls(training, backoff=backoff)
    return backoff
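
A usage sketch for make_backoffs; the chain order and the DefaultTagger('NN') fallback are assumptions that mirror the backoff chains used elsewhere on this page:

from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

chain = make_backoffs(train_sents,
                      [UnigramTagger, BigramTagger, TrigramTagger],
                      backoff=DefaultTagger('NN'))
accuracy = chain.evaluate(test_sents)
print(f"Accuracy of full backoff chain: {accuracy}\n")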
Example #18
X_test = tagged_sentences[int(len(tagged_sentences) * 0.8):]
'''
Question 2 - Performance of 0.13, 0.9 and 0.91
'''

# using only the default - NN - 0.1308
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.evaluate(tagged_sentences))

# Unigrams - 0.902
unigram_tagger = UnigramTagger(X_train)
print(unigram_tagger.evaluate(X_test))

# Bigrams with backoff of unigrams - 0.911
bigram_tagger = BigramTagger(X_train, backoff=unigram_tagger)
print(bigram_tagger.evaluate(X_test))
'''
Question 3 - Performance of 0.77 and 0.79
'''
treebank_tagged_sents = nltk.corpus.treebank.tagged_sents(tagset='universal')
print(default_tagger.evaluate(treebank_tagged_sents))
print(unigram_tagger.evaluate(treebank_tagged_sents))  # 0.77
print(bigram_tagger.evaluate(treebank_tagged_sents))  # 0.79
'''
Question 4-5 - F1 of 0.972 for brown dataset. Better performance
'''


# modified code
def word2features(sent, i):
    word = sent[i][0]
Example #19
print(rt.evaluate(test_data))
print(rt.tag(tokens))

# 3. N-GRAM TAGGERS:
#    Contiguous sequences of n items from a sequence of text or speech. Items can be words, phonemes,
#    letters, characters or syllables. Shingles: n-grams whose items are words.
#    UnigramTagger -> NGramTagger -> ContextTagger -> SequentialBackoffTagger

# Train the N-Gram taggers using the training_data (pre-tagged tokens, i.e. labeled observations)
ut = UnigramTagger(train=train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Test the performance of each N-Gram tagger
print("1-Gram Tagger Accuracy: {}".format(ut.evaluate(test_data)))
print("2-Gram Tagger Accuracy: {}".format(bt.evaluate(test_data)))
print("3-Gram Tagger Accuracy: {}".format(tt.evaluate(test_data)))

print("\n1-Gram tags:")
print(ut.tag(tokens))

print("\n2-Gram tags:")
print(bt.tag(tokens))

print("\n3-Gram tags:")
print(tt.tag(tokens))

# Note that the best accuracy comes from the 1-gram tagger: bigrams and trigrams seen during training
# often do not reappear identically in the test data (pairs of words are rarely paired the same way),
# so without a backoff the higher-order taggers leave those tokens untagged.
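
A sketch of the usual remedy, reusing ut from above: chain the taggers with backoff so unseen n-grams degrade to lower-order predictions instead of None:

bt_backoff = BigramTagger(train_data, backoff=ut)
tt_backoff = TrigramTagger(train_data, backoff=bt_backoff)
print("3-Gram Tagger with backoff accuracy: {}".format(tt_backoff.evaluate(test_data)))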
Example #20
print(default_tagger.evaluate(brown_tagged_sents))
# 0.13089484257215028

brown_tagged_sents2 = [[('The', 'AT'), ('Fulton', 'NP-TL'), ('manner', 'NN')]]
print(default_tagger.evaluate(brown_tagged_sents2))
# 0.3333333333333333

train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))
# 0.835841722316356

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))
# 0.8454101465164956

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))
# 0.8427190272102063

regexp_tagger = RegexpTagger(
    [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'(The|the|A|a|An|an)$', 'AT'),   # articles
     (r'.*able$', 'JJ'),                # adjectives
     (r'.*ness$', 'NN'),                # nouns formed from adjectives
     (r'.*ly$', 'RB'),                  # adverbs
     (r'.*s$', 'NNS'),                  # plural nouns
     (r'.*ing$', 'VBG'),                # gerunds
     (r'.*ed$', 'VBD'),                 # past tense verbs
Example #22
print(rt.tag(tokens))


## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))

print(bt.evaluate(test_data))
print(bt.tag(tokens))

print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

ct = combined_tagger(train_data=train_data, 
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)

print(ct.evaluate(test_data))