Example #1
from nltk.tag import (AffixTagger, BigramTagger, DefaultTagger,
                      TrigramTagger, UnigramTagger)
from zope.interface import implements


class TriGramTagger(object):
    """Trigram tagger with a chain of backoff taggers.
    """

    # IPOSTagger is a project-specific zope.interface interface,
    # assumed to be defined elsewhere in this codebase.
    implements(IPOSTagger)

    def __init__(self):
        self.tagger = None

    def train(self, sentence_list):
        # Backoff chain: trigram -> bigram -> unigram -> affix -> default 'NN'
        noun_fallback = DefaultTagger('NN')
        affix_fallback = AffixTagger(sentence_list, backoff=noun_fallback)
        unigram_fallback = UnigramTagger(sentence_list, backoff=affix_fallback)
        bigram_fallback = BigramTagger(sentence_list, backoff=unigram_fallback)
        self.tagger = TrigramTagger(sentence_list, backoff=bigram_fallback)

    def tag(self, words):
        if not self.tagger:
            raise Exception("Trigram Tagger not trained.")
        return self.tagger.tag(words)
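A minimal usage sketch for this class, assuming NLTK's bundled treebank corpus as training data (the corpus choice and split size are illustrative, not part of the original project):

from nltk.corpus import treebank

tagger = TriGramTagger()
tagger.train(treebank.tagged_sents()[:3000])   # train the backoff chain
print(tagger.tag(['The', 'quick', 'brown', 'fox']))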
Example #2
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# train_data and test_data are tagged-sentence splits prepared earlier
# in the source; tokens is a tokenized example sentence
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))

print(bt.evaluate(test_data))
print(bt.tag(tokens))

print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

# rt is a fallback tagger defined earlier in the source, used as the
# last resort of the chain
ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)

print(ct.evaluate(test_data))
print(ct.tag(tokens))

from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger
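The snippet stops at these imports; a plausible continuation, sketching how ClassifierBasedPOSTagger is typically trained with a Naive Bayes classifier builder (the variable names follow the snippet; the two keyword arguments are standard NLTK API):

nbt = ClassifierBasedPOSTagger(train=train_data,
                               classifier_builder=NaiveBayesClassifier.train)
print(nbt.evaluate(test_data))
print(nbt.tag(tokens))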
Example #3
    def baseline_tagger(self):

        import re
        from collections import defaultdict
        from os import stat
        from time import time

        from nltk.corpus import brown
        from nltk.tag import TrigramTagger
        from nltk.tokenize import sent_tokenize, word_tokenize

        print("Number of words in Brown corpus: 1333212")
        print("Number of unique tags in Brown corpus: 474")

        f = open("input.txt", "r").read()

        file_info = stat("input.txt")

        print("Size of test file: ", file_info.st_size)

        # Split the raw text into sentences, then tokenize each sentence
        sents_tokens = [word_tokenize(s) for s in sent_tokenize(f)]
        print("Number of tokens to be tagged: ",
              len([j for i in sents_tokens for j in i]))

        t0 = time()
        tagger = TrigramTagger(brown.tagged_sents()[:55000])
        t1 = time()
        nltk_train_time = t1 - t0
        print("Time taken by NLTK for training: ", nltk_train_time)

        nltk_tags = []
        t0 = time()
        for sent in sents_tokens:
            nltk_tags.append(tagger.tag(sent))
        t1 = time()
        nltk_tag_time = t1 - t0
        print("Time taken by NLTK to tag text: ", nltk_tag_time)

        t0 = time()
        self.tokenize()
        self.init_tags()
        self.init_words_tags()
        self.init_dict()
        self.calc_Q()
        self.calc_R()
        t1 = time()
        pos_train_time = t1 - t0

        print("Time taken by pos_tagger to train: ", pos_train_time)

        pos_tagger_tags = []
        t0 = time()
        for sent in sents_tokens:
            pos_tagger_tags.append(self.viterbi(sent))
        t1 = time()
        pos_tag_time = t1 - t0
        print("Time taken by pos_tagger to tag: ", pos_tag_time)

        if nltk_train_time < pos_train_time:
            print("Training time of NLTK is less than pos_tagger by: ",
                  abs(nltk_train_time - pos_train_time))
        else:
            print("Training time of pos_tagger is less than NLTK by: ",
                  abs(nltk_train_time - pos_train_time))

        if nltk_tag_time < pos_tag_time:
            print("Tagging time of NLTK is less than pos_tagger by: ",
                  abs(nltk_tag_time - pos_tag_time))
        else:
            print("Tagging time of pos_tagger is less than NLTK by: ",
                  abs(nltk_tag_time - pos_tag_time))

        nltk_tag_count = defaultdict(int)
        for i in nltk_tags:
            for j in i:
                nltk_tag_count[j[1]] += 1

        pos_tag_count = defaultdict(int)
        for i in pos_tagger_tags:
            for j in i:
                pos_tag_count[j[1]] += 1

        print("POS tags generated by NLTK: ")
        for i in nltk_tag_count.items():
            print(i)

        print("POS tags generated by pos_tagger: ")
        for i in pos_tag_count.items():
            print(i)

        print("Number of unique tags generated by NLTK: ",
              len([i for i in nltk_tag_count.keys()]))

        print("Number of unique tags generated by pos_tagger: ",
              len([i for i in pos_tag_count.keys()]))

        print("NLTK failed to tag", nltk_tag_count[None], "tokens")

        print("pos_tagger failed to tag", pos_tag_count[''], "tokens")

        if nltk_tag_count[None] > pos_tag_count['']:
            print("pos_tagger tagged",
                  abs(nltk_tag_count[None] - pos_tag_count['']),
                  "more tokens than NLTK")
        else:
            print("NLTK tagged", abs(nltk_tag_count[None] - pos_tag_count['']),
                  "more tokens than pos_tagger")

        tagged_sents = open("input_tagged.txt", "r").read().splitlines()
        tags = []
        for sent in tagged_sents:
            words = sent.split()
            for word in words:
                m = re.search('(.*)_(.*)', word)
                tags.append(m.group(2))

        n_tags = [j[1] for i in nltk_tags for j in i]
        nltk_count = 0
        for x, y in zip(n_tags, tags):
            if x == y:
                nltk_count += 1

        len_tokens = len([j for i in sents_tokens for j in i])

        print("NLTK accurately tagged", nltk_count, "tokens")
        print("NLTK accuracy score: ", float(nltk_count) / float(len_tokens))

        p_tags = [j[1] for i in pos_tagger_tags for j in i]
        pos_count = 0
        for x, y in zip(p_tags, tags):
            if x == y:
                pos_count += 1

        print("pos_tagger accurately tagged", pos_count, "tokens")
        print("pos_tagger accuracy score: ",
              float(pos_count) / float(len_tokens))

        if nltk_count > pos_count:
            print("NLTK accurately tagged", abs(nltk_count - pos_count),
                  "more tokens than pos_tagger")
        else:
            print("pos_tagger accurately tagged", abs(nltk_count - pos_count),
                  "more tokens than NLTK")
Example #4
# ut, the unigram tagger, is assumed from earlier context (cf. Example #2)
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Test the performance of each N-Gram tagger
print("1-Gram Tagger Accuracy: {}".format(ut.evaluate(test_data)))
print("2-Gram Tagger Accuracy: {}".format(bt.evaluate(test_data)))
print("3-Gram Tagger Accuracy: {}".format(tt.evaluate(test_data)))

print("\n1-Gram tags:")
print(ut.tag(tokens))

print("\n2-Gram tags:")
print(bt.tag(tokens))

print("\n3-Gram tags:")
print(tt.tag(tokens))

# Note that the best accuracy here comes from the 1-gram tagger: the exact
# bigrams and trigrams seen during training rarely reappear verbatim in the
# test data, so the higher-order taggers often find no matching context.
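The sparsity is easy to see on an unseen sentence, where the trigram tagger has no matching context and yields None for every token (a sketch; the sentence is illustrative):

print(tt.tag(['An', 'entirely', 'unseen', 'sentence']))
# e.g. [('An', None), ('entirely', None), ('unseen', None), ('sentence', None)]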

# 4. TAGGER CHAINING WITH BACKOFF TAGGERS:


# Function to chain a set of taggers, with a backoff tagger as last resource
def combined_tagger(training_data, taggers, backoff=None):

    for tagger in taggers:
        backoff = tagger(training_data, backoff=backoff)

    return backoff
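For instance, the chain can end in a DefaultTagger so that no token is left untagged (a sketch; DefaultTagger('NN') as the last resort mirrors Example #1 and is not part of this snippet):

from nltk.tag import DefaultTagger

ct = combined_tagger(training_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=DefaultTagger('NN'))
print("Combined Tagger Accuracy: {}".format(ct.evaluate(test_data)))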
Example #5
# ut and bt (the unigram and bigram taggers) are defined earlier in the
# source, as are train_data, test_data and sentence
tt = TrigramTagger(train_data)

# testing performance of unigram tagger
print('unigram tagger: ')
print(ut.evaluate(test_data))
print(ut.tag(nltk.word_tokenize(sentence)))

# testing performance of bigram tagger
print('\nbigram tagger:')
print(bt.evaluate(test_data))
print(bt.tag(nltk.word_tokenize(sentence)))

# testing performance of trigram tagger
print('\ntrigram tagger:')
print(tt.evaluate(test_data))
print(tt.tag(nltk.word_tokenize(sentence)))


#%%
# combined tagger with a list of taggers and use a backoff tagger
def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff


ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)  # rt: a fallback tagger defined earlier in the source

# evaluating the new combined tagger with backoff taggers
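The snippet cuts off after this comment; the evaluation it announces presumably mirrors the earlier examples (a sketch):

print(ct.evaluate(test_data))
print(ct.tag(nltk.word_tokenize(sentence)))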
Example #6
# unigramTagger, bigramTagger, trigramTagger, brillTagger and sent are
# constructed earlier in the source; this first call uses the pre-existing
# unigram tagger instance
print(unigramTagger.tag(sent))

print("------------Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.tag(sent))

# cutoff: contexts seen fewer than this many times during training are
# discarded, so the tagger defers to its backoff for them
print("------------Unigram Tagger Trained with cutoff=3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.tag(sent))

print("------------Bigram Tagger------------")
print(bigramTagger.tag(sent))

print("------------Trigram Tagger------------")
print(trigramTagger.tag(sent))

print("------------Brill Tagger------------")
print(brillTagger.tag(sent))

print("------------Accuracy: Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Unigram Tagger Trained with cutoff = 3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff = 3)
print(unigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Bigram Tagger Trained------------")
print(bigramTagger.evaluate(brown_test_sents))
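brillTagger above comes from earlier context; for reference, a minimal sketch of how such a tagger is typically trained in NLTK (the initial tagger, template set, and rule count are illustrative choices):

from nltk.tag.brill import brill24
from nltk.tag.brill_trainer import BrillTaggerTrainer

trainer = BrillTaggerTrainer(unigramTagger, brill24(), trace=0)
brillTagger = trainer.train(brown_train_sents, max_rules=100)
print(brillTagger.evaluate(brown_test_sents))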