class TriGramTagger(object):
    """Trigram POS tagger with a chain of progressively simpler fallbacks.

    After ``train`` has been called, ``self.tagger`` holds a trigram tagger
    that backs off to a bigram tagger, then a unigram tagger, then an affix
    tagger, and finally a default tagger that labels every word ``'NN'``.
    """
    # NOTE(review): `implements` / `IPOSTagger` are defined outside this
    # block; presumably an interface declaration -- confirm against imports.
    implements(IPOSTagger)

    def __init__(self):
        # None means "not trained yet"; train() replaces it with the chain.
        self.tagger = None

    def train(self, sentence_list):
        """Build and train the backoff chain on ``sentence_list``.

        Every tagger in the chain except the default one is constructed
        from the same ``sentence_list``. The result is stored in
        ``self.tagger``; nothing is returned.
        """
        # Built from the most general tagger outwards, so that each new
        # tagger can name the previous one as its backoff.
        noun_fallback = DefaultTagger('NN')
        affix_fallback = AffixTagger(sentence_list, backoff=noun_fallback)
        unigram_fallback = UnigramTagger(sentence_list, backoff=affix_fallback)
        bigram_fallback = BigramTagger(sentence_list, backoff=unigram_fallback)
        self.tagger = TrigramTagger(sentence_list, backoff=bigram_fallback)

    def tag(self, words):
        """Return the trained tagger's result for ``words``.

        Raises:
            RuntimeError: if ``train`` has not been called yet. It is a
                subclass of ``Exception``, so existing ``except Exception``
                handlers keep working.
        """
        if self.tagger is None:
            raise RuntimeError("Trigram Tagger not trained.")
        return self.tagger.tag(words)
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# Train one stand-alone tagger per n-gram order (no backoff).
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(ut.evaluate(test_data))
print(ut.tag(tokens))

print(bt.evaluate(test_data))
print(bt.tag(tokens))

print(tt.evaluate(test_data))
print(tt.tag(tokens))


def combined_tagger(train_data, taggers, backoff=None):
    """Chain ``taggers`` so that each one backs off to the one before it.

    Args:
        train_data: training data passed unchanged to every tagger class.
        taggers: iterable of callables accepting ``(train_data, backoff=...)``,
            ordered from the last-resort tagger to the preferred one.
        backoff: tagger used as the backoff of the first entry (may be None).

    Returns:
        The tagger built from the last entry of ``taggers``, or ``backoff``
        itself when ``taggers`` is empty.
    """
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff


# `rt` is not defined in this part of the file; it is used as the final
# fallback of the chain.
ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)

print(ct.evaluate(test_data))
print(ct.tag(tokens))

from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger
def baseline_tagger(self):
    """Benchmark this object's tagger against NLTK's ``TrigramTagger``.

    Steps, all reported through ``print`` (nothing is returned):

    1. Read ``input.txt`` and tokenize it with ``word_tokenize``.
    2. Train an NLTK ``TrigramTagger`` on the first 55000 tagged sentences
       of the Brown corpus and tag the input with it, timing both phases.
    3. Train this object's own model (``tokenize``, ``init_tags``,
       ``init_words_tags``, ``init_dict``, ``calc_Q``, ``calc_R``) and tag
       the input with ``viterbi``, timing both phases.
    4. Compare timings, per-tag counts, untagged-token counts (tag ``None``
       for NLTK, ``''`` for this tagger) and accuracy against the gold tags
       read from ``input_tagged.txt`` (whitespace-separated ``word_TAG``
       tokens; the tag is the text after the last ``_``).

    Raises:
        AttributeError: if a token in ``input_tagged.txt`` contains no ``_``.
    """
    from nltk.corpus import brown
    from nltk.tag import TrigramTagger

    print("Number of words in Brown corpus: 1333212")
    print("Number of unique tags in Brown corpus: 474")

    # Context manager so the file handle is closed as soon as it is read.
    with open("input.txt", "r") as input_file:
        text = input_file.read()
    file_info = stat("input.txt")
    print("Size of test file: ", file_info.st_size)

    # NOTE(review): the loops below treat every element of sents_tokens as a
    # sentence (an iterable of tokens). `word_tokenize` is defined outside
    # this block -- confirm it returns per-sentence token lists rather than
    # one flat list of strings.
    sents_tokens = word_tokenize(text)
    # Counted once here and reused as the accuracy denominator further down.
    len_tokens = sum(1 for sent in sents_tokens for _ in sent)
    print("Number of tags to be tokenized: ", len_tokens)

    # --- NLTK: training and tagging time ---------------------------------
    t0 = time()
    tagger = TrigramTagger(brown.tagged_sents()[:55000])
    t1 = time()
    nltk_train_time = t1 - t0
    print("Time taken by NLTK for training: ", nltk_train_time)

    t0 = time()
    nltk_tags = [tagger.tag(sent) for sent in sents_tokens]
    t1 = time()
    nltk_tag_time = t1 - t0
    print("Time taken by NLTK to tag text: ", nltk_tag_time)

    # --- pos_tagger: training and tagging time ---------------------------
    t0 = time()
    self.tokenize()
    self.init_tags()
    self.init_words_tags()
    self.init_dict()
    self.calc_Q()
    self.calc_R()
    t1 = time()
    pos_train_time = t1 - t0
    print("Time taken by pos_tagger to train: ", pos_train_time)

    t0 = time()
    pos_tagger_tags = [self.viterbi(sent) for sent in sents_tokens]
    t1 = time()
    pos_tag_time = t1 - t0
    print("Time taken by pos_tagger to tag: ", pos_tag_time)

    # --- Timing comparison -----------------------------------------------
    train_time_gap = abs(nltk_train_time - pos_train_time)
    if nltk_train_time < pos_train_time:
        print("Training time of NLTK is less than pos_tagger by: ", train_time_gap)
    else:
        print("Training time of pos_tagger is less than NLTK by: ", train_time_gap)

    tag_time_gap = abs(nltk_tag_time - pos_tag_time)
    if nltk_tag_time < pos_tag_time:
        print("Tagging time of NLTK is less than pos_tagger by: ", tag_time_gap)
    else:
        print("Tagging time of pos_tagger is less than NLTK by: ", tag_time_gap)

    # --- Tag frequency tables --------------------------------------------
    nltk_tag_count = defaultdict(int)
    for tagged_sent in nltk_tags:
        for pair in tagged_sent:
            nltk_tag_count[pair[1]] += 1

    pos_tag_count = defaultdict(int)
    for tagged_sent in pos_tagger_tags:
        for pair in tagged_sent:
            pos_tag_count[pair[1]] += 1

    print("POS tags generated by NLTK: ")
    for item in nltk_tag_count.items():
        print(item)

    print("POS tags generated by pos_tagger: ")
    for item in pos_tag_count.items():
        print(item)

    # Unique-tag totals are printed BEFORE the None / '' lookups below:
    # indexing a defaultdict with a missing key inserts that key, which
    # would otherwise inflate these numbers.
    print("Number of unique tags generated by NLTK: ", len(nltk_tag_count))
    print("Number of unique tags generated by pos_tagger: ", len(pos_tag_count))

    nltk_untagged = nltk_tag_count[None]
    pos_untagged = pos_tag_count['']
    print("NLTK failed to tag", nltk_untagged, "tokens")
    print("pos_tagger failed to tag", pos_untagged, "tokens")

    untagged_gap = abs(nltk_untagged - pos_untagged)
    if nltk_untagged > pos_untagged:
        print("pos_tagger tagged", untagged_gap, "more tokens than NLTK")
    else:
        print("NLTK tagged", untagged_gap, "more tokens than pos_tagger")

    # --- Accuracy against the gold-standard file -------------------------
    with open("input_tagged.txt", "r") as gold_file:
        tagged_sents = gold_file.read().splitlines()

    # Greedy first group => the split happens at the LAST underscore.
    gold_pattern = re.compile('(.*)_(.*)')
    tags = []
    for sent in tagged_sents:
        for word in sent.split():
            # A token without '_' yields no match and raises AttributeError.
            tags.append(gold_pattern.search(word).group(2))

    # Predicted and gold tags are compared positionally; zip stops at the
    # shorter of the two sequences.
    n_tags = [pair[1] for tagged_sent in nltk_tags for pair in tagged_sent]
    nltk_count = sum(1 for x, y in zip(n_tags, tags) if x == y)
    # Empty input would otherwise divide by zero; report 0.0 instead.
    nltk_accuracy = float(nltk_count) / float(len_tokens) if len_tokens else 0.0
    print("NLTK accurately tagged", nltk_count, "tokens")
    print("NLTK accuracy score: ", nltk_accuracy)

    p_tags = [pair[1] for tagged_sent in pos_tagger_tags for pair in tagged_sent]
    pos_count = sum(1 for x, y in zip(p_tags, tags) if x == y)
    pos_accuracy = float(pos_count) / float(len_tokens) if len_tokens else 0.0
    print("pos_tagger accurately tagged", pos_count, "tokens")
    print("pos_tagger accuracy score: ", pos_accuracy)

    correct_gap = abs(nltk_count - pos_count)
    if nltk_count > pos_count:
        print("NLTK accurately tagged", correct_gap, "more tokens than pos_tagger")
    else:
        print("pos_tagger accurately tagged", correct_gap, "more tokens than NLTK")
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Report the accuracy of every N-Gram tagger on the test data
for _accuracy_template, _ngram_tagger in (
        ("1-Gram Tagger Accuracy: {}", ut),
        ("2-Gram Tagger Accuracy: {}", bt),
        ("3-Gram Tagger Accuracy: {}", tt),
):
    print(_accuracy_template.format(_ngram_tagger.evaluate(test_data)))

# Show the tags each N-Gram tagger assigns to the sample tokens
for _tags_heading, _ngram_tagger in (
        ("\n1-Gram tags:", ut),
        ("\n2-Gram tags:", bt),
        ("\n3-Gram tags:", tt),
):
    print(_tags_heading)
    print(_ngram_tagger.tag(tokens))

# Note that the 1-Gram tagger gives the best accuracy: the bigrams and
# trigrams observed in the training data are not guaranteed to occur in the
# same way in the testing data (e.g. pairs of words do not always appear
# paired in the same way)


# 4. TAGGER CHAINING WITH BACKOFF TAGGERS:

# Function to chain a set of taggers, with a backoff tagger as last resource
def combined_tagger(training_data, taggers, backoff=None):
    """Return the last of ``taggers``, each built with its predecessor as backoff.

    Every entry of ``taggers`` is called as
    ``entry(training_data, backoff=<previous>)``; the first one receives the
    ``backoff`` argument. With no entries, ``backoff`` is returned unchanged.
    """
    chained = backoff
    for tagger_factory in taggers:
        chained = tagger_factory(training_data, backoff=chained)
    return chained
tt = TrigramTagger(train_data)

# Test each tagger in turn (unigram, bigram, trigram): print a heading, the
# score on the test data, and the tags assigned to the sample sentence
for _section_heading, _section_tagger in (
        ('unigram tagger: ', ut),
        ('\nbigram tagger:', bt),
        ('\ntrigram tagger:', tt),
):
    print(_section_heading)
    print(_section_tagger.evaluate(test_data))
    print(_section_tagger.tag(nltk.word_tokenize(sentence)))

#%%
# combined tagger with a list of taggers and use a backoff tagger


def combined_tagger(train_data, taggers, backoff=None):
    """Stack ``taggers`` into one backoff chain and return its outermost tagger.

    Each entry is called as ``entry(train_data, backoff=<tagger built so far>)``,
    starting from the ``backoff`` argument; an empty ``taggers`` returns
    ``backoff`` as given.
    """
    outermost = backoff
    for make_tagger in taggers:
        outermost = make_tagger(train_data, backoff=outermost)
    return outermost


ct = combined_tagger(
    train_data=train_data,
    taggers=[UnigramTagger, BigramTagger, TrigramTagger],
    backoff=rt,
)

# evaluating the new combined tagger with backoff taggers
print(unigramTagger.tag(sent))

print("------------Unigram Tagger Trained------------")
# Each configuration is trained once and kept under its own name so the
# accuracy section below can reuse it instead of retraining an identical
# tagger from the same data.
trainedUnigramTagger = UnigramTagger(brown_train_sents)
unigramTagger = trainedUnigramTagger
print(unigramTagger.tag(sent))

# cutoff: The number of instances of training data the tagger must see in
# order not to use the backoff tagger
print("------------Unigram Tagger Trained with cutoff=3------------")
cutoffUnigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
unigramTagger = cutoffUnigramTagger
print(unigramTagger.tag(sent))

print("------------Bigram Tagger------------")
print(bigramTagger.tag(sent))

print("------------Trigram Tagger------------")
print(trigramTagger.tag(sent))

print("------------Brill Tagger------------")
print(brillTagger.tag(sent))

print("------------Accuracy: Unigram Tagger Trained------------")
unigramTagger = trainedUnigramTagger
print(unigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Unigram Tagger Trained with cutoff = 3------------")
# Rebinding keeps `unigramTagger` pointing at the cutoff=3 tagger afterwards,
# exactly as before.
unigramTagger = cutoffUnigramTagger
print(unigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Bigram Tagger Trained------------")
print(bigramTagger.evaluate(brown_test_sents))