def get_tagger(lang):
    """Return a lazily-built POS tagger for English, Spanish or Catalan.

    Taggers are cached in the module-level globals ``eng_tagger``,
    ``spa_tagger`` and ``cat_tagger`` so the expensive loading/training
    happens at most once per process.

    Args:
        lang: "English" or "Spanish"; any other value selects Catalan.

    Returns:
        An NLTK tagger instance for the requested language.
    """
    if lang == "English":
        global eng_tagger
        if eng_tagger:
            return eng_tagger
        # Pre-trained treebank tagger shipped with NLTK data.
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        eng_tagger = load(_POS_TAGGER)
        return eng_tagger
    elif lang == "Spanish":
        global spa_tagger
        if spa_tagger:
            return spa_tagger
        spa_tagger = _train_cess_tagger(cess_esp.tagged_sents())
        return spa_tagger
    else:
        global cat_tagger
        if cat_tagger:
            return cat_tagger
        cat_tagger = _train_cess_tagger(cess_cat.tagged_sents())
        return cat_tagger


def _train_cess_tagger(training):
    """Train a trigram tagger backing off to bigram -> unigram -> 'NN'.

    Extracted because the Spanish and Catalan branches of get_tagger()
    previously duplicated this chain verbatim.
    """
    default_tagger = nltk.DefaultTagger('NN')
    unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
    bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
    return nltk.TrigramTagger(training, backoff=bigram_tagger)
def nltk_tagger(brown):
    """Tag token lists with a trigram tagger trained on the Brown corpus.

    Args:
        brown: iterable of sentences, each a list of tokens.

    Returns:
        List of sentences, each a list of 'word/TAG' strings; the first two
        and the last tokens of every tagged sentence are dropped (assumed
        boundary markers -- TODO confirm against the caller's format).
    """
    tagged = []
    training = nltkbrown.tagged_sents(tagset='universal')
    # Backoff chain: trigram -> bigram -> constant 'NOUN'.
    # BUG FIX: the original also trained standalone Unigram/Bigram/Trigram
    # taggers without backoff and immediately discarded them -- expensive
    # dead work removed.
    default_tagger = nltk.DefaultTagger('NOUN')
    bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
    # Tag sentences.
    tagged_sentence = []
    for sentence in brown:
        tagged_sentence.append(trigram_tagger.tag(sentence))
    # Render each sentence as 'word/TAG' strings, trimming boundary tokens.
    for sentence in tagged_sentence:
        sentence = sentence[2:-1]
        temp = []
        for tup in sentence:
            temp.append(tup[0] + '/' + tup[1])
        tagged.append(temp)
    return tagged
def exercise2():
    """Train a backoff trigram tagger on Brown 'news' and compare it against
    simple taggers trained on the 'lore' category.

    Part a compares the news-trained tagger on the two categories; Part b
    trains unigram/bigram/trigram taggers on the first 199 lore sentences
    and evaluates on the remainder.
    """
    news_tagged_sents = brown.tagged_sents(categories='news')
    # Backoff chain: trigram -> bigram -> unigram -> constant 'NN'.
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(news_tagged_sents, backoff=t0)
    t2 = nltk.BigramTagger(news_tagged_sents, backoff=t1)
    t3 = nltk.TrigramTagger(news_tagged_sents, backoff=t2)
    # NOTE: evaluated on its own training sentences -- score is optimistic.
    news_test_sents = t3.evaluate(news_tagged_sents)
    print(news_test_sents)
    print("Part a")
    lore_tagged_sents = brown.tagged_sents(categories='lore')
    lore_tagger = t3.evaluate(lore_tagged_sents)
    print("Compare DefaultTagger of lore and news:", lore_tagger, news_test_sents)
    print("Part b")
    lore_size = 199  # 200th sentence
    lore_train_sents = lore_tagged_sents[:lore_size]
    lore_test_sents = lore_tagged_sents[lore_size:]
    unigram_tagger = nltk.UnigramTagger(lore_tagged_sents)
    unigram_val = unigram_tagger.evaluate(lore_tagged_sents)
    bigram_tagger = nltk.BigramTagger(lore_train_sents)
    bigram_val = bigram_tagger.evaluate(lore_test_sents)
    # BUG FIX: the original constructed a second nltk.BigramTagger here
    # despite the trigram_tagger name, so the "Trigram" score was bogus.
    trigram_tagger = nltk.TrigramTagger(lore_train_sents)
    trigram_val = trigram_tagger.evaluate(lore_test_sents)
    print(t3.tag(brown.sents(categories='lore')[199]))
    # print(brown.sents(categories='lore')[199])
    print("Unigram", unigram_val, 'vs.Bigram', bigram_val, 'vs.Trigram', trigram_val)
def ex2():
    """Evaluate a default tagger and unigram/bigram/trigram backoff chains on
    two splits each of the Brown 'news' corpus and the NPS chat corpus.

    Relies on the file-level splitting() helper, which returns a sequence of
    at least four elements: (train1, train2, test1, test2) -- TODO confirm.
    """
    tagged_brown = brown.tagged_sents(categories='news')
    results_brown = splitting(tagged_brown)
    train_brown1 = results_brown[0]
    train_brown2 = results_brown[1]
    test_brown1 = results_brown[2]
    test_brown2 = results_brown[3]
    tagged_chat = nps_chat.tagged_posts()
    results_chat = splitting(tagged_chat)
    train_chat1 = results_chat[0]
    train_chat2 = results_chat[1]
    test_chat1 = results_chat[2]
    test_chat2 = results_chat[3]

    default_tagger = nltk.DefaultTagger('NN')
    # BUG FIX: the original also called default_tagger.tag(...) on each test
    # set and discarded the result -- dead work removed.
    print('Test for brown corpus 1 : {}'.format(
        default_tagger.evaluate(test_brown1)))
    print('Test for brown corpus 2 : {}'.format(
        default_tagger.evaluate(test_brown2)))
    print('Test for chat corpus 1 : {}'.format(
        default_tagger.evaluate(test_chat1)))
    print('Test for chat corpus 2 : {}'.format(
        default_tagger.evaluate(test_chat2)))

    # The same train/evaluate/print sequence was previously duplicated four
    # times inline; factored into _ex2_eval_chain.
    _ex2_eval_chain(train_brown1, test_brown1, 'brown 1', default_tagger)
    _ex2_eval_chain(train_brown2, test_brown2, 'brown 2', default_tagger)
    _ex2_eval_chain(train_chat1, test_chat1, 'chat 1', default_tagger)
    _ex2_eval_chain(train_chat2, test_chat2, 'chat 2', default_tagger)


def _ex2_eval_chain(train_sents, test_sents, label, default_tagger):
    """Train unigram -> bigram -> trigram backoff taggers on train_sents and
    print each level's accuracy on test_sents (same output as the original
    inline code)."""
    t1 = nltk.UnigramTagger(train_sents, backoff=default_tagger)
    print(t1.evaluate(test_sents))
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    print(t2.evaluate(test_sents))
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    print('Accuracy test {}: '.format(label), t3.evaluate(test_sents))
def bitagger_train(train_sents, backoff=False):
    """Train a bigram tagger, optionally with a backoff chain.

    Args:
        train_sents: tagged sentences to train on.
        backoff: when true, the bigram tagger backs off to a unigram tagger,
            which itself backs off to DefaultTagger('NN').

    Returns:
        nltk.BigramTagger
    """
    if backoff:  # idiomatic truthiness test instead of '== True'
        t0 = nltk.DefaultTagger('NN')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
    else:
        t2 = nltk.BigramTagger(train_sents)
    return t2
def exercise2(): news_tagged_sents = brown.tagged_sents(categories='news') #brown_sents = brown.sents(categories='news') size = int(len(news_tagged_sents) * 0.9) train_sents = news_tagged_sents[:size] test_sents = news_tagged_sents[size:] t0 = nltk.DefaultTagger('NN') t1 = nltk.UnigramTagger(train_sents, backoff=t0) t2 = nltk.BigramTagger(train_sents, backoff=t1) t3 = nltk.TrigramTagger(train_sents, backoff=t2) print t3.evaluate(test_sents) print("Part a") lore_tagged_sents = brown.tagged_sents(categories='lore') lore_size = int(len(lore_tagged_sents) * 0.9) lore_train_sents = lore_tagged_sents[:lore_size] lore_test_sents = lore_tagged_sents[lore_size:] t0 = nltk.DefaultTagger('NN') t1 = nltk.UnigramTagger(lore_train_sents, backoff=t0) t2 = nltk.BigramTagger(lore_train_sents, backoff=t1) t3 = nltk.TrigramTagger(lore_train_sents, backoff=t2) print "Compare DefaultTagger of lore and news:", t0.evaluate( lore_test_sents), t0.evaluate(test_sents) print "UnigramTagger val of lore", t1.evaluate(lore_test_sents) print "Compare the UnigramTagger from lore and news: ", t1.evaluate( lore_test_sents), t1.evaluate(test_sents) print "BigramTagger val of lore", t2.evaluate(lore_test_sents) print "Compare the BigramgramTagger from lore and news: ", t2.evaluate( lore_test_sents), t2.evaluate(test_sents) print "TrigramTagger val of lore", t3.evaluate(lore_test_sents) print "Compare the TrigramTagger from lore and news: ", t3.evaluate( lore_test_sents), t3.evaluate(test_sents) print("Part b") lore_size = 199 # 200th sentence lore_train_sents = lore_tagged_sents[:lore_size] lore_test_sents = lore_tagged_sents[lore_size:] unigram_tagger = nltk.UnigramTagger(lore_tagged_sents) unigram_val = unigram_tagger.evaluate(lore_tagged_sents) bigram_tagger = nltk.BigramTagger(lore_train_sents) bigram_val = bigram_tagger.evaluate(lore_test_sents) trigram_tagger = nltk.BigramTagger(lore_train_sents) trigram_val = trigram_tagger.evaluate(lore_test_sents) print(brown.sents(categories='lore')[199]) 
print("Unigram", unigram_val, 'vs.Bigram', bigram_val, 'vs.Trigram', trigram_val)
def tagTexto(ws):
    """Train a bigram tagger on 90% of *ws*, print the held-out accuracy of
    bigram taggers with and without a backoff chain, and return the backoff
    version."""
    cut = int(len(ws) * 0.9)
    train_sents, test_sents = ws[:cut], ws[cut:]
    # Backoff chain: bigram -> unigram -> constant 'NN'.
    unigram = nltk.UnigramTagger(train_sents, backoff=nltk.DefaultTagger('NN'))
    with_backoff = nltk.BigramTagger(train_sents, backoff=unigram)
    print('BigramTagger con backoff', with_backoff.evaluate(test_sents))
    plain_bigram = nltk.BigramTagger(train_sents)
    print('BigramTagger sin backoff', plain_bigram.evaluate(test_sents))
    return with_backoff
def main():
    """Main function."""
    # Backoff tagger: regex rules for numbers, Spanish stopwords, URLs,
    # dates, punctuation, inverted question/exclamation marks, plus a
    # catch-all 'N_N' tag for anything else.
    regex = nltk.RegexpTagger([
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
        (r'(' + '|'.join(stopwords.words('spanish')) + ')$', 'STOP'),
        (r'(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,}', 'URL'),
        (r'[0-9]+/[0-9]+/[0-9]+', 'DATE'),
        (r'([^A-Za-z0-9])+', 'PUNCT'),
        (r'\xbf', 'Faa'),
        (r'\xa1', 'Fat'),
        (r'.*', 'N_N'),  # weird tokens (default)
    ])
    # Lower-cased training set from the CoNLL-2002 Spanish corpus.
    train_set = [
        [(word.lower(), tag) for word, tag in text]
        for text in nltk.corpus.conll2002.tagged_sents('esp.train')
    ]
    logging.info('Training Unigram Tagger...')
    unigram_tagger = nltk.UnigramTagger(train_set, backoff=regex)
    logging.info('Training Bigram Tagger...')
    tagger_da = nltk.BigramTagger(train_set, backoff=unigram_tagger)
    logging.info('Pickling Part of Speech Tagger...')
    pickle.dump(tagger_da, open("tmp/pos_tagger.p", "wb"))
def train_and_save(filename, train_set, num):
    """Train a tagger of the requested kind and pickle it to *filename*.

    Args:
        filename: path the pickled tagger is written to.
        train_set: tagged sentences for training.
        num: 1 -> small word-model unigram tagger backing off to a
             bigram -> unigram chain; 2 -> plain bigram; 3 -> plain trigram.
             Any other value is a no-op.
    """
    if num == 1:
        # Backoff chain: word-model unigram -> bigram -> unigram.
        t1 = nltk.UnigramTagger(train_set)
        t2 = nltk.BigramTagger(train_set, backoff=t1)
        model = {'everything': 'NN', 'max': 'NN'}
        t = nltk.UnigramTagger(model=model, backoff=t2)
    elif num == 2:
        t = nltk.BigramTagger(train_set)
    elif num == 3:
        t = nltk.TrigramTagger(train_set)
    else:
        # BUG FIX: the original opened the file *before* this early return,
        # leaking the handle and truncating the file even for unknown *num*.
        return
    # 'with' guarantees the file is closed on any exit path.
    with open(filename, 'wb') as outfile:
        dump(t, outfile, -1)
def dump(config):
    """Build (if needed), cache, and return the best Brown-corpus POS tagger.

    Trains unigram/bigram/trigram backoff taggers on 90% of the
    universal-tagset Brown sentences, keeps the one scoring best on the
    held-out 10%, and pickles it to <tagger_dir>/tagger.pkl.  Subsequent
    calls just unpickle the cached tagger.

    Args:
        config: an instance of TaggerConfiguration

    Returns:
        The unpickled tagger.
    """
    tagger_dir = config.tagger_dir
    tagger_name = os.path.join(tagger_dir, "tagger.pkl")
    os.makedirs(tagger_dir, exist_ok=True)
    if not os.path.isfile(tagger_name):
        brown_tagged_sents = brown.tagged_sents(tagset='universal')
        size = int(len(brown_tagged_sents) * 0.9)
        train_sents = brown_tagged_sents[:size]
        test_sents = brown_tagged_sents[size:]
        # Backoff chain: trigram -> bigram -> unigram -> constant 'X'.
        t0 = nltk.DefaultTagger('X')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        t3 = nltk.TrigramTagger(train_sents, backoff=t2)
        scores = [[t1.evaluate(test_sents), t1],
                  [t2.evaluate(test_sents), t2],
                  [t3.evaluate(test_sents), t3]]
        best_score, best_tagger = max(scores, key=lambda x: x[0])
        print("Finished building POS tagger {0:.2f}%".format(best_score * 100))
        with open(tagger_name, 'wb') as f:
            pkl.dump(best_tagger, f)
    # BUG FIX: the original ended with an unreachable print referencing the
    # undefined names ids_name / distances_name; removed.
    with open(tagger_name, 'rb') as f:
        return pkl.load(f)
def __init__(self, train_sents):
    """Train the classifier from *train_sents*.

    Each training tree is flattened to its CoNLL IOB triples, and a bigram
    tagger is trained to map POS tags to chunk tags.
    """
    train_data = []
    for sent in train_sents:
        conll = nltk.chunk.tree2conlltags(sent)
        train_data.append([(tag, chunk) for _word, tag, chunk in conll])
    self.tagger = nltk.BigramTagger(train_data)
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
    """Tag the dev sentences with an NLTK trigram backoff tagger.

    Trains a TrigramTagger (backing off to a bigram tagger, which backs off
    to DefaultTagger('NOUN')) on parallel word/tag lists, then returns the
    dev sentences rendered as 'word/TAG' strings, one CRLF-terminated line
    per sentence.  (Python 2 code: relies on xrange and list-returning zip.)
    """
    # Hint: use the following line to format data to what NLTK expects for training
    training = [ zip(brown_words[i],brown_tags[i]) for i in xrange(len(brown_words)) ]
    #training = []
    #for i in xrange(len(brown_words)):
    #    temp_training = []
    #    for j in xrange(len(brown_words[i])):
    #        temp_training.append(tuple((unicode(brown_words[i][j]), unicode(brown_tags[i][j]))))
    #    training.append(temp_training)
    #for train in training:
    #    print type(train), type(train[0])
    # Backoff chain: trigram -> bigram -> constant 'NOUN'.
    default_tagger = nltk.DefaultTagger("NOUN")
    bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
    # IMPLEMENT THE REST OF THE FUNCTION HERE
    tagged = []
    for words in brown_dev_words:
        temp_tagged = []
        tagged_sentence = trigram_tagger.tag(words)
        #print tagged_sentence
        for word in tagged_sentence:
            # Build the 'word/TAG' token.  NOTE(review): the outer str() wraps
            # word[0] + '/' + str(word[1]) -- kept exactly as written.
            temp_tagged.append(str(word[0]+'/'+str(word[1])))
        temp_sentence = " ".join(temp_tagged)
        temp_sentence = temp_sentence + "\r\n"
        tagged.append(temp_sentence)
    #for tag in tagged:
    #    print tag
    return tagged
def create_tagger():
    """Train a tagger from the Brown Corpus.

    This should not be called very often; only in the event that the tagger
    pickle wasn't found.  Returns a trigram tagger backing off to bigram ->
    unigram -> regex rules.
    """
    print("Building tagger...")
    train_sents = brown.tagged_sents()
    # These regexes were lifted from the NLTK book tagger chapter.
    patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),               # adjectives
        (r'.*ness$', 'NN'),               # nouns formed from adjectives
        (r'.*ly$', 'RB'),                 # adverbs
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # past tense verbs
        (r'.*', 'NN'),                    # nouns (default)
    ]
    t0 = nltk.RegexpTagger(patterns)
    print("got t0")
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    print("got t1")
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    print("got t2")
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    print("Built tagger!")
    return t3
def posTagging(self, s):
    """POS-tag one tokenized segment (universal tagset).

    input:  ['i','love','you']
    output: [('i', 'PRON'), ('love', 'VERB'), ('you', 'PRON')]

    The backoff chain is: regex ('i' -> PRON) -> trigram -> bigram ->
    unigram (Brown news) -> name/month lookup ('NP') -> constant 'NN'.
    """
    brown_tagged_sents = brown.tagged_sents(tagset='universal', categories='news')
    default_tagger = nltk.DefaultTagger('NN')
    month = [
        u'january', u'february', u'march', u'april', u'may', u'june',
        u'july', u'august', u'september', u'october', u'november', u'december'
    ]
    # Lookup model: personal names and month names are tagged 'NP'.
    np_words = [w.lower() for w in names.words()] + month
    np_tags = dict((word, 'NP') for word in np_words)
    np_tagger = nltk.UnigramTagger(model=np_tags, backoff=default_tagger)
    brown_unigram_tagger = nltk.UnigramTagger(brown_tagged_sents, backoff=np_tagger)
    brown_bigram_tagger = nltk.BigramTagger(brown_tagged_sents, backoff=brown_unigram_tagger)
    brown_trigram_tagger = nltk.TrigramTagger(brown_tagged_sents, backoff=brown_bigram_tagger)
    # Force the standalone token 'i' to PRON before consulting the corpus taggers.
    patterns = [(r'\bi\b', 'PRON')]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=brown_trigram_tagger)
    result = regexp_tagger.tag(s)
    return self.encodeutf8(result)
def combined_tagging():
    """Train a trigram tagger backing off through bigram -> unigram -> 'NN'
    and print its accuracy on the held-out split from split_dataset()."""
    train_sents, test_sents = split_dataset()
    # Build the chain bottom-up; each level backs off to the previous one.
    tagger = nltk.DefaultTagger('NN')
    for tagger_cls in (nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger):
        tagger = tagger_cls(train_sents, backoff=tagger)
    print(tagger.evaluate(test_sents))
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
    """Tag the dev sentences with a trigram backoff tagger.

    Trains on parallel word/tag lists and returns one newline-terminated
    'word/TAG word/TAG ...' string per dev sentence.  (Python 2 code:
    relies on xrange and list-returning zip.)
    """
    # Hint: use the following line to format data to what NLTK expects for training
    training = [ zip(brown_words[i],brown_tags[i]) for i in xrange(len(brown_words)) ]
    # IMPLEMENT THE REST OF THE FUNCTION HERE
    tagged = []
    #John's edit starts here
    # NOTE(review): despite the name, this is a DefaultTagger('NOUN') used
    # as the bottom of the backoff chain: trigram -> bigram -> 'NOUN'.
    unigram_tagger = nltk.DefaultTagger("NOUN")
    bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
    # for sentence in brown_dev_words:
    #     tri_tags = trigram_tagger.tag(sentence)
    tri_tags = trigram_tagger.tag_sents(brown_dev_words)
    for sentence in tri_tags:
        final_sentence_combos = []
        for phrase in sentence:
            # phrase is a (word, tag) pair; render as 'word/TAG'.
            final_sentence_combos.append(phrase[0] + '/' + phrase[1])
        tagged.append(' '.join(final_sentence_combos) + '\n')
    #return provided by professor
    return tagged
def question5():
    """Tag the 200th sentence of the Brown 'lore' category with a trigram
    backoff tagger trained on the 'news' category, and print the result.

    Note that brown.sents(categories='lore')[199] is the 200th sentence.
    """
    # Train on all tagged 'news' sentences.
    train_sents = brown.tagged_sents(categories='news')
    # The sentence we want to tag.
    sentence_200 = brown.sents(categories='lore')[199]
    # Backoff chain built bottom-up: 'NN' -> unigram -> bigram -> trigram.
    backoff = nltk.DefaultTagger('NN')
    backoff = nltk.UnigramTagger(train_sents, backoff=backoff)
    backoff = nltk.BigramTagger(train_sents, backoff=backoff)
    trigram = nltk.TrigramTagger(train_sents, backoff=backoff)
    print("brown_lore: ", trigram.tag(sentence_200))
def __init__(self,pathToPickle=None): """" ######## BigramPOStagger: ######## """ if pathToPickle == None: #brown = nltk.corpus.brown.tagged_sents() #nounByDefault_tagger = nltk.DefaultTagger('NN') #unigram_tagger = nltk.UnigramTagger(brown,backoff=nounByDefault_tagger) #self.bigram_tagger = nltk.BigramTagger(brown,backoff=unigram_tagger) """ NPS CHAT tagged words """ chat_words = [nltk.corpus.nps_chat.tagged_words()] nounByDefault_tagger = nltk.DefaultTagger('NN') unigram_tagger = nltk.UnigramTagger(chat_words,backoff=nounByDefault_tagger) self.bigram_tagger = nltk.BigramTagger(chat_words,backoff=unigram_tagger) pickle.dump(self.bigram_tagger, open(pathToPickle,"wb")) else: self.bigram_tagger = pickle.load(open(pathToPickle))
def main():
    """Train taggers on two corpora files and evaluate an HMM tagger on a
    third.  (Python 2 code -- uses print statements.)"""
    training_data = make_corpus(
        load_text('text1.txt'),
        load_text('text2.txt'),
    )
    test_data = load_text('text3.txt')
    #print words
    #print nltk.FreqDist(tags)
    #print nltk.FreqDist(tags).max()
    # NOTE(review): the unigram/bigram taggers below are trained but never
    # evaluated -- only the HMM is passed to run_tagger.
    default_tagger = nltk.DefaultTagger('noun')
    #baseline_tagger = nltk.UnigramTagger(model=automatic_tags, backoff=default_tagger)
    unigram_tagger = nltk.UnigramTagger(training_data, backoff=default_tagger)
    bigram_tagger = nltk.BigramTagger(training_data, backoff=unigram_tagger)
    #print unigram_tagger._context_to_tag
    hmm = HiddenMarkovModelTrainer().train(training_data)
    def run_tagger(t):
        # Tag the test words, print the tagging, then print the accuracy.
        test = t.tag(test_data.words)
        print test
        print t.evaluate(test_data.tagged_sents)
        #print nltk.ConfusionMatrix(test_data.tagged_words, test)
    run_tagger(hmm)
def nltk_tagger(brown_words, brown_tags, brown_dev_words): tagged = [] # Thank you for this training = [ zip(brown_words[i], brown_tags[i]) for i in xrange(len(brown_words)) ] # train the nltk taggers default_tagger = nltk.DefaultTagger('NOUN') bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger) trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger) for token_list in brown_dev_words: # first, tag the tokens tagged_tuples = trigram_tagger.tag(token_list) # now, format the tagged tuples into strings sent_output = [] for tag_tuple in tagged_tuples: sent_output.append(tag_tuple[0] + "/" + tag_tuple[1]) tagged.append(sent_output) return tagged
def __preparar_tagger(self):
    """Load a cached POS tagger from disk, or train a bigram tagger on the
    mac_morpho corpus (lower-cased words, simplified tags) and cache it as
    a pickle under ./cache/."""
    nome_arquivo_tagger = './cache/postagger.pickle'
    if os.path.exists(nome_arquivo_tagger):
        logging.debug("Carregando o Pos-Tagger já treinado de " + nome_arquivo_tagger)
        with open(nome_arquivo_tagger, 'rb') as arquivo:
            self.tagger = pickle.load(arquivo)
    else:
        logging.debug("Treinando o Pos-Tagger.")
        #tsents = floresta.tagged_sents()
        tsents = mac_morpho.tagged_sents()
        # Lower-case every word, simplify its tag, and drop empty sentences.
        tsents = [[(w.lower(), self.__simplify_tag(t)) for (w, t) in sent] for sent in tsents if sent]
        # Backoff chain: bigram -> unigram -> constant 'n' (noun).
        tagger0 = nltk.DefaultTagger('n')
        tagger1 = nltk.UnigramTagger(tsents, backoff=tagger0)
        tagger2 = nltk.BigramTagger(tsents, backoff=tagger1)
        #tagger3 = nltk.PerceptronTagger(tsents)
        self.tagger = tagger2
        logging.debug("Gravando o Pos-Tagger treinado em " + nome_arquivo_tagger)
        with open(nome_arquivo_tagger, 'wb') as arquivo:
            pickle.dump(self.tagger, arquivo)
def ngramTagger(train_sents, n=2, defaultTag='NN'):
    """Build an n-gram backoff tagger chain of depth *n*.

    n <= 0 returns just DefaultTagger(defaultTag); n == 1 adds a unigram
    tagger; n == 2 (default) adds a bigram tagger; n >= 3 adds a trigram
    tagger.  Each level backs off to the previous one.

    Args:
        train_sents: tagged sentences for training.
        n: chain depth (see above).
        defaultTag: tag emitted by the bottom-level default tagger.

    Returns:
        The top tagger of the chain.
    """
    # Build the chain incrementally -- the original repeated the whole
    # prefix construction in every branch.
    tagger = nltk.DefaultTagger(defaultTag)
    if n <= 0:
        return tagger
    tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
    if n >= 2:
        tagger = nltk.BigramTagger(train_sents, backoff=tagger)
    if n >= 3:
        tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
    return tagger
def tagger(self, train_set, level):
    """Return a tagger of the requested *level*.

    '0' corresponds to a default tagger (tag self.default_tagger),
    '1' to a unigram tagger, '2' to a bigram tagger and '3' to a trigram
    tagger, each backing off to the previous level.

    Arguments:
    ---------
    train_set (list): Tagged sentences used for training
    level (int): Tagger depth, expected 0..3 (values above 3 are clamped)

    Returns:
    --------
    The tagger at *level* (e.g. nltk.BigramTagger for level 2)
    """
    # Build the chain only up to the requested level -- the original's
    # while-loop trained the full chain (including an expensive trigram
    # pass) even when only the default tagger was requested.
    chain = [nltk.DefaultTagger(self.default_tagger)]
    tagger_classes = (nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger)
    for tagger_cls in tagger_classes[:min(level, 3)]:
        chain.append(tagger_cls(train_set, backoff=chain[-1]))
    return chain[min(level, 3)]
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
    """Train a trigram backoff tagger from parallel space-separated word and
    tag strings (dropping two boundary tokens from each end of every
    sentence), then return the dev sentences tagged as newline-terminated
    'word/TAG word/TAG ...' strings."""
    training = []
    for word_line, tag_line in zip(brown_words, brown_tags):
        pairs = list(zip(word_line.split(' '), tag_line.split(' ')))
        # Strip the two leading and two trailing boundary tokens.
        training.append(pairs[2:-2])
    # Backoff chain built bottom-up: 'NN' -> unigram -> bigram -> trigram.
    chain = nltk.DefaultTagger('NN')
    chain = nltk.UnigramTagger(training, backoff=chain)
    chain = nltk.BigramTagger(training, backoff=chain)
    trigram = nltk.TrigramTagger(training, backoff=chain)
    tagged = []
    for sentence in brown_dev_words:
        tokens = [word + '/' + tag for word, tag in trigram.tag(sentence)]
        tagged.append(' '.join(tokens) + '\n')
    return tagged
def _train_tagger(self):
    """Train the final POS tagger on the Penn Treebank sample and store it
    on self.final_tagger.

    Backoff chain: trigram -> bigram -> unigram -> regex rules -> 'NN'.
    """
    training_sents = treebank.tagged_sents()
    # Regex rules, tried in order: punctuation/symbols, verb and noun
    # suffixes, articles, numbers, capitalized proper nouns, then 'NN'.
    patterns = [  # for regexp tagger
        (r'^[\.|\?|!]$', '.'),
        (r'^,$', ','),
        (r'^\'$', '\'\''),
        (r'^\"$', '\"'),
        (r'^\($', '('),
        (r'^\)$', ')'),
        (r'^[=|/]$', 'SYM'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*es$', 'VBZ'),
        (r'.*ould$', 'MD'),
        (r'.*\'s$', 'POS'),
        (r'.*s$', 'NNS'),
        (r'(The|the|A|a|An|an)$', 'AT'),
        (r'.*able$', 'JJ'),
        (r'.*ly$', 'RB'),
        (r'.*s$', 'NNS'),
        (r'^[0-9][0-9]*$', 'CD'),
        (r'^[0-9]([0-9]*[-|.|,|/][0-9]*)*$', 'CD'),
        (r'^([0-9]*\.[0-9]*)*$', 'CD'),
        (r'^[^a-zA-Z]*$', ':'),
        (r'[A-Z].*', 'NNP'),
        (r'.*', 'NN')
    ]
    default_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    unigram_tagger = nltk.UnigramTagger(training_sents, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(training_sents, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(training_sents, backoff=bigram_tagger)
    self.final_tagger = trigram_tagger
def __init__(self): """Initialization method of :class:`TopicExtractor` class. """ # This is our fast Part of Speech tagger ############################################################################# brown_train = brown.tagged_sents(categories='news') regexp_tagger = nltk.RegexpTagger( [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'(-|:|;)$', ':'), (r'\'*$', 'MD'), (r'(The|the|A|a|An|an)$', 'AT'), (r'.*able$', 'JJ'), (r'^[A-Z].*$', 'NNP'), (r'.*ness$', 'NN'), (r'.*ly$', 'RB'), (r'.*s$', 'NNS'), (r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*', 'NN') ]) unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger) self.bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger) ############################################################################# # This is our semi-CFG; Extend it according to your own needs ############################################################################# self.cfg = {} self.cfg["NNP+NNP"] = "NNP" self.cfg["NN+NN"] = "NNI" self.cfg["NNI+NN"] = "NNI" self.cfg["JJ+JJ"] = "JJ" self.cfg["JJ+NN"] = "NNI"
def exercise3():
    # Compare the given TrigramTagger from the previous question with a TrigramTagger where no backoff is provided.
    # Train this tagger on all of the sentences from the Brown corpus with the category news.
    # Then evaluate your tagger using "evaluate" function on all of the sentences from the Brown corpus with the category lore.
    # Report the numbers. Which tagger performs better? Why?
    news_tagged_sents = brown.tagged_sents(categories='news')
    # NOTE(review): size equals len(news_tagged_sents), so test_sents below
    # is EMPTY -- evaluating on it cannot produce a meaningful score.
    size = int(len(news_tagged_sents))
    train_sents = news_tagged_sents[:size]
    test_sents = news_tagged_sents[size:]
    # NOTE(review): t0, t1 and t2 are built but never used, and both prints
    # below evaluate the SAME no-backoff trigram tagger t3, so "with backoff"
    # is mislabeled.
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents)
    t2 = nltk.BigramTagger(train_sents)
    t3 = nltk.TrigramTagger(train_sents)
    news_trigram_val = t3.evaluate(test_sents)
    print("trigram without backoff", news_trigram_val)
    print("Trigram with backoff ", t3.evaluate(test_sents))
    # category lore
    lore_tagged_sents = brown.tagged_sents(categories='lore')
    # NOTE(review): slicing lore with the news-corpus length looks
    # unintended -- confirm whether the whole lore set was meant.
    lore_test_sents = lore_tagged_sents[size:]
    lore_trigram_val = t3.evaluate(lore_test_sents)
    print("Brown corpus category lore value", lore_trigram_val)
    print "Category news tagger peforms better because it evaluates tags of the same category,"
    print "thus yielding more accurate results. It performs better if evaluate tags in the same category"
def __init__(self): #nltk.download() self.type = "text" # Code taken from 'Natural Language Processing with Python' by Steven Bird. Pg. 203 # Categorise training & test data print "Generating training & test data..." self.brown_tagged_sents = brown.tagged_sents(categories='news') # Use 90% to construct a model & 10% to test the model size = int(len(self.brown_tagged_sents) * 0.9) self.train_sents = self.brown_tagged_sents[:size] self.test_sents = self.brown_tagged_sents[size:] # Setup multiple backup taggers print "Creating taggers..." self.default_tagger = nltk.DefaultTagger('NN') self.uni_tagger = nltk.UnigramTagger(self.train_sents, backoff=self.default_tagger) self.bi_tagger = nltk.BigramTagger(self.train_sents, backoff=self.uni_tagger) self.tri_tagger = nltk.TrigramTagger(self.train_sents, backoff=self.bi_tagger) super(TextSystem, self).__init__(type)
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
    """Tag the dev sentences with a trigram backoff tagger trained on
    UTF-8-decoded parallel word/tag lists; return one newline-terminated
    'word/TAG ...' string per sentence.  (Python 2 code: xrange, unicode.)"""
    # Hint: use the following line to format data to what NLTK expects for training
    #training = [ zip(brown_words[i],brown_tags[i]) for i in xrange(len(brown_words)) ]
    training = []
    for i in xrange(len(brown_words)):
        # Decode byte strings so the taggers see unicode tokens.
        words = [unicode(x, 'utf-8') for x in brown_words[i]]
        tags = [unicode(x, 'utf-8') for x in brown_tags[i]]
        training.append(zip(words, tags))
    print(training[0])
    #input("continue...")
    # IMPLEMENT THE REST OF THE FUNCTION HERE
    print("\nIn NLTK tagger code")
    # Backoff chain: trigram -> bigram -> constant 'NOUN'.
    default_tagger = nltk.DefaultTagger('NOUN')
    bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
    tagged = []
    for sentence in brown_dev_words:
        tagged_tuples = trigram_tagger.tag(sentence)
        tagged_sentence = ""
        for word_tag in tagged_tuples:
            # Append 'word/TAG ' for each token.
            tagged_sentence += word_tag[0] + "/" + word_tag[1] + " "
        tagged_sentence += "\n"
        tagged.append(tagged_sentence)
    Pd.printdot(1000)  #monitor progress
    return tagged
def word_tagger(self):
    """Tag self.text in place with a bigram tagger trained on
    self.training_sents, backing off to a unigram tagger and finally to a
    constant 'NN' tag."""
    fallback = nltk.DefaultTagger('NN')
    unigram = nltk.UnigramTagger(self.training_sents, backoff=fallback)
    bigram = nltk.BigramTagger(self.training_sents, backoff=unigram)
    self.text = bigram.tag(self.text)