def train_classifier_tagger(self):
    """Train a classifier-based POS tagger on CoNLL-2000.

    Returns:
        nltk.tag.sequential.ClassifierBasedPOSTagger: the trained tagger.
    """
    from nltk.corpus import conll2000
    from nltk.tag.sequential import ClassifierBasedPOSTagger

    # Hold out the tail of the corpus for evaluation.  The original trained
    # on the FULL corpus, so the (commented-out) evaluation below would have
    # scored the tagger on its own training data.
    test_sents = conll2000.tagged_sents()[9500:]
    train_sents = conll2000.tagged_sents()[:9500]
    print("training class")
    tagger = ClassifierBasedPOSTagger(train=train_sents)
    # print("evaluating")
    # print(tagger.evaluate(test_sents))
    # print("tag")
    return tagger
def kFoldCV(fold):
    """Run *fold*-fold cross-validation of a trigram-HMM POS tagger on CoNLL-2000.

    For each fold this builds add-one-smoothed transition (tag-trigram) and
    emission (tag -> word) models from the training portion, then prints the
    accuracy reported by HMMAccuracy on the held-out portion.

    Args:
        fold: number of folds; must be greater than 1.
    """
    if fold <= 1:
        print("please enter fold >1!")
        return
    # Hoist the corpus accessor: the original called conll2000.tagged_sents()
    # several times per iteration, re-creating the lazy corpus view each time
    # for no benefit.
    sents = conll2000.tagged_sents()
    l = int(len(sents) / fold)
    for i in range(fold):
        refreshPrint(str(fold) + ' fold cross-validation: preparing for ' + str(i + 1) + ' loop\'s tagger model...')
        left = l * i
        right = left + l
        testSents = sents[left:right]
        trainSents = sents[:left] + sents[right:]
        # Tag inventory plus sentence-boundary pseudo-tags.  '<\\s>' preserves
        # the original runtime value: the source wrote '<\s>' and \s is not a
        # recognized escape, so it already meant backslash + 's'.
        trainTags = set(k[1] for k in chain.from_iterable(trainSents)) | {'<s>', '<\\s>'}
        trainWords = set([a for a, b in set(chain.from_iterable(trainSents))])
        allTrainWords = list(chain.from_iterable(trainSents))
        # ---------------- transition probabilities ----------------
        # Add-one smoothing: every (x, y) -> z trigram starts with count 1.
        tModel = defaultdict(lambda: defaultdict(lambda: 0))
        for x in trainTags:
            for y in trainTags:
                for z in trainTags:
                    tModel[(x, y)][z] = 1
        for sentence in trainSents:
            # Pad with two start markers (trigram context) and one end marker.
            sentence = [('', '<s>'), ('', '<s>')] + sentence + [('', '<\\s>')]
            for x, y, z in trigrams([k[1] for k in sentence]):
                tModel[(x, y)][z] += 1
        for xy in tModel:
            totalCount = float(sum(tModel[xy].values()))
            for z in tModel[xy]:
                tModel[xy][z] /= totalCount
        # ---------------- emission probabilities ----------------
        # Same add-one smoothing over the (tag, word) table.
        eModel = defaultdict(lambda: defaultdict(lambda: 0))
        for x in trainTags:
            for z in trainWords:
                eModel[x][z] = 1
        for z, x in allTrainWords:
            eModel[x][z] += 1
        for x in eModel:
            total_count = float(sum(eModel[x].values()))
            for z in eModel[x]:
                eModel[x][z] /= total_count
        # wrongCount is presumably filled in by HMMAccuracy (list used as an
        # out-parameter); it is not read again here.
        wrongCount = [0]
        refreshPrint(' ')
        print(str(i + 1) + ' loop accuracy: ' + str(HMMAccuracy(testSents, trainWords, trainTags, eModel, tModel, wrongCount)*100) + '%')
def getData(corpus="brown", categories=""):
    """Return universal-tagset tagged sentences for the requested corpus.

    Args:
        corpus: one of "brown", "treebank", "nps_chat", "conll2000";
            anything else falls back to Brown.
        categories: optional Brown category filter (ignored for other corpora).
    """
    if corpus == "brown":
        if categories != "":
            return brown.tagged_sents(tagset='universal', categories=categories)
        return brown.tagged_sents(tagset='universal')
    if corpus == "treebank":
        return treebank.tagged_sents(tagset='universal')
    if corpus == "nps_chat":
        # Dialogue dataset: nps_chat exposes posts and tagged words but no
        # tagged_sents(), so rebuild post boundaries by walking the flat
        # tagged-word stream in step with the posts.
        tagged = nps_chat.tagged_words(tagset='universal')
        sent_data = []
        offset = 0
        for post in nps_chat.posts():
            end = offset + len(post)
            sent_data.append(tagged[offset:end])
            offset = end
        return sent_data
    if corpus == "conll2000":
        return conll2000.tagged_sents(tagset='universal')
    # Unknown corpus name: fall back to Brown.
    return brown.tagged_sents(tagset='universal')
def __init__(self):
    # Load a cached POS tagger and NP chunker from pickle files, rebuilding
    # and re-pickling them from CoNLL-2000/Brown when the files are missing,
    # then load the entity index files used for lookups.
    try:
        # NOTE(review): files are opened without 'rb'/'wb' and never closed;
        # text-mode cPickle is fragile on Windows — confirm target platform.
        tagger = cPickle.load(open("nerdb_tagger.pkl"))
    except IOError:
        print "failed to load nerdb_tagger, recreating..."
        train_sents = conll2000.tagged_sents() + brown.tagged_sents()
        # Backoff chain, most specific first at tagging time:
        # trigram -> bigram -> unigram -> default 'NN'.
        tagger = nltk.DefaultTagger("NN")
        tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
        tagger = nltk.BigramTagger(train_sents, backoff=tagger)
        tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
        cPickle.dump(tagger, open("nerdb_tagger.pkl", "w"))
        print "done"
    try:
        chunker = cPickle.load(open("nerdb_chunker.pkl"))
    except IOError:
        print "failed to load nerdb_chunker, recreating..."
        train_sents = conll2000.chunked_sents()
        chunker = ConsecutiveNPChunker(tagger, train_sents)
        cPickle.dump(chunker, open("nerdb_chunker.pkl", "w"))
        print "done"
    self.chunker = chunker
    # Entity indexes: each line appears to be "<key> <rest-of-line>" split once.
    self.people = [line.strip().split(" ", 1) for line in open("actors_index.txt").readlines()]
    self.people += [line.strip().split(" ", 1) for line in open("actresses_index.txt").readlines()]
    self.movies = [line.strip().split(" ", 1) for line in open("title_index.txt").readlines()]
    self.entity_types = {"PERSON": self.people, "MOVIE": self.movies}
    # NOTE(review): eval() on file contents executes arbitrary code — if
    # numbers.txt is a Python literal, ast.literal_eval would be safer.
    self.numbers = eval(open("numbers.txt").read())
def select_sents(x):
    """Return the tagged sentences of the corpus named by *x*.

    Known keys: 'brown', 'brown_universal', 'conll2000', 'conll2000_universal',
    'conll2002', 'alpino', 'dependency_treebank', 'treebank', 'indian'.
    Unknown keys yield an empty list.
    """
    # Dispatch through zero-argument loaders so only the requested corpus
    # reader is touched.  The original built EVERY dict value eagerly and,
    # worse, used .get(x, 'else') — returning the *string* 'else' rather than
    # the empty list for unavailable corpora.
    loaders = {
        'brown_universal': lambda: brown.tagged_sents(tagset='universal'),  # Accuracy: 95.12%
        'brown': lambda: brown.tagged_sents(),  # Accuracy: 93.66%
        'conll2000_universal': lambda: conll2000.tagged_sents(tagset='universal'),  # Accuracy: 95.63%
        'conll2000': lambda: conll2000.tagged_sents(),  # Accuracy: 94.94%
        'conll2002': lambda: conll2002.tagged_sents(),  # Accuracy: 91.53%
        'alpino': lambda: alpino.tagged_sents(),  # Accuracy: 88.79%
        'dependency_treebank': lambda: dependency_treebank.tagged_sents(),  # Accuracy: 90.79%
        'treebank': lambda: treebank.tagged_sents(),  # Accuracy: 91.44%
        'indian': lambda: indian.tagged_sents(),  # Accuracy: 64.41%
        'else': lambda: [],  # in case of an unavailable corpus
    }
    return loaders.get(x, lambda: [])()
def __init__(self):
    # Load a cached POS tagger and NP chunker from pickle files, rebuilding
    # and re-pickling them from CoNLL-2000/Brown when the files are missing,
    # then load the entity index files used for lookups.
    try:
        # NOTE(review): files are opened without 'rb'/'wb' and never closed;
        # text-mode cPickle is fragile on Windows — confirm target platform.
        tagger = cPickle.load(open('nerdb_tagger.pkl'))
    except IOError:
        print 'failed to load nerdb_tagger, recreating...'
        train_sents = conll2000.tagged_sents() + brown.tagged_sents()
        # Backoff chain, most specific first at tagging time:
        # trigram -> bigram -> unigram -> default 'NN'.
        tagger = nltk.DefaultTagger('NN')
        tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
        tagger = nltk.BigramTagger(train_sents, backoff=tagger)
        tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
        cPickle.dump(tagger, open('nerdb_tagger.pkl', 'w'))
        print 'done'
    try:
        chunker = cPickle.load(open('nerdb_chunker.pkl'))
    except IOError:
        print 'failed to load nerdb_chunker, recreating...'
        train_sents = conll2000.chunked_sents()
        chunker = ConsecutiveNPChunker(tagger, train_sents)
        cPickle.dump(chunker, open('nerdb_chunker.pkl', 'w'))
        print 'done'
    self.chunker = chunker
    # Entity indexes: each line appears to be "<key> <rest-of-line>" split once.
    self.people = [line.strip().split(" ", 1) for line in open('actors_index.txt').readlines()]
    self.people += [line.strip().split(" ", 1) for line in open('actresses_index.txt').readlines()]
    self.movies = [line.strip().split(" ", 1) for line in open('title_index.txt').readlines()]
    self.entity_types = {'PERSON' : self.people, 'MOVIE' : self.movies}
def get_noun_phrases_and_named_entities(file_name, start_index, end_index):
    """Collect mentions, NP chunks, and named entities from a CoNLL-2000 slice.

    Args:
        file_name: CoNLL-2000 file id (e.g. 'train.txt').
        start_index, end_index: sentence slice bounds.

    Returns:
        (words, noun_phrases, named_entities) where words is a list of
        (word, sentence_number) pairs (1-based numbering) and the other two
        are CoNLL-style (word, pos, iob) triples.
    """
    plain_sents = conll2000.sents(file_name)[start_index:end_index]
    chunked_sents = conll2000.chunked_sents(file_name, chunk_types=['NP'])[start_index:end_index]
    tagged_sents = conll2000.tagged_sents(file_name)[start_index:end_index]

    # Mentions: every word paired with its (1-based) sentence number.
    words = [(word, sent_no)
             for sent_no, sent in enumerate(plain_sents, start=1)
             for word in sent]

    noun_phrases = []
    for tree in chunked_sents:
        noun_phrases.extend(nltk.chunk.tree2conlltags(tree))

    named_entities = []
    for tagged in tagged_sents:
        named_entities.extend(nltk.chunk.tree2conlltags(nltk.chunk.ne_chunk(tagged)))

    return (words, noun_phrases, named_entities)
def english():
    """Train a Brill trigram tagger on CoNLL-2000 and pickle it to disk."""
    # Make sure the corpus is present before importing its reader.
    from collective.classification.data.downloader import downloadNLTKConll2000Corpus
    downloadNLTKConll2000Corpus()
    from nltk.corpus import conll2000

    brill_tagger = BrillTrigramTagger()
    brill_tagger.train(conll2000.tagged_sents())
    dump(brill_tagger.tagger, "english_tagger.pickle")
def traintest_uni_bi_tri_tagger(self):
    """Train a unigram/bigram/trigram backoff tagger on Treebank and try it
    on CoNLL-2000 plus a sample sentence.

    Training (Penn Treebank) and evaluation (CoNLL-2000) deliberately use
    different corpora, so the reported score is cross-corpus accuracy.
    """
    from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
    from nltk.corpus import conll2000, treebank

    test_sents = conll2000.tagged_sents()[8000:]
    train_sents = treebank.tagged_sents()[3000:]
    # Fixed garbled progress messages ("trainging trigramter", "evaluation").
    print('training trigram tagger with backoff')
    backoff = DefaultTagger('NN')
    tagger = self.backoff_tagger(train_sents,
                                 [UnigramTagger, BigramTagger, TrigramTagger],
                                 backoff=backoff)
    print('evaluating trigram with backoff')
    print(tagger.evaluate(test_sents))
    print('tagging')
    print(tagger.tag(word_tokenize("This is a test. This should be faster than nothing. How can I rent a car in the next twelve hours? ")))
def evaluate(self):
    '''run tests on conll2000 and treebank data'''
    # Score the classifier on 100-sentence samples of Treebank and
    # CoNLL-2000, plus the final 20% of Brown.
    treebank_result = 100 * self.classifier.evaluate(treebank.tagged_sents()[:100])
    conll2000_result = 100 * self.classifier.evaluate(conll2000.tagged_sents()[:100])
    brown_split = int(len(brown.tagged_sents()) * 0.8)
    brown_result = 100 * self.classifier.evaluate(brown.tagged_sents()[brown_split:])
    return (treebank_result, conll2000_result, brown_result)
def main(): # 1. a) bts = brown.tagged_sents(categories=u'news', tagset=u'universal') brown_size = int(len(bts) * 0.9) brown_training = bts[:brown_size] brown_test = bts[brown_size:] tagset = list(mapping._UNIVERSAL_TAGS) simple_tagger = a1.BigramTagger() simple_tagger.train(brown_training) #1. b) test_tagging(simple_tagger) #1. c) print u'Simple bigram tagger' print_accuracy(simple_tagger, brown_test) #1. d) print_confusion_matrix(simple_tagger, brown_test, tagset) #2. a) default_tagger = a2.DefaultTagger(u'NN') unigram_tagger = a2.UnigramTagger(backoff_tagger=default_tagger) unigram_tagger.train(brown_training) bigram_tagger = a2.BigramTagger(backoff_tagger=unigram_tagger) bigram_tagger.train(brown_training) print u'Bigram tagger with backoffs' print_accuracy(bigram_tagger, brown_test) #2. b) other_cat = brown.tagged_sents(categories='romance', tagset='universal') print u'Simple bigram tagger, other genre' print_accuracy(simple_tagger, other_cat) print u'Backoff tagger, other genre' print_accuracy(bigram_tagger, other_cat) conll_sents = conll2000.tagged_sents(tagset=u'universal') print u'Simple bigram tagger, other corpus' print_accuracy(simple_tagger, conll_sents) print u'Backoff tagger, other corpus' print_accuracy(bigram_tagger, conll_sents)
def train(self):
    # Train a Brill tagger: regexp baseline -> affix/uni/bi/trigram backoff
    # chain -> FastBrillTaggerTrainer, then pickle it.  Uses the pre-NLTK-3
    # brill API (SymmetricProximateTokensTemplate etc.).
    start = time.time()
    # Transformation templates over neighbouring tags and words.
    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1))
    ]
    self.train_sents = conll2000.tagged_sents('train.txt')
    # Suffix-based fallback patterns for the RegexpTagger baseline.
    word_patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
        (r'.*ould$', 'MD'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*ness$', 'NN'),
        (r'.*ment$', 'NN'),
        (r'.*ful$', 'JJ'),
        (r'.*ious$', 'JJ'),
        (r'.*ble$', 'JJ'),
        (r'.*ic$', 'JJ'),
        (r'.*ive$', 'JJ'),
        (r'.*ic$', 'JJ'),  # NOTE(review): duplicate of the '.*ic$' entry above — dead rule
        (r'.*est$', 'JJ'),
        (r'^a$', 'PREP'),
    ]
    raubt_tagger = self.backoff_tagger(self.train_sents, [nltk.tag.AffixTagger, nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger], backoff=nltk.tag.RegexpTagger(word_patterns))
    trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates,deterministic=True)
    self.tagger = trainer.train(self.train_sents,max_rules=100, min_score=3)
    self.save2Pickle(self.tagger)
    # Python 2 print statement; reports wall-clock training time in seconds.
    print 'Time: ' + str(time.time()-start)
from nltk.corpus import brown, conll2000, alpino, floresta, gutenberg
from nltk.tag import hmm
from nltk.util import unique_list
from nltk.probability import *
from nltk import ConditionalProbDist
from nltk import ConditionalFreqDist
from collections import Counter
from HMM import *

# Load the Training and Test Sentences
print("Downloading Training Sentences from Corpus")
# First 10000 sentences of each corpus as training material.  Brown is
# mapped to the universal tagset; the others keep their native tagsets.
trainingSentences_brown = brown.tagged_sents(tagset="universal")[:10000]
trainingSentences_conll2000 = conll2000.tagged_sents()[:10000]
trainingSentences_alpino = alpino.tagged_sents()[:10000]
trainingSentences_floresta = floresta.tagged_sents()[:10000]
# NOTE(review): Python-2 print statement mixed with print() calls above;
# as written this file only runs under Python 2.
print "Done!"
print("Downloading Test Sentences from Corpus")
# Sentences 10000-10499 of each corpus are held out for testing.
testSentences_brown = brown.tagged_sents(tagset="universal")[10000:10500]
testSentences_conll2000 = conll2000.tagged_sents()[10000:10500]
testSentences_alpino = alpino.tagged_sents()[10000:10500]
testSentences_floresta = floresta.tagged_sents()[10000:10500]
print "Done!"

# Extracts words and tags from Sentences
def extractWords_and_Tags(sentences):
    # (definition continues beyond this chunk)
    words = {}
    nltk.data.find('corpora/conll2002')  # enclosing try: opens above this chunk
except:  # NOTE(review): bare except — triggers the download on ANY failure
    nltk.download('conll2002')
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except:
    nltk.download('averaged_perceptron_tagger')

from nltk.corpus import wordnet as wn
from nltk.corpus import treebank, conll2000, brown, conll2002
from nltk import DefaultTagger, UnigramTagger, BigramTagger

wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

# The code below trains bigram part of speech tagger from various datasets.
train_sents = treebank.tagged_sents() + brown.tagged_sents() + conll2000.tagged_sents() + conll2002.tagged_sents()
# Lower-case every word so the trained taggers match case-insensitively.
edited_train = []
for sent in train_sents:
    edited_train.append([(word.lower(),tag) for (word,tag) in sent])
# Backoff chain: bigram -> unigram -> default None.
t0 = DefaultTagger(None)
et1 = UnigramTagger(edited_train, backoff = t0)
et2 = BigramTagger(edited_train, backoff = et1)

# The function below converts bigram pos to wordnet pos for lemmatization
def penn_to_wn(tag):
    # Map the first letter of a Penn tag to the WordNet POS constant;
    # returns None for tags with no WordNet counterpart.
    nltk_wn_pos = {'J':wn.ADJ,'V':wn.VERB,'N':wn.NOUN,'R':wn.ADV}
    try:
        return nltk_wn_pos[tag[0]]
    except:
        return None
from nltk import word_tokenize from nltk.tag import PerceptronTagger from nltk.corpus import conll2000 as cn import pickle import time train = cn.tagged_sents("train.txt") test = cn.tagged_sents("test.txt") pt = PerceptronTagger(load=False) sts=int(time.time()) pt.train(list(train),nr_iter=10) fts=int(time.time()) pts=fts-sts print pts f = open('ptagger.pickle', 'wb') pickle.dump(pt, f) f.close()
import nltk

# Fetch the corpus before importing its reader.
nltk.download('conll2000')
from nltk.corpus import conll2000

# Show the first five tagged sentences.
x = conll2000.tagged_sents()
for sent in x[:5]:
    print(sent)
ex: 'To read' output: (read: VB) which is correct output.
So I need to research bit more for this.
'''
# NOTE(review): RegexpTagger is imported twice on the next line.
from nltk.tag import RegexpTagger, untag, UnigramTagger, BigramTagger, TrigramTagger, DefaultTagger, AffixTagger, RegexpTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.corpus import brown, treebank, conll2000
from tag_util import backoff_tagger, train_brill_tagger
import pickle

# train_sents = brown.tagged_sents(categories=['news'])[:40000]
# test_sents = brown.tagged_sents(categories=['news']) [40000:50000]
train_sents = conll2000.tagged_sents()

# some regex pattern that will be used for the RegexpTagger
# NOTE(review): (r'.*ic$', 'JJ') appears twice below — the second copy is dead.
regex_pattern = [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
                 (r'.*ould$', 'MD'),
                 (r'.*ing$', 'VBG'),
                 (r'.*ed$', 'VBD'),
                 (r'.*ness$', 'NN'),
                 (r'.*ment$', 'NN'),
                 (r'.*ful$', 'JJ'),
                 (r'.*ious$', 'JJ'),
                 (r'.*ble$', 'JJ'),
                 (r'.*ic$', 'JJ'),
                 (r'.*ive$', 'JJ'),
                 (r'.*ic$', 'JJ'),
                 (r'.*est$', 'JJ'),
                 (r'mad', 'JJ'),
                 (r'^a$', 'PREP')]

# Affix -> unigram -> bigram -> trigram backoff chain over the regexp baseline.
initial_tagger = backoff_tagger(
    train_sents, [AffixTagger, UnigramTagger, BigramTagger, TrigramTagger],
    backoff=RegexpTagger(regex_pattern))

# Training the Brill Tagger
brill_tagger = train_brill_tagger(initial_tagger, train_sents)
#print brill_tagger.evaluate(test_sents)
import nltk
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk.corpus import nps_chat
from nltk.corpus import conll2000
from nltk.corpus import ConllCorpusReader

# Materialize universal-tagset corpora as plain lists.
brown_fiction = list(
    brown.tagged_sents(categories='fiction', tagset='universal'))
brown_reviews = list(
    brown.tagged_sents(categories='reviews', tagset='universal'))
conll = list(conll2000.tagged_sents(tagset='universal'))
tree = list(treebank.tagged_sents(tagset='universal'))

# Custom Twitter corpus read from CoNLL-format columns.
columntypes = ['words', 'pos']
twitter_corpus = ConllCorpusReader("resources/", "twitter.conll",
                                   columntypes, tagset='en-tweet')
twitter = list(twitter_corpus.tagged_sents(tagset='universal'))

# Chat posts: drop tokens whose word field is empty.
nps_raw = nps_chat.tagged_posts(tagset='universal')
nps = [[tok for tok in post if tok[0]] for post in nps_raw]
from nltk.corpus import conll2000, brown

# Dump the first 1000 CoNLL-2000 tokens to train.txt, one "word<TAB>tag"
# line per token.
total = 1000
with open('train.txt', 'w') as f:  # context manager: original leaked the handle
    for sent in conll2000.tagged_sents():
        for word, tag in sent:
            f.write(word + '\t' + tag + '\n')
            total -= 1
            if total == 0:
                break
        # BUG FIX: the original only broke out of the inner (word) loop, so
        # once total hit 0 the outer loop kept writing the entire corpus.
        if total == 0:
            break
print("generated train.txt")

# Same for test.txt: 100 tokens drawn from sentences 1001-1104.
total = 100
with open('test.txt', 'w') as f:
    for sent in conll2000.tagged_sents()[1001:1105]:
        for word, tag in sent:
            f.write(word + '\t' + tag + '\n')
            total -= 1
            if total == 0:
                break
        if total == 0:
            break
print("generated test.txt")
# Select corpus and tagset from the command line:
# argv[1]: 1=brown, 2=treebank, 3=masc_tagged, 4=conll2000;
# argv[2]: 1=native tagset, 2=universal tagset.
index = int(sys.argv[1])
tagset = int(sys.argv[2])
if index == 1 and tagset == 1:
    sents = brown.tagged_sents()
elif index == 1 and tagset == 2:
    sents = brown.tagged_sents(tagset='universal')
elif index == 2 and tagset == 1:
    sents = treebank.tagged_sents()
elif index == 2 and tagset == 2:
    sents = treebank.tagged_sents(tagset='universal')
elif index == 3 and tagset == 1:
    sents = masc_tagged.tagged_sents()
elif index == 3 and tagset == 2:
    sents = masc_tagged.tagged_sents(tagset='universal')
elif index == 4 and tagset == 1:
    sents = conll2000.tagged_sents()
elif index == 4 and tagset == 2:
    sents = conll2000.tagged_sents(tagset='universal')
else:
    # Bad arguments: print usage and quit (Python-2 print statements).
    print "Usage: python HMM.py <corpus_index> <tagset_index>"
    print "Corpus: Tagset: "
    print "1. brown 1. Default"
    print "2. treebank 2. Universal"
    print "3. masc_tagged"
    print "4. conll2000"
    exit(0)

# Process training set
def process_training_set():
    # Define size of training set
    # (definition continues beyond this chunk)
# data.write("\n") # else: # data2.write("\n") # res.write("\n") # amount -= 1 # amount2 -= 1 # if amount2 == 0: # break #!/usr/bin/env python3 tagged_sentences = [] from nltk.corpus import conll2000 as corpus tagged_sentences += corpus.tagged_sents(tagset='universal') import nltk untagged_sentences += corpus.sents() data = open("testdata.txt", "wt") res = open("answer.txt", "wt") control = open("control.txt", "wt") tru_amount = len(tagged_sentences) amount = tru_amount print("[Data] Extracted {} out of {} ({:.2f}%)".format(amount, tru_amount, amount/tru_amount*100)) for sentences, untagged in zip(tagged_sentences, untagged_sentences): # if "" in [a[0] for a in sentences]:
import nltk
import itertools
from nltk import word_tokenize, pos_tag
from nltk.corpus import brown, treebank, conll2000

## Initialize Input Text and Corpus
text = "My friend and I often enjoy working in the coffee house"
#text = "I usually hang out with cute friends and watch national football league every Saturday night"
corpus1 = brown.tagged_sents(tagset='universal')
corpus2 = conll2000.tagged_sents(tagset='universal')
corpus3 = treebank.tagged_sents(tagset='universal')
corpus4 = brown.tagged_sents(categories=['news'], tagset='universal')
corpus5 = brown.tagged_sents(categories=['reviews'], tagset='universal')
corpus6 = brown.tagged_sents(categories=['romance'], tagset='universal')
corpus = list(itertools.chain(corpus1))

## Calculate state transition prob and word prob
# Flatten the corpus into (tag, word) pairs bracketed by per-sentence
# begin/end markers.
word_list = []
for sentence in corpus:
    word_list.append(("_BEGIN_", "_begin_"))
    for word, tag in sentence:
        word_list.append((tag, word))
    word_list.append(("_END_", "_end_"))

# Adjacent tag pairs for the transition model.
tags = [tag for (tag, word) in word_list]
tag_list = []
for i in range(len(tags) - 1):
    tag_list.append([tags[i], tags[i + 1]])

# words prob
word_freq = nltk.ConditionalFreqDist(word_list)
B = nltk.ConditionalProbDist(word_freq, nltk.MLEProbDist)
# state transition prob
from nltk.corpus import conll2000

test_sents = conll2000.chunked_sents('test.txt', chunk_types='NP')
train_sents = conll2000.chunked_sents('train.txt', chunk_types='NP')

# Format of the training data: (pos-tag, chunk-tag) pairs per sentence.
train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
              for sent in train_sents]
print(train_data[0])

# Evaluate the unigram chunker's performance.
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))

# Test the parse() function directly.
tmp_sents = conll2000.tagged_sents('test.txt')
print(tmp_sents[0])
print(unigram_chunker.parse(tmp_sents[0]))

# The unigram chunker's chunk-tag output for every POS tag.
postags = sorted(
    set(pos for sent in train_sents for (word, pos) in sent.leaves()))
print(unigram_chunker.tagger.tag(postags))

# Try building a bigram chunker ourselves.
class BigramChunker(nltk.ChunkParserI):
    # (class body continues beyond this chunk — parse() not visible here)
    def __init__(self, train_sents):
        # Train a bigram tagger over (pos-tag, chunk-tag) sequences.
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)
# Parallel corpus lists: index i refers to the same corpus in all four lists
# (0=brown, 1=nps_chat, 2=conll2000, 3=treebank).
corp_words_tagged = [
    brown.tagged_words(tagset=CONST_tagset),
    nps_chat.tagged_words(tagset=CONST_tagset),
    conll2000.tagged_words(tagset=CONST_tagset),
    treebank.tagged_words(tagset=CONST_tagset)
]
corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words()
]
# nps_chat exposes posts rather than sentences, hence tagged_posts()/posts().
corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset)
]
corp_sents_untagged = [
    brown.sents(),
    nps_chat.posts(),
    conll2000.sents(),
    treebank.sents()
]

# language tool spell checker
lt_check = language_check.LanguageTool('en-US')
# pyenchant spell checker
# pe_check = enchant.Dict('en_US')
#brown_romance_cutoff = len(brown_romance) * 2 / 3 #brown_fiction = brown.tagged_sents(categories=['fiction'],simplify_tags=True) #brown_fiction_cutoff = len(brown_fiction) * 2 / 3 #brown_belles_lettres = brown.tagged_sents(categories=['belles_lettres'],simplify_tags=True) #brown_belles_lettres_cutoff = len(brown_belles_lettres) * 2 / 3 #brown_train = list(itertools.chain(brown_reviews[:brown_reviews_cutoff], # brown_lore[:brown_lore_cutoff], brown_romance[:brown_romance_cutoff],brown_fiction[:brown_fiction_cutoff], # brown_belles_lettres[brown_belles_lettres_cutoff:])) #brown_test = list(itertools.chain(brown_reviews[brown_reviews_cutoff:], # brown_lore[brown_lore_cutoff:], brown_romance[brown_romance_cutoff:],brown_fiction[:brown_fiction_cutoff], # brown_belles_lettres[brown_belles_lettres_cutoff:])) conll_train = conll2000.tagged_sents('train.txt') conll_test = conll2000.tagged_sents('test.txt') treebank_cutoff = len(treebank.tagged_sents()) * 2 / 3 treebank_train = treebank.tagged_sents()[:treebank_cutoff] treebank_test = treebank.tagged_sents()[treebank_cutoff:] train_sents = conll_train + treebank_train test_sents = conll_test + treebank_test #train_sents = brown_train #test_sents = treebank_test #print test_sents raubt_tagger = backoff_tagger(train_sents, [nltk.tag.AffixTagger, nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger], backoff=nltk.tag.DefaultTagger('NN'))