def load_data(self, percentage):
    print("Started Loading the Data")
    # Get the complete data
    data_set = treebank.fileids()
    # Partition the data into train and test data sets
    training_data_fileIds = [file for file in data_set if "wsj_00" in str(file)]
    testing_data_fileIds = [file for file in data_set if "wsj_01" in str(file)]
    # What percentage of the files should be used for training?
    index = int(percentage * len(training_data_fileIds))
    training_data_fileIds = training_data_fileIds[:index]
    tagged_training_data = treebank.tagged_sents(fileids=training_data_fileIds)
    tagged_testing_data = treebank.tagged_sents(fileids=testing_data_fileIds)
    tagged_training_words = treebank.tagged_words(fileids=training_data_fileIds)
    tagged_testing_words = treebank.tagged_words(fileids=testing_data_fileIds)
    # print(len(tagged_training_data1), len(tagged_testing_data1))
    # Untag the data for other uses
    untagged_training_data = [untag(item) for item in tagged_training_data]
    untagged_testing_data = [untag(item) for item in tagged_testing_data]
    print("Data Loaded Successfully. Stats are")
    print("Training Data Sentences: ", len(tagged_training_data))
    print("Testing Data Sentences: ", len(tagged_testing_data))
    return (tagged_training_data, tagged_testing_data, tagged_training_words,
            tagged_testing_words, untagged_training_data, untagged_testing_data)
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains a tagger on them, tests the tagger's accuracy and tags an
    unseen sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}.

    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
        number, as training might take a while.
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    else:
        print "Please load either the 'brown' or the 'treebank' corpus."
        return  # added: without this, the code below fails on an undefined tagged_sents

    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]

    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)

    print "tagger accuracy (test %i sentences, after training %i):" % \
        (size, (num_sents - size)), maxent_tagger.evaluate(test_sents)
    print "\n\n"
    print "classify unseen sentence: ", maxent_tagger.tag(["This", "is", "so",
                                                           "slow", "!"])
    print "\n\n"
    print "show the 10 most informative features:"
    print maxent_tagger.classifier.show_most_informative_features(10)
def demo3():
    from nltk.corpus import treebank, brown

    d = list(treebank.tagged_sents())
    e = list(brown.tagged_sents())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):
        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]

        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.evaluate(dtest)
        tp_un = t.unknown / (t.known + t.unknown)
        tp_kn = t.known / (t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(etest)
        sp_un = s.unknown / (s.known + s.unknown)
        sp_kn = s.known / (s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += tacc / tp_kn
        sknacc += sacc / sp_kn  # fixed: originally divided by tp_kn (copy-paste slip)
        tallacc += tacc
        sallacc += sacc

        # print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc

    # t is trained/tested on the treebank slice (d) and s on the brown slice (e);
    # the original print labels were swapped.
    print("treebank: acc over words known:", 10 * tknacc)
    print("        : overall accuracy:", 10 * tallacc)
    print("        : words known:", 10 * tknown)
    print("brown: acc over words known:", 10 * sknacc)
    print("     : overall accuracy:", 10 * sallacc)
    print("     : words known:", 10 * sknown)
def get_accuracy(self, sentences=[]):
    if sentences == []:
        test_sents = treebank.tagged_sents()[6000:]
    else:
        test_sents = sentences
    print self._tagger.evaluate(test_sents)
def tag_matching(sequences):
    treebank_sentences = treebank.tagged_sents()
    #treebank_sentences = brown.tagged_sents()

    # Return best count/sequence
    best = (0, None)
    count = 0
    errors = 0
    resultset = []

    for seq in sequences:
        for sent in treebank_sentences:
            for i, word in enumerate(sent):
                if sent[i][1] == seq[0]:
                    try:
                        if sent[i+1][1] == seq[1]:
                            count += 1
                            #if sent[i+2][1] == seq[2]:
                            #    count += 1
                    except IndexError:
                        errors += 1
        if count > best[0]:
            best = (count, seq)
        resultset.append((seq, count, errors))
        count, errors = 0, 0  # fixed typo ("erros"), which left the error count unreset
    return resultset
def demo2():
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = TnT(N=1000, C=False)
    s = TnT(N=1000, C=True)
    t.train(d[(11) * 100:])
    s.train(d[(11) * 100:])

    for i in range(10):
        tacc = t.evaluate(d[i * 100:((i + 1) * 100)])
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print('Capitalization off:')
        print('Accuracy:', tacc)
        print('Percentage known:', tp_kn)
        print('Percentage unknown:', tp_un)
        print('Accuracy over known words:', (tacc / tp_kn))

        sacc = s.evaluate(d[i * 100:((i + 1) * 100)])
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print('Capitalization on:')
        print('Accuracy:', sacc)
        print('Percentage known:', sp_kn)
        print('Percentage unknown:', sp_un)
        print('Accuracy over known words:', (sacc / sp_kn))
def main():
    ### Globals ###
    regexp_tagger = nltk.RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
         ])

    training_data = treebank.tagged_sents()

    unigram_tagger = nltk.UnigramTagger(training_data, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(training_data, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(training_data, backoff=bigram_tagger)

    # Pickles are binary data, so open the output files in "wb" mode
    # (the original used "w", which breaks on Python 3).
    unigram_pickler = pickle.Pickler(open("unigram_tagger.bin", "wb"))
    bigram_pickler = pickle.Pickler(open("bigram_tagger.bin", "wb"))
    trigram_pickler = pickle.Pickler(open("trigram_tagger.bin", "wb"))

    unigram_pickler.dump(unigram_tagger)
    bigram_pickler.dump(bigram_tagger)
    trigram_pickler.dump(trigram_tagger)
def traintest_bigram_trigram_tagger(self):
    from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
    from nltk.corpus import treebank

    test_sents = treebank.tagged_sents()[3000:]
    train_sents = treebank.tagged_sents()[:3000]

    print 'training bigram tagger'
    bitagger = BigramTagger(train_sents)
    print 'evaluating bigram tagger'
    print bitagger.evaluate(test_sents)

    print 'training trigram tagger'
    tritagger = TrigramTagger(train_sents)
    print 'evaluating trigram tagger'
    print tritagger.evaluate(test_sents)
    print 'tagging'
def benchmark_aptagger():
    '''
    Benchmark the aptagger vs the Penn Treebank sample in nltk
    '''
    from nltk.corpus import treebank

    # we want to remove "-NONE-" tags since these appear to be garbage
    text = []
    tags = []
    k = 0
    for sentence in treebank.tagged_sents():
        text.append([ele[0] for ele in sentence if ele[1] != '-NONE-'])
        tags.extend([ele[1] for ele in sentence if ele[1] != '-NONE-'])
        k += 1

    t1 = time.time()
    predicted = tagger.tag_sents(text)
    t2 = time.time()

    ncorrect = sum(bool(t == p[1])
                   for t, p in izip(tags, chain.from_iterable(predicted)))
    print("For Penn Treebank sample in NLTK:")
    print("Took %s seconds to POS tag %s tokens (%s tokens/sec)" % (
        t2 - t1, len(tags), int(len(tags) / (t2 - t1))))
    print("Accuracy: %s" % (float(ncorrect) / len(tags)))
def create_input_dataset():
    print 'Loading input'
    input_data = []
    tags = []
    sents = wsj.sents()
    json_file = open('data.json', 'w')
    counter = 0
    for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            temp = []
            len_sentence = len(sentence)
            if j > 0:
                temp.append(sents[i][j-1])
            else:
                temp.append('*')
            if j > 1:
                temp.append(sents[i][j-2])
            else:
                temp.append('*')
            temp.append(sents[i][j])
            if j < len_sentence - 1:
                temp.append(sents[i][j+1])
            else:
                temp.append('*')
            if j < len_sentence - 2:
                temp.append(sents[i][j+2])
            else:
                temp.append('*')

            datapoint['wn'] = temp
            datapoint['index'] = j
            datapoint['i'] = counter
            counter += 1

            if prev is None:
                datapoint['t_minus_one'] = '*'
            else:
                datapoint['t_minus_one'] = prev[1]
            if prev_prev is None:
                datapoint['t_minus_two'] = '*'
            else:
                datapoint['t_minus_two'] = prev_prev[1]

            prev_prev = prev
            prev = word
            # print datapoint,word[1]
            datapoint['tag'] = word[1]
            json_file.write(json.dumps(datapoint))
            json_file.write('\n')
            input_data.append(datapoint)
            tags.append(word[1])
    print 'Done'
    json_file.close()
    return input_data, tags
def get_pos_tagger():
    train_sents = treebank.tagged_sents()
    tagger = nltk.TrigramTagger(train_sents, backoff=
        nltk.BigramTagger(train_sents, backoff=
            nltk.UnigramTagger(train_sents, backoff=
                nltk.DefaultTagger("NN"))))
    return tagger
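# --- Usage sketch (added; not part of the original snippet) ---------------
# Assumes the module-level imports the function above relies on (nltk and
# treebank) are present. Shows how the trigram/bigram/unigram backoff chain
# tags a pre-tokenized sentence; the example sentence is illustrative only.
if __name__ == "__main__":
    _tagger = get_pos_tagger()
    print(_tagger.tag(["The", "board", "meets", "today", "."]))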
def _demo_prepare_data(tagged_data, train, num_sents, randomize,
                       separate_baseline_data):
    # train is the proportion of data used in training; the rest is reserved
    # for testing.
    if tagged_data is None:
        print("Loading tagged data from treebank... ")
        tagged_data = treebank.tagged_sents()
    if num_sents is None or len(tagged_data) <= num_sents:
        num_sents = len(tagged_data)
    if randomize:
        random.seed(len(tagged_data))
        random.shuffle(tagged_data)
    cutoff = int(num_sents * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:num_sents]
    testing_data = [[t[0] for t in sent] for sent in gold_data]
    if not separate_baseline_data:
        baseline_data = training_data
    else:
        bl_cutoff = len(training_data) // 3
        (baseline_data, training_data) = (training_data[:bl_cutoff],
                                          training_data[bl_cutoff:])
    (trainseqs, traintokens) = corpus_size(training_data)
    (testseqs, testtokens) = corpus_size(testing_data)
    (bltrainseqs, bltraintokens) = corpus_size(baseline_data)
    print("Read testing data ({0:d} sents/{1:d} wds)".format(testseqs, testtokens))
    print("Read training data ({0:d} sents/{1:d} wds)".format(trainseqs, traintokens))
    print("Read baseline data ({0:d} sents/{1:d} wds) {2:s}".format(
        bltrainseqs, bltraintokens,
        "" if separate_baseline_data else "[reused the training set]"))
    return (training_data, baseline_data, gold_data, testing_data)
def make_sentences():
    dictionary = [k.strip() for k in open("./embeddings/words.lst")]
    ind_lookup = {word: (ind+1) for ind, word in enumerate(dictionary)}
    taglst = [k.strip() for k in open("data/tags.lst")]
    tag_lookup = {word: (ind+1) for ind, word in enumerate(taglst)}
    bracket_rep = {
        "-LRB-": "(",
        "-RRB-": ")",
        "-LSB-": "[",   # fixed: the original repeated "-RSB-", so "[" was silently dropped
        "-RSB-": "]",
        "-LCB-": "{",
        "-RCB-": "}"}
    sentences = list(treebank.tagged_sents())
    for i, sent in enumerate(sentences):
        sent = [(item.lower(), tag) for (item, tag) in sent if tag != '-NONE-']
        sent = [(bracket_rep.get(item, item), tag) for (item, tag) in sent]
        sent = [(u'0', tag) if item[0].isdigit() else (item, tag)
                for (item, tag) in sent]
        sent = [(u"UNKNOWN", tag) if item not in ind_lookup else (item, tag)
                for (item, tag) in sent]
        # 1 indexed!!!
        sent = [(ind_lookup[item], tag_lookup[tag]) for (item, tag) in sent]
        sentences[i] = sent
    sentences = [i for i in sentences if len(i) > 4]
    print(sum(map(len, sentences)) / float(len(sentences)))
    return sentences
def split_sents(self, train=0.95, total=3500,
                document_class=TaggedSentence):
    sents = tagged_corpus.tagged_sents()[:total]
    total = len(sents) if total is None else total
    i = int(round(train * total))
    j = i + int(round(total - train * total))
    return (map(document_class, sents[0:i]),
            map(document_class, sents[i:j]))
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains a tagger on them, tests the tagger's accuracy and tags an
    unseen sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown}, C{treebank},
        C{floresta} or C{cintil}.

    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
        number, as training might take a while.
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    elif corpus.lower() == "floresta":
        from nltk.corpus import floresta
        tagged_sents = floresta.tagged_sents()[:num_sents]
    elif corpus.lower() == "cintil":
        print "Loading CINTIL"
        #column_types = ['ignore','words','ignore','ignore','pos','ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/cintil/','cintil-fixed.conll',column_types)
        column_types = ['words', 'pos', 'ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed.conll',column_types)
        cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger',
                                   'cintil-fixed-reduced.conll', column_types)
        tagged_sents = cintil.tagged_sents()[:num_sents]
    else:
        print "Please load either the 'brown' or the 'treebank' corpus."
        return  # added: without this, the code below fails on an undefined tagged_sents

    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]

    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    maxent_tagger.evaluate(test_sents)

    """
    print "tagger accuracy (test %i sentences, after training %i):" % \
        (size, (num_sents - size)), maxent_tagger.evaluate(test_sents)
    print "\n\n"
    print "classify unseen sentence: ", maxent_tagger.tag(["Isto", "é", "bastante", "rápido", "!"])
    print "\n\n"
    print "show the 40 most informative features:"
    print maxent_tagger.classifier.show_most_informative_features(40)
    """

    fModel = open('test.pkl', "wb")
    pickle.dump(maxent_tagger, fModel, 1)
    fModel.close()
def get_tagger():
    try:
        # pickled taggers are binary data, so read/write in "rb"/"wb" mode
        # (the original used text mode, which breaks on Python 3)
        with open(tagger_fn, "rb") as tagger_file:
            tagger = pickle.load(tagger_file)
    except:
        tagger = ClassifierBasedPOSTagger(train=treebank.tagged_sents())
        with open(tagger_fn, "wb") as tagger_file:
            pickle.dump(tagger, tagger_file)
    return tagger
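# --- Usage sketch (added; not part of the original snippet) ---------------
# Assumes `tagger_fn`, `pickle`, `treebank`, and ClassifierBasedPOSTagger are
# already defined/imported at module level, as the function above implies.
if __name__ == "__main__":
    _t = get_tagger()  # loads the cached pickle, or trains once and caches it
    print(_t.tag(["This", "is", "a", "test", "."]))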
def run(self):
    app = App.get_running_app()
    print 'start training TnT pos tagger'
    train_sents = treebank.tagged_sents()[:2000]
    unk = DefaultTagger('NN')
    app.root.tnt_tagger = tnt.TnT(unk=unk, Trained=True)
    app.root.tnt_tagger.train(train_sents)
    print 'end training TnT pos tagger'
def tag_words(self, words, sents):
    train_sents = treebank.tagged_sents()
    tagger = UnigramTagger(train_sents)
    test_sents = tagger.tag(sents[0])
    # test_sents = treebank.tagged_sents()[3000:]
    # print treebank.tagged_sents()[1:]
    # print "accuracy: " + str(self._tagger.evaluate(test_sents))
    # print self._tagger.tag(words)
    # print test_sents
    print tagger.evaluate(test_sents)
def create_dataset():
    #print 'Loading dataset'
    dataset = []
    tags = []
    sents = wsj.sents()
    for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            temp = []
            len_sentence = len(sentence)
            if j > 0:
                temp.append(sents[i][j-1])
            else:
                temp.append('*')
            if j > 1:
                temp.append(sents[i][j-2])
            else:
                temp.append('*')
            temp.append(sents[i][j])
            if j < len_sentence - 1:
                temp.append(sents[i][j+1])
            else:
                temp.append('*')
            if j < len_sentence - 2:
                temp.append(sents[i][j+2])
            else:
                temp.append('*')
            #what is WN ?
            datapoint['wn'] = temp
            datapoint['index'] = j
            if prev is None:
                datapoint['t_minus_one'] = '*'
            else:
                datapoint['t_minus_one'] = prev[1]
            if prev_prev is None:
                datapoint['t_minus_two'] = '*'
            else:
                datapoint['t_minus_two'] = prev_prev[1]
            prev_prev = prev
            prev = word
            # print datapoint,word[1]
            dataset.append(datapoint)
            tags.append(word[1])
    #print 'Done'
    return dataset, tags
def train_parser(self):
    default_tagger = DefaultTagger("NN")
    train_sents = treebank.tagged_sents()[:3000]
    initial_tagger = self.backoff_tagger(
        train_sents,
        [UnigramTagger, BigramTagger, TrigramTagger],
        backoff=default_tagger
    )
    initial_tagger.evaluate(train_sents)
    brill_tagger = self.train_brill_tagger(initial_tagger, train_sents)
    pickle.dump(brill_tagger, open(self.pickle_path, "wb"))
    return brill_tagger
def make_backoff_tagger():
    """ Returns a backoff tagger that uses a UnigramTagger, BigramTagger,
    TrigramTagger, and a DefaultTagger that returns NN

    :returns: A backoff POS tagger.
    """
    return backoff_tagger(treebank.tagged_sents(),
                          [UnigramTagger, BigramTagger, TrigramTagger],
                          backoff=DefaultTagger('NN'))
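# --- Usage sketch (added; not part of the original snippet) ---------------
# Assumes the imports the function above relies on (treebank, backoff_tagger,
# UnigramTagger, BigramTagger, TrigramTagger, DefaultTagger), and that the
# NLTK "punkt" tokenizer data is available for word_tokenize.
if __name__ == "__main__":
    from nltk import word_tokenize
    _bt = make_backoff_tagger()
    print(_bt.tag(word_tokenize("The quick brown fox jumps over the lazy dog.")))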
def train_pos_tagger():
    """
    Trains a POS tagger with sentences from Penn Treebank
    and returns it.
    """
    # simplify_tags=True is the NLTK 2.x API; NLTK 3 replaced it with
    # tagset='universal'.
    train_sents = treebank.tagged_sents(simplify_tags=True)
    tagger = nltk.TrigramTagger(train_sents, backoff=
        nltk.BigramTagger(train_sents, backoff=
            nltk.UnigramTagger(train_sents, backoff=
                nltk.DefaultTagger("NN"))))
    return tagger
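# --- Usage sketch (added; not part of the original snippet) ---------------
# Assumes the same module-level imports as train_pos_tagger (nltk, treebank)
# and an NLTK 2.x installation, since simplify_tags=True is used above; the
# resulting tags come from the simplified tagset rather than the full PTB set.
if __name__ == "__main__":
    _pos = train_pos_tagger()
    print(_pos.tag(["Stocks", "rose", "sharply", "yesterday", "."]))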
def create_dataset():
    print "Loading dataset"
    dataset = []
    tags = []
    sents = wsj.sents()
    for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            temp = []
            len_sentence = len(sentence)
            if j > 0:
                temp.append(sents[i][j - 1])
            else:
                temp.append("*")
            if j > 1:
                temp.append(sents[i][j - 2])
            else:
                temp.append("*")
            temp.append(sents[i][j])
            if j < len_sentence - 1:
                temp.append(sents[i][j + 1])
            else:
                temp.append("*")
            if j < len_sentence - 2:
                temp.append(sents[i][j + 2])
            else:
                temp.append("*")
            datapoint["wn"] = temp
            datapoint["index"] = j
            if prev is None:
                datapoint["t_minus_one"] = "*"
            else:
                datapoint["t_minus_one"] = prev[1]
            if prev_prev is None:
                datapoint["t_minus_two"] = "*"
            else:
                datapoint["t_minus_two"] = prev_prev[1]
            prev_prev = prev
            prev = word
            # print datapoint,word[1]
            dataset.append(datapoint)
            tags.append(word[1])
    print "Done"
    return dataset, tags
def traintest_uni_bi_tri_tagger(self):
    from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
    from nltk.corpus import conll2000, treebank

    test_sents = conll2000.tagged_sents()[8000:]
    train_sents = treebank.tagged_sents()[3000:]

    print 'training trigram tagger with backoff'
    backoff = DefaultTagger('NN')
    tagger = self.backoff_tagger(train_sents,
                                 [UnigramTagger, BigramTagger, TrigramTagger],
                                 backoff=backoff)
    print 'evaluating trigram tagger with backoff'
    print tagger.evaluate(test_sents)
    print 'tagging'
    print tagger.tag(word_tokenize("This is a test. This should be faster than nothing. "
                                   "How can I rent a car in the next twelve hours? "))
def process(data):
    processed_tweets = []
    t0 = AffixTagger(train=treebank.tagged_sents())
    t1 = UnigramTagger(train=treebank.tagged_sents(), backoff=t0)
    t2 = BigramTagger(train=treebank.tagged_sents(), backoff=t1)
    count = 0
    for tweet in data.get_tweets():
        count += 1
        print count
        tweet = remove_hashtags(tweet)
        tweet = remove_user_tags(tweet)
        tweet = remove_html_entities(tweet)
        tweet = remove_punctuation_deep(tweet)
        tweet = tokenize_and_remove_stopwords(tweet)
        tweet = remove_apostrophes(tweet)
        tweet = remove_multiple_spaces(tweet)
        tweet = translate_slang(tweet)
        tweet = pos_tag_filter(tweet, data, t2)
        if not is_empty(tweet):
            processed_tweets.append(tweet)
    data.set_tweets(processed_tweets)
def evaluate(self):
    '''run tests on treebank, conll2000 and brown data'''
    test = treebank.tagged_sents()[:100]
    treebank_result = (100 * self.classifier.evaluate(test))
    test = conll2000.tagged_sents()[:100]
    conll2000_result = (100 * self.classifier.evaluate(test))
    test = brown.tagged_sents()[int(len(brown.tagged_sents()) * 0.8):]
    brown_result = (100 * self.classifier.evaluate(test))
    return (treebank_result, conll2000_result, brown_result)
def from_treebank(klass):
    from nltk.corpus import brown, treebank
    probdist = klass()
    for sent in treebank.tagged_sents():
        for word, tag in sent:
            probdist.inc(word.lower(), tag)
    # the original iterated an undefined name "treebank_brown"; only brown and
    # treebank are imported here, and treebank is already counted above
    for sent in brown.tagged_sents():
        for word, tag in sent:
            probdist.inc(word.lower(), tag)
    for word, tag in get_lexicon():
        probdist.inc(word, tag, closed_class=False)
    for i in range(10):
        probdist.inc('can', 'VB')
    return probdist
def LemmatizeSents(self, sents):
    tagger = tagging(treebank.tagged_sents(),
                     [UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=None)
    newSents = []
    for sent in sents:
        taggedSent = tagger.tag(word_tokenize(sent))
        words = []
        for (wd, tg) in taggedSent:
            newTag = self.tagMap(tg)
            wd = WordNetLemmatizer().lemmatize(wd, newTag)
            words = words + [wd]
        newSent = ' '.join(words)
        #print(newSent)
        newSents.append(newSent)
    return newSents
def __init__(self, train_set='treebank'):
    '''
    Constructor
    '''
    # Before building a new tagger check if one has already been pickled
    if (os.path.exists(os.getcwd() + '/' + _pickle_file)):
        input = open(_pickle_file, 'rb')
        self._tagger = load(input)
        input.close()
        input = open(_test_sents_pickle_file, 'rb')
        self._test_sents = load(input)
        input.close()
    # Primitives necessary for training the Brill tagger.
    # Taken from cookbook
    else:
        if train_set == 'treebank':
            tagged_sents = list(treebank.tagged_sents())
        else:
            tagged_sents = list(brown.tagged_sents())
        random.shuffle(tagged_sents)
        split_index = int(round(0.8 * len(tagged_sents)))
        train_sents = tagged_sents[:split_index]
        self._test_sents = tagged_sents[split_index:]

        default_tagger = DefaultTagger('NN')
        tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
        initial_tagger = backoff_tagger(train_sents, tagger_classes,
                                        backoff=default_tagger)

        sym_bounds = [(1, 1), (2, 2), (1, 2), (1, 3)]
        asym_bounds = [(-1, -1), (1, 1)]

        templates = [
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, *sym_bounds),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, *sym_bounds),
            brill.ProximateTokensTemplate(brill.ProximateTagsRule, *asym_bounds),
            brill.ProximateTokensTemplate(brill.ProximateWordsRule, *asym_bounds)]

        # Train the tagger
        trainer = brill.FastBrillTaggerTrainer(initial_tagger, templates,
                                               deterministic=True)
        self._tagger = trainer.train(train_sents)

        # Pickle the trained tagger
        if not os.path.exists(os.getcwd() + '/pickles/'):
            os.mkdir(os.getcwd() + '/pickles/')
        output = open(_pickle_file, 'wb')
        dump(self._tagger, output, -1)
        output.close()
        output = open(_test_sents_pickle_file, 'wb')
        dump(self._test_sents, output, -1)
        output.close()
def tag_penn(words):
    """
    POS tags a list of words with a unigram tagger trained on the Penn
    Treebank tagged sentences.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    """
    pt_tagger = UnigramTagger(treebank.tagged_sents())
    tags = pt_tagger.tag(words)
    return tags
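# --- Usage sketch (added; not part of the original snippet) ---------------
# Assumes `UnigramTagger` and `treebank` are imported as tag_penn requires.
# Note that a plain UnigramTagger tags words unseen in the treebank sample as
# None, since no backoff tagger is configured here.
if __name__ == "__main__":
    print(tag_penn(["The", "dog", "barked", "."]))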