def question3():
    tagger = CRFTagger(feature_func=feature_func)
    tagger.train(train_sentences, 'model.crf.tagger')
    print(tagger.evaluate(test_sentences))
def run_crf(trainfile, testfile, model_file=None):
    maxlen = 100
    sents_train, tags_train, unique_words_train, unique_tags_train = \
        P.retrieve_sentences_tags(trainfile, maxlen=maxlen)
    sents_test, tags_test, unique_words_test, unique_tags_test = \
        P.retrieve_sentences_tags(testfile, maxlen=maxlen,
                                  allowedtags=unique_tags_train)

    # Build (token, tag) training pairs; the loader is assumed to return
    # UTF-8 byte strings, so decode() stands in for the original Python 2
    # unicode(..., "utf-8") calls.
    train_data = []
    for n, st in enumerate(sents_train):
        s = [(sents_train[n][m].decode('utf-8'), tags_train[n][m].decode('utf-8'))
             for m, _ in enumerate(st)]
        train_data.append(s)

    crf = CRFTagger()
    if model_file is None:
        crf.train(train_data, model_file='data/crf.mdl')
    else:
        crf.set_model_file(model_file)

    test_data = []
    for n, st in enumerate(sents_test):
        s = [(sents_test[n][m].decode('utf-8'), tags_test[n][m].decode('utf-8'))
             for m, _ in enumerate(st)]
        test_data.append(s)

    print(crf.evaluate(test_data))
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    stdout_old = sys.stdout
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')

    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()
    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()

    print('Loop #' + str(counter))
    sys.stdout.flush()

    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    # evaluate crf tagger
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)

    sys.stdout = stdout_old
def question3():
    tagger = CRFTagger(feature_func=feature_func)
    tagger.train(train_sentences, 'model_windows_size_1.crf.tagger')
    # to reuse the saved model instead of retraining:
    # tagger = CRFTagger(feature_func=feature_func)
    # tagger.set_model_file('model_windows_size_1.crf.tagger')
    print(tagger.evaluate(test_sentences))
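Both question3 variants pass a custom feature_func to CRFTagger without defining it. NLTK's CRFTagger calls that function with the sentence's token list and the current index and expects a list of feature strings back. A minimal sketch of such a function; the one-token window on each side is an assumption inferred from the model name, not the original code:

from nltk.tag import CRFTagger

def feature_func(tokens, idx):
    """Sketch of a CRFTagger feature function: receives the token list and
    the current index, returns a list of feature strings. The window-size-1
    neighbour features below are an assumption."""
    word = tokens[idx]
    features = ['WORD_' + word, 'SUF_' + word[-3:]]
    if word[0].isupper():
        features.append('CAPITALIZED')
    if any(ch.isdigit() for ch in word):
        features.append('HAS_NUM')
    # neighbouring tokens (window size 1)
    features.append('PREV_' + (tokens[idx - 1] if idx > 0 else '<BOS>'))
    features.append('NEXT_' + (tokens[idx + 1] if idx < len(tokens) - 1 else '<EOS>'))
    return features

tagger = CRFTagger(feature_func=feature_func)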
def main():
    import pickle
    from nltk.tag import CRFTagger

    infolist = pickle.load(open('infolist.pickle', 'rb'))
    split = round(0.9 * len(infolist))

    ct = CRFTagger()
    train_data = [[(x, z) for [x, y, z] in infolist[:split]]]
    ct.train(train_data, 'model.crf.tagger')

    ners = ct.tag_sents([[x for [x, y, z] in infolist[split:]]])
    print(ners)

    gold_sentences = [[(x, z) for [x, y, z] in infolist[split:]]]
    print(ct.evaluate(gold_sentences))
def crf_tag():
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]

    ct = CRFTagger()
    ct.train(train_sents, 'model.crf.tagger')  # train() returns None; the model lives in ct
    print(ct.evaluate(test_sents))

    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years"
    sent_w = sent3.lower().split()
    print(sent_w)
    tag = ct.tag(sent_w)
    print("The Tag Is:", tag)
def ontweetdata():
    # data from tweets
    tweetinfolist = pickle.load(open('tweetinfolist.pickle', 'rb'))

    # baseline: fraction of tokens tagged "O"
    counter = 0
    for item in tweetinfolist:
        if item[1] == "O":
            counter += 1
    print("BASELINE:", counter / len(tweetinfolist))

    split = round(0.9 * len(tweetinfolist))

    ct = CRFTagger()
    # train() writes and loads the model file itself, so the original
    # set_model_file() call placed before training was redundant and is dropped
    train_data = [[(x.lower(), y.lower()) for [x, y] in tweetinfolist[split:]]]
    ct.train(train_data, 'model.crf.tagger')

    gold_sentences = [[(x.lower(), y.lower()) for [x, y] in tweetinfolist[:split]]]
    print(ct.evaluate(gold_sentences))
def oninfolist():
    """TO DO: check the example data format described online and verify
    that all the lists built here match that shape."""
    # See: http://www.nltk.org/_modules/nltk/tag/crf.html
    infolist = pickle.load(open('sentencelist.pickle', 'rb'))
    limit = round(len(infolist) * 0.4)
    train_data = infolist[0:limit]

    ct = CRFTagger()
    # the original never trained or loaded a model, so tag_sents/evaluate
    # would fail; train on the slice prepared above
    ct.train(train_data, 'model.crf.tagger')

    # one-off cache of the untagged sentences:
    # realsentences = []
    # for sentence in infolist[limit:]:
    #     realsentences.append(" ".join(word for (word, nertag) in sentence))
    # pickle.dump(realsentences, open("realsentences.pickle", "wb"))
    # print("pickle file created")

    realsentences = pickle.load(open("realsentences.pickle", "rb"))
    print("REALSENTENCES:", realsentences[0:10])

    # e.g. [['dog', 'is', 'good'], ['cat', 'eat', 'meat']]
    splitsentences = [r.split() for r in realsentences]

    ct.tag_sents(splitsentences[limit:])
    gold_sentences = infolist[limit:]
    print("GOLD SENTENCES:", infolist[10:20])
    print(ct.evaluate(gold_sentences))
from nltk.tag import CRFTagger

ct = CRFTagger()
train_data = [[('Universiteit', 'Noun'), ('is', 'Verb'), ('een', 'Det'),
               ('goed', 'Adj'), ('goede', 'Adj'), ('plek', 'Noun'),
               ('hond', 'Noun'), ('eet', 'Verb'), ('vlees', 'Noun')]]
ct.train(train_data, 'model.crf.tagger')
ct.tag_sents([['hond', 'is', 'goed'], ['kat', 'eet', 'vlees']])

gold_sentences = [[('hond', 'Noun'), ('is', 'Verb'), ('goed', 'Adj')],
                  [('kat', 'Noun'), ('eet', 'Verb'), ('vlees', 'Noun')]]
ct.evaluate(gold_sentences)

# reload the saved model into a fresh tagger
ct = CRFTagger()
ct.set_model_file('model.crf.tagger')
print(ct.evaluate(gold_sentences))
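The example above uses CRFTagger with its default trainer settings. The constructor also takes a training_opt dict that is forwarded to the underlying python-crfsuite trainer; a small sketch, with illustrative rather than tuned values:

from nltk.tag import CRFTagger

# training_opt is handed through to the pycrfsuite trainer; c1/c2 are the
# L1/L2 regularisation weights and max_iterations caps the optimiser's
# iterations. The values below are illustrative, not tuned.
ct = CRFTagger(training_opt={
    'c1': 0.1,
    'c2': 0.01,
    'max_iterations': 200,
})
ct.train(train_data, 'model.crf.tagger')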
class DataAdapter(object):
    def __init__(self, data=None):
        data = data or []  # avoid a shared mutable default argument
        self.tagger = CRFTagger()
        self.tagger.set_model_file('model.crf.tagger')
        if data:  # the original tested data.count(True) > 0, presumably meaning "non-empty"
            self.data_tagging, self.data_testing = self.for_tagging_testing(data)

    def tokenize_tag(self, text):
        text = text.replace('\r', ' | ').replace('\n', ' | ')
        tokens = word_tokenize(text, preserve_line=True)
        labels = [label[1] for label in self.tag(tokens)]
        return tokens, labels

    def for_tagging_testing(self, data):
        # split each record into a bare token list and a (token, label) list
        array_tagging = []
        array_testing = []
        for d in data:
            all_tags = []
            all_test = []
            for index, t in enumerate(d['text']):
                all_test.append([t, d['label'][index]])
                all_tags.append(t)
            array_tagging.append(all_tags)
            array_testing.append(all_test)
        return array_tagging, array_testing

    def for_testing(self, data):
        array = []
        for d in data:
            array.append([[t, d['label'][index]] for index, t in enumerate(d['text'])])
        return array

    def for_tagging(self, data):
        return [list(d['text']) for d in data]

    def tag_sents(self):
        if self.data_tagging is not None:
            return self.tagger.tag_sents(self.data_tagging)
        return 'NoData'

    def tag(self, data):
        return self.tagger.tag(data)

    def evaluate(self):
        if self.data_testing is not None:
            return self.tagger.evaluate(self.data_testing)
        return 'NoData'

    def train(self, data):
        data = self.for_testing(data)
        self.tagger.train(data, 'model.crf.tagger')
        print('ACCURACY:', self.tagger.evaluate(data))
from nltk.corpus import treebank
from nltk.tag import tnt, CRFTagger

# split training data from test data
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]

# train a Trigrams'n'Tags (TnT) tagger
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
print(tnt_pos_tagger.evaluate(test_data))

# train a CRF tagger
crf_tagger = CRFTagger()
crf_tagger.train(train_data, '~/Documents/NLP/NLP/crf_model.txt')
print(crf_tagger.evaluate(test_data))
# -*- coding: utf-8 -*-
from nltk.tag import CRFTagger
import re
import codecs

def get_data():
    # split the CoNLL-U file into sentence blocks on the two-line # headers
    with codecs.open('th_pud-ud-test.conllu', 'r', encoding='utf8') as f:
        lines = f.read()
    return re.split("#(.*)+[\r\n]#(.*)+[\r\n]", lines)

data = get_data()

# collect (FORM, UPOS) pairs from columns 2 and 4 of each token line
data_all = []
for block in data:
    data_list = []
    for r in re.split('\n', block):
        t = [x for x in re.split('\t', r) if x != '']
        if t != []:
            data_list.append((t[1], t[3]))
    data_all.append(data_list)

# despite the name, this is the data the saved model is evaluated on
train_data = [x for x in data_all if x != []]

ct = CRFTagger()
ct.set_model_file('model.crf.tagger')
print(ct.evaluate(train_data))
# In[ ]:

tagger = BigramTagger(train_reducido[:1000])
tagger.evaluate(test_reducido[:1000])
entrenar_bill(tagger, "BigramTagger")

# In[ ]:

ct = CRFTagger()
ct.train(train_reducido[:1000], 'model.crf.tagger')
evaluacion = ct.evaluate(test_reducido[:1000])
xlabels.append("CRF Tagger")
accuracys.append(evaluacion)

# In[ ]:

tagger = PerceptronTagger(load=False)
tagger.train(train_reducido[:1000])
evaluacion = tagger.evaluate(test_reducido[:1000])
xlabels.append("Perceptron Tagger")
accuracys.append(evaluacion)

# In[ ]:
# fragment: the statements directly below run once per parsed page inside
# the surrounding loop (context elided); .decode('utf8') calls dropped
# since ElementTree .text is already str in Python 3
language = page.find('language').text
pos = page.find('pos_tags').text
splitText = text.split(" ")[1:-1]
posText = pos.split(" ")[1:-1]
for i in range(len(splitText)):
    l.append((splitText[i], posText[i]))
data.append(l)
count = count + 1

shuffle(data)
print(len(data))

# Divide data into train (90%) and test (10%) sets;
# the original variable was misleadingly named eightyPercent
split_point = int(count * 0.9)
training_set = data[:split_point]
test_set = data[split_point:]
print(training_set)

# Train
ct = CRFTagger()
ct.train(training_set, 'model.crf.tagger')

# Accuracy
gold_sentences = test_set
print(ct.evaluate(gold_sentences))

print("Give a sentence...")

# Test
test_sent = input().split(' ')
print(ct.tag_sents([test_sent]))
print "\nReading training corpus...." ListOfSentences_Training = corpusRead(Training_Data) print "Reading test corpus...." ListOfSentences_Test = corpusRead(Test_Data) #CRF Training ct = CRFTagger() print "CRF Training starts..." ct.train(ListOfSentences_Training,'model.crf.tagger') print "CRF Training is done." print "Testing starts" print "Accuracy of CRF is = ",ct.evaluate(ListOfSentences_Test) * 100 #Tagging by CRF Tagger ch = 'y' while (ch != 'n'): text = raw_input("Enter the text to be tagged : \n") text = converter(text) print ct.tag_sents(text) print "\nDo you want to continue ?" ch = raw_input() #HMM Training print "HMM Training using HiddenMarkovModelTrainer() starts.."
from nltk.tag import CRFTagger
from nltk.corpus import brown

tagged_sents = brown.tagged_sents()
train = tagged_sents[:50000]
test = tagged_sents[50000:]

crf = CRFTagger()
crf.train(train, 'crf_tagger.model')
print(crf.evaluate(test))
# assumed imports for this excerpt
import math
import os
import random
from statistics import mean, stdev

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import CRFTagger

def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(sentence_count / 10)

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists with ~347 sentences each
    ten_parts = list(chunks(pos_set, tenth))

    for counter, part in enumerate(ten_parts):
        # map test list to part of given loop
        test_set = part

        # filter out this loop's test part
        training_set_lists = [x for x in ten_parts if x is not part]

        # concatenate the remaining lists into one (http://stackoverflow.com/a/952952)
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled splits to file (as NLTK trainers expect)
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)
        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))
        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()
        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()

        print('Loop #' + str(counter))

        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')

        # evaluate crf tagger
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)

    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    return final_dict
# fragment: the file-reading loop above is elided
line = f.readline()
f.close()

res = ct.tag_sents(test_sentences)

# flatten predicted and gold tags into parallel lists for scoring
tagged_result = []
tagged_actual = []
for i in range(len(res)):
    for j in range(len(res[i])):
        tagged_result.append(res[i][j][1])
        tagged_actual.append(test_actual[i][j][1])

print(res[0])
print(test_actual[0])

gold_sentences = test_actual
accuracy = ct.evaluate(gold_sentences)
print("accuracy:" + str(accuracy))

# note: with a multi-class tag set, sklearn's scorers need an explicit
# average= argument (e.g. average='micro')
precision = precision_score(tagged_actual, tagged_result)
print("precision:" + str(precision))
recall = recall_score(tagged_actual, tagged_result)
print("recall:" + str(recall))
f1 = f1_score(tagged_actual, tagged_result)
print("F1_score:" + str(f1))
def onsentencelist():
    ct = CRFTagger()

    # sentencelist contains NER-tagged sentences
    sentencelist = pickle.load(open('sentencelist.pickle', 'rb'))

    # training size as a fraction; compute where to split the data
    trainingsize = 0.9
    limit = round(trainingsize * len(sentencelist))

    # wordsentencelist contains the same sentences without NER tags
    wordsentencelist = pickle.load(open("wordsentencelist.pickle", "rb"))

    # train the data / choose one of the two blocks
    # train_data = sentencelist[:limit]
    # ct.train(train_data, 'model.crf.tagger')
    ct.set_model_file('tweetmodel.crf.tagger')

    # tag the test data and evaluate
    test_data = wordsentencelist[limit:]
    ct.tag_sents(test_data)
    gold_sentences = sentencelist[limit:]
    print("\nAccuracy:", ct.evaluate(gold_sentences))

    # collect the predicted and true tags into flat lists
    pred_nerlist = []
    for sentence in wordsentencelist[:limit]:
        for (word, nertag) in ct.tag(sentence):
            pred_nerlist.append(nertag.lower())

    true_nerlist = []
    for sentence in sentencelist[:limit]:
        for (word, nertag) in sentence:
            true_nerlist.append(nertag.lower())

    # baseline = 0.9048987094135446 (everything tagged O)

    # print F-scores and the confusion matrix
    labels = ["o", "b-per", "i-per", "b-loc", "i-loc",
              "b-org", "i-org", "b-misc", "i-misc"]
    print("\nF-score (micro):", f1_score(true_nerlist, pred_nerlist, average='micro'))
    print("\nF-score (macro):", f1_score(true_nerlist, pred_nerlist, average='macro'))
    print("\nF-score (weighted):", f1_score(true_nerlist, pred_nerlist, average='weighted'))
    print("\nF-score (None):", f1_score(true_nerlist, pred_nerlist, average=None, labels=labels))

    print("\nConfusion matrix:\n")
    for item in ["O", "B-per", "I-per", "B-loc", "I-loc",
                 "B-org", "I-org", "B-misc", "I-misc"]:
        print(" ", item, end="")
    print("\n", confusion_matrix(true_nerlist, pred_nerlist, labels=labels))
train_data.append(lists_e)

print('Training the CRF++ model started')
ct.train(train_data, 'model.crf.tagger')  # train the CRF model on the training set
print('Training the CRF++ model completed')

test_number_lines = 0
print('Reading Unseen Data')
for lines in test_set:
    test_number_lines = test_number_lines + 1
    print('Processing file', test_number_lines)
    xx = lines.split()
    test_data.append(xx)

tagged_sent = ct.tag_sents(test_data)  # tag the unseen data
for tag_set in tagged_sent:
    tagged_text.write(str(tag_set))
    tagged_text.write('\n')
tagged_text.close()
print('Tagging the Unseen data Completed')

for vz in gold_line:
    vz = "[" + vz + "]"
    gold_list = ast.literal_eval(vz)
    gold_data.append(gold_list)

print('The Total Accuracy of the System is:' + str(ct.evaluate(gold_data)))