def train(self):
    """Train the model, report test-set metrics, and save the model."""
    self.initialize_model()
    x_train, y_train = self.corpus.generator()
    self.model.fit(x_train, y_train)
    labels = list(self.model.classes_)
    x_test, y_test = self.corpus.generator(train=False)
    y_predict = self.model.predict(x_test)
    print(metrics.flat_f1_score(y_test, y_predict, average='weighted', labels=labels))
    # group B- and I- tags of the same entity type together in the report
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(metrics.flat_classification_report(y_test, y_predict, labels=sorted_labels, digits=3))
    self.save_model()
def train_full(data=None):
    data = data or get_tokenizer()
    train_sents, test_sents = train_test_split(data, test_size=0.2, shuffle=False)
    # features/labels from the training split only
    X_train = [sent2features(sent2tokens(s)) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]
    X_test = [sent2features(sent2tokens(s)) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
        model_filename='models/model.bin'
    )
    crf.fit(X_train, y_train)
    start = time.time()
    y_pred = crf.predict(X_test)
    test_time = time.time() - start
    f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted')
    print("F1:", f1)
    print("Test time:", test_time)
    print(metrics.flat_classification_report(y_test, y_pred, digits=3))
def train_test(data=None):
    train_sents, dev_sents, test_sents = data or get_tokenizer()
    X_train = [sent2features(sent2tokens(s)) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]
    print(len(X_train), len(y_train))
    X_dev = [sent2features(sent2tokens(s)) for s in dev_sents]
    y_dev = [sent2labels(s) for s in dev_sents]
    X_test = [sent2features(sent2tokens(s)) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
        model_filename='model/model.bin'
    )
    crf.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)
    start = time.time()
    y_pred = crf.predict(X_test)
    test_time = time.time() - start
    f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted')
    print("F1: ", f1)
    print("Test time: ", test_time)
    print(metrics.flat_classification_report(y_test, y_pred, digits=3))
def _print_metrics(y_pred, y_true):
    labels = get_labels(y_true)
    print("Sequence accuracy: {:0.1%}".format(
        metrics.sequence_accuracy_score(y_true, y_pred)))
    print("Per-tag F1: {:0.3f}".format(
        metrics.flat_f1_score(y_true, y_pred, average='macro', labels=labels)))
    print("Per-tag classification report:\n{}".format(
        metrics.flat_classification_report(y_true, y_pred, labels=labels, digits=3)))
def fscore_crf(Y, y_pred, labels):
    # drop the 'O' tag so the score reflects entity labels only
    labels.remove('O')
    return metrics.flat_f1_score(Y, y_pred, average='weighted', labels=labels)
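# Sketch only (not part of the snippet above): fscore_crf() removes 'O' from the
# list it is given, so the caller's label list is mutated as a side effect.
# Filtering into a new list gives the same score without the mutation.
def fscore_crf_nonmutating(y_true, y_pred, labels):
    entity_labels = [label for label in labels if label != 'O']
    return metrics.flat_f1_score(y_true, y_pred, average='weighted', labels=entity_labels)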
Xtrain = [stofeatures(s) for s in trainwords]
ytrain = [stolabels(s) for s in trainwords]
Xtest = [stofeatures(s) for s in testwords]
ytest = [stolabels(s) for s in testwords]

# In[7]:

crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.15, c2=0.15,
                           max_iterations=100, all_possible_transitions=True)
crf.fit(Xtrain, ytrain)

# In[8]:

labels = list(crf.classes_)
predicted = crf.predict(Xtest)
metrics.flat_f1_score(ytest, predicted, average='weighted', labels=labels)

# In[9]:

labelsorted = sorted(labels, key=lambda n: (n[1:], n[0]))
print(metrics.flat_classification_report(ytest, predicted, labels=labelsorted, digits=3))
crf = CRF_baseline_NER()
print(crf.sent2features(conll.sentences[0])[0])

train_sents = conll.sentences[:40000]
test_sents = conll.sentences[40000:]
crf.X_train = [crf.sent2features(s) for s in train_sents]
crf.y_train = [crf.sent2labels(s) for s in train_sents]
crf.X_test = [crf.sent2features(s) for s in test_sents]
crf.y_test = [crf.sent2labels(s) for s in test_sents]
crf.train()

labels = list(crf.crf_model.classes_)
labels.remove('O')
print(labels)

y_pred = crf.crf_model.predict(crf.X_test)
f1_score = metrics.flat_f1_score(crf.y_test, y_pred, average='weighted', labels=labels)
precision_score = metrics.flat_precision_score(crf.y_test, y_pred, average='weighted', labels=labels)
recall_score = metrics.flat_recall_score(crf.y_test, y_pred, average='weighted', labels=labels)
stats = metrics.flat_classification_report(crf.y_test, y_pred, labels=labels)
print("Precision: " + str(precision_score))
print("Recall: " + str(recall_score))
print("F1-score: " + str(f1_score))
print(stats)

filename = '../Models/crf_baseline_model.sav'
pickle.dump(crf.crf_model, open(filename, 'wb'))
print("Done with all")
                  verbose=1, n_jobs=1, n_iter=20, scoring=f1_scorer)
rs.fit(X_train, y_train)

# In[78]:

print('Best params:', rs.best_params_)
print('Best F-1 score:', rs.best_score_)

# In[14]:

# fit the model with the hyperparameters found above (c1=0.055, c2=0.066)
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.055, c2=0.066,
                           max_iterations=1000, all_possible_transitions=True,
                           verbose=False)
crf.fit(X_train, y_train)

labels = ["O", "D", "T"]

# predict the entities for the test data
y_pred = crf.predict(X_test)
print("Macro-averaged F1 score over the D, T and O labels is %f" %
      metrics.flat_f1_score(y_test, y_pred, average='macro', labels=labels))

# print the classification report
showreport(y_test, y_pred)
def lbfgs(train_X, train_Y, test_X, test_Y):
    algorithms = ['lbfgs']
    min_frequencies = [0, 0.02]
    all_states = [True, False]
    all_transitions = [True, False]
    c1s = [0, 0.01, 0.05, 0.1]
    c2s = [0, 0.01, 0.05, 0.1]
    i = 1
    N = (len(algorithms) * len(min_frequencies) * len(all_states) *
         len(all_transitions) * len(c1s) * len(c2s))
    start = time.time()
    results = []
    for algo in algorithms:
        for min_freq in min_frequencies:
            for all_state in all_states:
                for all_transition in all_transitions:
                    for c1 in c1s:
                        for c2 in c2s:
                            print(round(100 * i / N), '%')
                            print('Time elapsed: {} s'.format(round(time.time() - start)))
                            i += 1
                            params = {
                                'algo': algo,
                                'min_freq': min_freq,
                                'all_state': all_state,
                                'all_transition': all_transition,
                                'c1': c1,
                                'c2': c2
                            }
                            print(params)
                            try:
                                crf = sklearn_crfsuite.CRF(
                                    algorithm=algo,
                                    c1=c1,
                                    c2=c2,
                                    max_iterations=1000,
                                    all_possible_transitions=all_transition,
                                    all_possible_states=all_state,
                                    min_freq=min_freq)
                                crf.fit(train_X, train_Y)
                                pred_Y = crf.predict(test_X)
                                f1 = metrics.flat_f1_score(
                                    test_Y, pred_Y, average='weighted',
                                    labels=['per', 'org', 'misc', 'loc', 'notpropn'])
                                res = metrics.flat_classification_report(
                                    test_Y, pred_Y,
                                    labels=['per', 'org', 'misc', 'loc', 'notpropn'],
                                    digits=4)
                                results.append((f1, params))
                                print(res)
                                print()
                            except Exception:
                                print('Invalid parameter combination.')
                                continue
    with open('results/lbfgs', 'wb') as file:
        pickle.dump(results, file)
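# Sketch only: the six nested loops above can be expressed as a single loop over
# itertools.product. Nothing here comes from the original code beyond the grid
# values already defined in lbfgs(); the helper name is illustrative.
from itertools import product

def iter_param_grid(algorithms, min_frequencies, all_states, all_transitions, c1s, c2s):
    # yields one parameter dict per combination, in the same order as the nested loops
    for algo, min_freq, all_state, all_transition, c1, c2 in product(
            algorithms, min_frequencies, all_states, all_transitions, c1s, c2s):
        yield {'algo': algo, 'min_freq': min_freq, 'all_state': all_state,
               'all_transition': all_transition, 'c1': c1, 'c2': c2}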
X_test = [sent2features(s) for s in chat_sequence_all]
y_test = [sent2labels(s) for s in chat_sequence_all]

crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                           max_iterations=100, all_possible_transitions=True,
                           verbose=True)
print("starting train..........\n")
crf.fit(X_train, y_train)

print("Following are the classes:\n")
labels = list(crf.classes_)
print(labels)

y_pred = crf.predict(X_test)
print("weighted f1 score......\n")
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))

sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print("class wise distribution.......\n")
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

file_out.close()
# The true labels for the test set data
a = []
for i in test_data:
    t = []
    for j in i:
        t.append(j[1])
    a.append(t)

# In[22]:

# To check the tagging quality
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# Weighted F1 over the predicted tags
metrics.flat_f1_score(a, ans, average='weighted', labels=labels)

# In[23]:

# Per-label classification report for the model
print(metrics.flat_classification_report(a, ans, labels=sorted_labels, digits=3))

# ## Module to implement CRF.

# In[24]:

# pip3 install sklearn-crfsuite  # install this first
train_sents = corpus
def test_flat_f1_score_binary():
    s = [["x", "y"], ["x", "y"]]
    score = metrics.flat_f1_score(s, s, average='weighted')
    assert score == 1.0
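# A sketch of the behaviour the test above exercises (an assumption about the
# library's internals, not code from this project): the flat_* metrics flatten
# the per-sentence label sequences into one long list and delegate to the
# corresponding scikit-learn metric.
from itertools import chain
from sklearn.metrics import f1_score

def flat_f1_manual(y_true, y_pred, **kwargs):
    return f1_score(list(chain.from_iterable(y_true)),
                    list(chain.from_iterable(y_pred)), **kwargs)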
crf_final = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.5, c2=0.5,
                                 max_iterations=100, all_possible_transitions=True)
crf_final.fit(X_train, y_train)

# Keep only the 'B' and 'I' labels for F1 scoring
labels = list(crf_final.classes_)
labels.remove('O')

######## CLASSIFICATION #########
print("Running on the training set")
y_train_pred = crf_final.predict(X_train)
print("F1-score: " + str(
    metrics.flat_f1_score(y_train, y_train_pred, average='weighted', labels=labels)))

if args.dev:
    print("Running on the dev set")
    y_dev_pred = crf_final.predict(X_dev)
    print("F1-score: " + str(
        metrics.flat_f1_score(y_dev, y_dev_pred, average='weighted', labels=labels)))

print("Running on the testing set")
y_test_pred = crf_final.predict(X_test)

######## OUTPUT #########
def generate_output(pred, outputfile):
    f = open(outputfile, 'w')
if tok.find("PER") != -1 or tok.find("per") != -1 or tok.find( "musicartist") != -1: temp.append(3) else: if tok.find("MISC") != -1: temp.append(4) else: temp.append(4) y.append(temp) sorted_labels = definitions.KLASSES.copy() del sorted_labels[4] print("------------------------------------------------------") print flat_f1_score(y, new, average='weighted', labels=sorted_labels.keys()) print flat_f1_score(y, old, average='weighted', labels=sorted_labels.keys()) print "-----------------------------------------" print( flat_classification_report(y, new, labels=sorted_labels.keys(), target_names=sorted_labels.values(), digits=3)) print( flat_classification_report(y, old, labels=sorted_labels.keys(), target_names=sorted_labels.values(), digits=3))
def test_flat_fscore():
    score = metrics.flat_f1_score(y1, y2, average='macro')
    assert score == 2 / 3
    assert metrics.flat_fbeta_score(y1, y2, beta=1, average='macro') == score
def gen_model(self, x_train, y_train, x_test, y_test):
    # strip the BIO prefixes so only the entity type remains
    for i in range(len(y_train)):
        for j in range(len(y_train[i])):
            y_train[i][j] = y_train[i][j].replace('B-', '')
            y_train[i][j] = y_train[i][j].replace('O-', '')
            y_train[i][j] = y_train[i][j].replace('I-', '')
    for i in range(len(y_test)):
        for j in range(len(y_test[i])):
            y_test[i][j] = y_test[i][j].replace('B-', '')
            y_test[i][j] = y_test[i][j].replace('O-', '')
            y_test[i][j] = y_test[i][j].replace('I-', '')

    labels = ['DOS', 'UNIT', 'FREQ', 'DUR', 'WHO']
    # labels = ['O-DOS', 'B-DOS', 'I-UNIT', 'B-UNIT', 'O-UNIT', 'I-FREQ', 'B-FREQ', 'O-FREQ', 'I-DUR', 'B-DUR', 'O-DUR', 'I-WHO', 'B-WHO', 'O-WHO']
    # labels = ['m', 'r', 'f', 'do', 'du', 'mo']

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs', max_iterations=100,
                               all_possible_transitions=True)
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }
    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)
    # search
    rand_search = RandomizedSearchCV(crf, params_space, cv=3, verbose=1,
                                     n_jobs=-1, n_iter=50, scoring=f1_scorer)
    rand_search.fit(x_train, y_train)

    crf = rand_search.best_estimator_
    y_prediction = crf.predict(x_test)

    # group B and I results
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    joblib.dump(crf, 'model.pkl')

    precision = metrics.flat_precision_score(y_test, y_prediction, labels=sorted_labels, average='micro')
    recall = metrics.flat_recall_score(y_test, y_prediction, labels=sorted_labels, average='micro')
    f1 = metrics.flat_f1_score(y_test, y_prediction, labels=sorted_labels, average='micro')
    print('MICRO')
    print(precision, recall, f1)

    precision = metrics.flat_precision_score(y_test, y_prediction, labels=sorted_labels, average='macro')
    recall = metrics.flat_recall_score(y_test, y_prediction, labels=sorted_labels, average='macro')
    f1 = metrics.flat_f1_score(y_test, y_prediction, labels=sorted_labels, average='macro')
    print('MACRO')
    print(precision, recall, f1)

    return metrics.flat_classification_report(y_test, y_prediction, labels=sorted_labels, digits=3)
def validate_performance(self, test_set):
    sentences = self.__load_corpus__(test_set)
    y_test = [self.model.sentence2labels(s) for s in sentences]
    y_prediction = []
    for i, sent in enumerate(sentences):
        new_sent = ' '.join([word[0] for word in sent])
        prediction = self.model.predict(new_sent)
        if len(prediction) > 1:
            # flatten multi-sentence predictions into a single sequence
            new_prediction = []
            for p in prediction:
                new_prediction += [p1 for p1 in p]
            prediction = new_prediction
        else:
            prediction = prediction[0]
        try:
            pred = [w[1] for w in prediction]
        except Exception:
            print(prediction)
            return
        y_prediction.append(pred)

    labels = ['O-DOS', 'B-DOS', 'I-UNIT', 'B-UNIT', 'O-UNIT', 'I-FREQ', 'B-FREQ',
              'O-FREQ', 'I-DUR', 'B-DUR', 'O-DUR', 'I-WHO', 'B-WHO', 'O-WHO']

    # strip the BIO prefixes so only the entity type remains
    for i in range(len(y_prediction)):
        for j in range(len(y_prediction[i])):
            y_prediction[i][j] = y_prediction[i][j].replace('B-', '')
            y_prediction[i][j] = y_prediction[i][j].replace('O-', '')
            y_prediction[i][j] = y_prediction[i][j].replace('I-', '')
    for i in range(len(y_test)):
        for j in range(len(y_test[i])):
            y_test[i][j] = y_test[i][j].replace('B-', '')
            y_test[i][j] = y_test[i][j].replace('O-', '')
            y_test[i][j] = y_test[i][j].replace('I-', '')

    labels = ['DOS', 'UNIT', 'FREQ', 'DUR', 'WHO']
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

    precision = metrics.flat_precision_score(y_test, y_prediction, labels=sorted_labels, average='micro')
    recall = metrics.flat_recall_score(y_test, y_prediction, labels=sorted_labels, average='micro')
    f1 = metrics.flat_f1_score(y_test, y_prediction, labels=sorted_labels, average='micro')
    print('MICRO')
    print(precision, recall, f1)

    precision = metrics.flat_precision_score(y_test, y_prediction, labels=sorted_labels, average='macro')
    recall = metrics.flat_recall_score(y_test, y_prediction, labels=sorted_labels, average='macro')
    f1 = metrics.flat_f1_score(y_test, y_prediction, labels=sorted_labels, average='macro')
    print('MACRO')
    print(precision, recall, f1)

    print(metrics.flat_classification_report(y_test, y_prediction, labels=sorted_labels, digits=3))
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

labels = list(crf.classes_)
labels.remove('O')

y_pred = crf.predict(X_test)
# print(y_pred)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=['B-geo', 'I-geo']))

tweetsFile = 'datasets/tweet-dataset.csv'
tweetsTestData = readTweetForTesting(tweetsFile)['tweets'].head(50)

# pipelineModelFile = 'multinomialNB.pkl'
# pipeline = loadModel(pipelineModelFile)
# result = predictLocation(tweetsTestData)
# print(result)

# Save model
# saveModel(crf, 'namedEntityRecognition.pkl')
def train(self, test_size=0.2, max_iterations=100, fold5valid=False):
    full_set_labels = []
    for sent in self.full_set:
        set_lab = []
        for word in sent:
            set_lab.append(word[1])
        full_set_labels.append(set_lab)

    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
        self.full_set, full_set_labels, test_size=test_size, random_state=0)
    self.x_train = [self.sent2features(s) for s in self.x_train]
    self.x_test = [self.sent2features(s) for s in self.x_test]
    print("Starting Training on " + str(len(self.x_train)) + " sentences...")

    batch_size = len(self.x_train) // 5  # integer size of each fold
    scores = []
    if fold5valid:
        for i in range(5):
            self.crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                                            max_iterations=max_iterations,
                                            all_possible_transitions=True)
            indices = set(range(i * batch_size, (i + 1) * batch_size))
            train_batch = [x for j, x in enumerate(self.x_train) if j not in indices]
            test_batch = [x for j, x in enumerate(self.x_train) if j in indices]
            train_labels = [x for j, x in enumerate(self.y_train) if j not in indices]
            test_labels = [x for j, x in enumerate(self.y_train) if j in indices]
            self.crf.fit(train_batch, train_labels)
            labels = list(self.crf.classes_)
            labels.remove("N")
            y_pred = self.crf.predict(test_batch)
            val = metrics.flat_f1_score(test_labels, y_pred, average='weighted', labels=labels)
            scores.append(val)
        import numpy
        scores = numpy.array(scores)
        print("5 Fold scores:" + str(scores))
        f1score = scores.mean(), scores.std() * 2
        print("F1 Score: %0.2f (+/- %0.2f)" % f1score)
        # self.crf.fit(self.x_train, self.y_train)
        self.trained = True
        print("Finished training...")
        return f1score
    else:
        import scipy
        from sklearn.metrics import make_scorer
        from sklearn.model_selection import RandomizedSearchCV

        self.crf = sklearn_crfsuite.CRF(algorithm='lbfgs', all_possible_transitions=True)
        params_space = {
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05),
            'max_iterations': range(20, 100),
        }
        self.crf.fit(self.x_train, self.y_train)
        labels = list(self.crf.classes_)
        labels.remove('N')
        # use the same metric for evaluation
        f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)
        # search
        rs = RandomizedSearchCV(self.crf, params_space, cv=5, verbose=1,
                                n_jobs=-1, n_iter=100, scoring=f1_scorer)
        rs.fit(self.x_train, self.y_train)
        # crf = rs.best_estimator_
        print('best params:', rs.best_params_)
        print('best CV score:', rs.best_score_)
        print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
        self.trained = True
        return rs
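# A sketch (not from the class above) of the same 5-fold evaluation using
# scikit-learn's cross-validation utilities instead of slicing batches by hand.
# `x_train`, `y_train`, and the label list are assumed to be prepared as in train().
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
import sklearn_crfsuite
from sklearn_crfsuite import metrics

def fold5_f1(x_train, y_train, labels):
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                               max_iterations=100, all_possible_transitions=True)
    f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)
    scores = cross_val_score(crf, x_train, y_train, cv=5, scoring=f1_scorer)
    return scores.mean(), scores.std() * 2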
                           c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
crf.fit(X_train, y_train)

labels = list(crf.classes_)
labels.remove('O')

y_pred = crf.predict(X_test)
print("Full Test Accuracy:", crf.score(X_test, y_test))
print("Full Test F1 Score:",
      metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=crf.classes_))
print("Trimmed Test F1 Score:",
      metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))

sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

"""========================== append prior predictions and re-classify ========================"""
for sent, sent_labels in enumerate(y_train):
    for word, label in enumerate(sent_labels):
        if word > 0:
            # feature: entity tag of the previous token
            X_train[sent][word]['prev.ent'] = sent_labels[word - 1]
X_test = [X[i] for i in range(len(X)) if groups[i] == gid]
y_test = [Y[i] for i in range(len(Y)) if groups[i] == gid]

%%time
crf = sklearn_crfsuite.CRF(
    algorithm='pa',
    c=0.1,
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(X_train, y_train)

labels = list(crf.classes_)
labels.remove('O')
labels

y_pred = crf.predict(X_test)
cross_val_results.append(metrics.flat_f1_score(y_test, y_pred, average='macro', labels=labels))

np.mean(cross_val_results)

def grid_search(X, y, labels):
    crf = sklearn_crfsuite.CRF(
        algorithm='pa',
        max_iterations=100,
        all_possible_transitions=False
    )
    params_space = {
        'c': [0.1]
    }
from os.path import join, dirname
import time

import joblib
import pycrfsuite
from sklearn_crfsuite import metrics

from load_data import load_dataset

transformer = joblib.load(join(dirname(__file__), "model", "transformer.bin"))
path = join(dirname(__file__), "model", "model.bin")
estimator = pycrfsuite.Tagger()
estimator.open(path)

test_set = load_dataset(
    join(dirname(dirname(dirname(__file__))), "data", "vlsp2016", "corpus", "test.txt"))
X_test, y_test = transformer.transform(test_set)

start = time.time()
y_pred = [estimator.tag(x) for x in X_test]
test_time = time.time() - start

f1_test_score = metrics.flat_f1_score(y_test, y_pred, average='weighted')
print("F1 score: ", f1_test_score)
print("Test time: ", test_time)

with open("report.txt", "w") as f:
    f.write("F1 score: " + str(f1_test_score) + "\n" + "Test time: " + str(test_time))
                           c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=False)

from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

pred = cross_val_predict(estimator=crf, X=Xtrain, y=Ytrain, cv=5)
report = flat_classification_report(y_pred=pred, y_true=Ytrain)

#%%
crf.fit(Xtrain, Ytrain)
y_pred = crf.predict(Xtest)
metrics.flat_f1_score(Ytest, y_pred, average='weighted')
print(metrics.flat_classification_report(Ytest, y_pred, digits=3))

#%%
print(report)
print(y_pred[0])
print(output_sql[2000])
print(main_lst1[2000])
print()

#%%
X = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False).fit_transform(lem_data)
# print(ngram_vectorizer.fit(data))
def unified_approach(file_name, file_2):
    # train_path = "../Data/bio-ner/train"
    # dev_path = "../Data/bio-ner/dev"
    # create_file(train_path, "train")
    # create_file(dev_path, "dev")
    # exclude = ["Value", "Time", "Unit", "Location"]
    train_sentences = file_opener(file_name)
    dev_sentences = file_opener(file_2)

    x_train = [sentence_features(s) for s in train_sentences]
    y_train = [sentence_labels(s) for s in train_sentences]
    x_dev = [sentence_features(s) for s in dev_sentences]
    y_dev = [sentence_labels(s) for s in dev_sentences]

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.09684573395986483,
        c2=0.0800864058815976,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(x_train, y_train)

    labels = list(crf.classes_)
    labels.remove('O')
    y_predicted = crf.predict(x_dev)

    # Build flat lists so the IOB prefix and the class part can be scored separately
    y_pred_flat = []
    y_pred_iob = []
    y_pred_class = []
    y_dev_flat = []
    y_dev_iob = []
    y_dev_class = []
    for x in y_predicted:
        y_pred_flat += x
        for xx in x:
            y_pred_iob.append(xx[0])
            if xx != 'O':
                y_pred_class.append(xx[2:])
            else:
                y_pred_class.append('O')
    for x in y_dev:
        y_dev_flat += x
        for xx in x:
            y_dev_iob.append(xx[0])
            if xx != 'O':
                y_dev_class.append(xx[2:])
            else:
                y_dev_class.append('O')

    labels = list(set(y_pred_flat))
    labels.remove("O")
    print(labels)

    f1 = metrics.flat_f1_score(y_dev, y_predicted, average='weighted', labels=labels)

    # labels = list(set(y_pred_iob))
    # labels.remove('O')
    # iob_score = f1_score(y_dev_iob, y_pred_iob, average='weighted', labels=labels)
    # print("IOB Score:", iob_score)
    # labels = list(set(y_pred_class))
    # labels.remove('O')
    # class_score = f1_score(y_dev_class, y_pred_class, average='weighted', labels=labels)
    # print("Class Score:", class_score)

    print("Overall Score:", f1)
    return f1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6)

crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100,
          all_possible_transitions=False)
crf.fit(X_train, y_train)

# Predict on the test set.
y_pred = crf.predict(X_test)

for i in range(len(y_pred)):
    prediction = y_pred[i]
    testList = X_test[i]
    testSentence = ""
    for testTuple in testList:
        testSentence = testSentence + testTuple['word.lower()'] + ' '
    words = testSentence.split(" ")
    x = 0
    for wordPrediction in prediction:
        if wordPrediction in ('B-date', 'B-amt', 'B-mer', 'I-mer', 'I-date'):
            print(words[x], wordPrediction)
        x += 1

f1_score = flat_f1_score(y_test, y_pred, average='weighted')
print(f1_score)
report = flat_classification_report(y_test, y_pred)
print(report)
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                           max_iterations=100, all_possible_transitions=True)
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)

labels = list(crf.classes_)
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

print('Number of test sentences used = 10')
print('----------------------Viterbi Results---------------------------')
print('Viterbi Weighted F1 Score :',
      metrics.flat_f1_score(actual_tag, seq, average='weighted', labels=labels))
print(metrics.flat_classification_report(actual_tag, seq, labels=sorted_labels, digits=3))
print('------------------------CRF Results-----------------------------')
print('CRF Weighted F1 Score :',
      metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
def cross_validate(self, training_dataset=None, num_folds=5, prediction_directory=None,
                   groundtruth_directory=None, asynchronous=False):
    """
    Performs k-fold stratified cross-validation using our model and pipeline.

    If the training dataset, groundtruth_directory and prediction_directory are passed,
    intermediate predictions made during cross validation are written to the prediction
    directory. This allows one to construct a confusion matrix or to compute the prediction
    ambiguity with the methods present in the Dataset class, supporting pipeline development
    without a designated evaluation set.

    :param training_dataset: Dataset that is being cross validated (optional)
    :param num_folds: number of folds to split training data into for cross validation
    :param prediction_directory: directory to write predictions of cross validation to, or `True` for the default predictions sub-directory.
    :param groundtruth_directory: directory to write the ground truth medaCy evaluates on
    :param asynchronous: Boolean for whether the preprocessing should be done asynchronously.
    :return: prints performance metrics; returns a Dataset of predictions if prediction_directory is set, otherwise the per-fold statistics
    """
    if num_folds <= 1:
        raise ValueError(
            "Number of folds for cross validation must be greater than 1, but is %s" % repr(num_folds))
    if prediction_directory is not None and training_dataset is None:
        raise ValueError(
            "Cannot generate predictions during cross validation if training dataset is not given."
            " Please pass the training dataset in the 'training_dataset' parameter.")
    if groundtruth_directory is not None and training_dataset is None:
        raise ValueError(
            "Cannot generate groundtruth during cross validation if training dataset is not given."
            " Please pass the training dataset in the 'training_dataset' parameter.")

    pipeline_report = self.pipeline.get_report()
    self.preprocess(training_dataset, asynchronous)

    if not (self.X_data and self.y_data):
        raise RuntimeError("Must have features and labels extracted for cross validation")

    tags = sorted(training_dataset.get_labels(as_list=True))
    self.pipeline.entities = tags
    logging.info('Tagset: %s', tags)

    eval_stats = {}

    # Dicts mapping each source file to the entity spans found in it
    groundtruth_by_document = {filename: [] for filename in {x[2] for x in self.X_data}}
    preds_by_document = {filename: [] for filename in {x[2] for x in self.X_data}}

    folds = create_folds(self.y_data, num_folds)
    for fold_num, fold_data in enumerate(folds, 1):
        train_indices, test_indices = fold_data
        fold_statistics = {}
        learner_name, learner = self.pipeline.get_learner()

        X_train = [self.X_data[index] for index in train_indices]
        y_train = [self.y_data[index] for index in train_indices]
        X_test = [self.X_data[index] for index in test_indices]
        y_test = [self.y_data[index] for index in test_indices]

        logging.info("Training Fold %i", fold_num)
        train_data = [x[0] for x in X_train]
        test_data = [x[0] for x in X_test]
        learner.fit(train_data, y_train)
        y_pred = learner.predict(test_data)

        if groundtruth_directory is not None:
            # Flattening nested structures into 2d lists
            document_indices = []
            span_indices = []
            for sequence in X_test:
                document_indices += [sequence[2]] * len(sequence[0])
                span_indices += list(sequence[1])
            groundtruth = [element for sentence in y_test for element in sentence]

            # Map the ground truth sequences to their corresponding documents
            i = 0
            while i < len(groundtruth):
                if groundtruth[i] == 'O':
                    i += 1
                    continue
                entity = groundtruth[i]
                document = document_indices[i]
                first_start, first_end = span_indices[i]
                # Ensure that consecutive tokens with the same label are merged
                while i < len(groundtruth) - 1 and groundtruth[i + 1] == entity:
                    # If inside entity, keep incrementing
                    i += 1
                last_start, last_end = span_indices[i]
                groundtruth_by_document[document].append((entity, first_start, last_end))
                i += 1

        if prediction_directory is not None:
            # Flattening nested structures into 2d lists
            document_indices = []
            span_indices = []
            for sequence in X_test:
                document_indices += [sequence[2]] * len(sequence[0])
                span_indices += list(sequence[1])
            predictions = [element for sentence in y_pred for element in sentence]

            # Map the predicted sequences to their corresponding documents
            i = 0
            while i < len(predictions):
                if predictions[i] == 'O':
                    i += 1
                    continue
                entity = predictions[i]
                document = document_indices[i]
                first_start, first_end = span_indices[i]
                # Ensure that consecutive tokens with the same label are merged
                while i < len(predictions) - 1 and predictions[i + 1] == entity:
                    # If inside entity, keep incrementing
                    i += 1
                last_start, last_end = span_indices[i]
                preds_by_document[document].append((entity, first_start, last_end))
                i += 1

        # Write the metrics for this fold.
        for label in tags:
            fold_statistics[label] = {
                "recall": metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=[label]),
                "precision": metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=[label]),
                "f1": metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=[label])
            }

        # add averages
        fold_statistics['system'] = {
            "recall": metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=tags),
            "precision": metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=tags),
            "f1": metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=tags)
        }

        table_data = [[label,
                       format(fold_statistics[label]['precision'], ".3f"),
                       format(fold_statistics[label]['recall'], ".3f"),
                       format(fold_statistics[label]['f1'], ".3f")]
                      for label in tags + ['system']]
        logging.info('\n' + tabulate(table_data,
                                     headers=['Entity', 'Precision', 'Recall', 'F1'],
                                     tablefmt='orgtbl'))

        eval_stats[fold_num] = fold_statistics

    statistics_all_folds = {}
    for label in tags + ['system']:
        statistics_all_folds[label] = {
            'precision_average': mean(eval_stats[fold][label]['precision'] for fold in eval_stats),
            'precision_max': max(eval_stats[fold][label]['precision'] for fold in eval_stats),
            'precision_min': min(eval_stats[fold][label]['precision'] for fold in eval_stats),
            'recall_average': mean(eval_stats[fold][label]['recall'] for fold in eval_stats),
            'recall_max': max(eval_stats[fold][label]['recall'] for fold in eval_stats),
            'f1_average': mean(eval_stats[fold][label]['f1'] for fold in eval_stats),
            'f1_max': max(eval_stats[fold][label]['f1'] for fold in eval_stats),
            'f1_min': min(eval_stats[fold][label]['f1'] for fold in eval_stats),
        }

    entity_counts = training_dataset.compute_counts()
    table_data = [[f"{label} ({entity_counts[label]})",  # Entity (Count)
                   format(statistics_all_folds[label]['precision_average'], ".3f"),
                   format(statistics_all_folds[label]['recall_average'], ".3f"),
                   format(statistics_all_folds[label]['f1_average'], ".3f"),
                   format(statistics_all_folds[label]['f1_min'], ".3f"),
                   format(statistics_all_folds[label]['f1_max'], ".3f")]
                  for label in tags + ['system']]

    # Combine the pipeline report and the resulting data, then log it or print it (whichever ensures that it prints)
    output_str = '\n' + pipeline_report + '\n\n' + tabulate(
        table_data,
        headers=['Entity (Count)', 'Precision', 'Recall', 'F1', 'F1_Min', 'F1_Max'],
        tablefmt='orgtbl')
    if logging.root.level > logging.INFO:
        print(output_str)
    else:
        logging.info(output_str)

    if prediction_directory:
        prediction_directory = os.path.join(training_dataset.data_directory, "predictions")
        groundtruth_directory = os.path.join(training_dataset.data_directory, "groundtruth")
        # Write annotations generated from cross-validation
        self.create_annotation_directory(directory=prediction_directory,
                                         training_dataset=training_dataset,
                                         option="predictions")
        # Write medaCy ground truth generated from cross-validation
        self.create_annotation_directory(directory=groundtruth_directory,
                                         training_dataset=training_dataset,
                                         option="groundtruth")
        # Add predicted/known annotations to the folders containing groundtruth and predictions respectively
        self.predict_annotation_evaluation(directory=groundtruth_directory,
                                           training_dataset=training_dataset,
                                           preds_by_document=preds_by_document,
                                           groundtruth_by_document=groundtruth_by_document,
                                           option="groundtruth")
        self.predict_annotation_evaluation(directory=prediction_directory,
                                           training_dataset=training_dataset,
                                           preds_by_document=preds_by_document,
                                           groundtruth_by_document=groundtruth_by_document,
                                           option="predictions")
        return Dataset(prediction_directory)
    else:
        return statistics_all_folds
def get_f1_score(self):
    return metrics.flat_f1_score(self.y_test, self.y_predict,
                                 average='weighted', labels=self.labels)
print("=======================") print("Load trained model ...") model = pickle.load(open("./models/" + MODEL_NAME, "rb")) print("Done!!!") predict = model.predict(X_test) print("=======================") print("Testing ....") print(len(y_test), len(predict)) avg_count = 0 print(predict[0]) for i in range(len(y_test)): acc = evaluate(predict[i], y_test[i]) # print(acc) avg_count += acc # print(score) print("Avg acc:", avg_count / float(len(y_test))) print(model.classes_) print("Accuracy\t:", metrics.flat_accuracy_score(y_test, predict)) print("Precision\t:", metrics.flat_precision_score(y_test, predict, average=None)) print("Recall\t:", len(metrics.flat_recall_score(y_test, predict, average=None))) print("F1\t:", metrics.flat_f1_score(y_test, predict, average=None)) print("Done!!!")
def cross_validate(self, folds=10, training_dataset=None, spacy_model_name=None, epochs=None):
    """
    Runs a cross validation.

    :param folds: Number of folds to use for the cross validation.
    :param training_dataset: Path to the directory of BRAT files to use for the training data.
    :param spacy_model_name: Name of the spaCy model to start from.
    :param epochs: Number of epochs to use for every fold training.
    """
    if folds <= 1:
        raise ValueError("Number of folds for cross validation must be greater than 1")
    if training_dataset is None:
        raise ValueError("Need a dataset to evaluate")
    if spacy_model_name is None:
        raise ValueError("Need a spacy model to start with")

    train_data = training_dataset.get_training_data()
    x_data, y_data = zip(*train_data)

    skipped_files = []
    evaluation_statistics = {}

    folds = SequenceStratifiedKFold(folds=folds)
    fold = 1

    for train_indices, test_indices in folds(x_data, y_data):
        logging.info("\n----EVALUATING FOLD %d----", fold)
        self.model = None
        fold_statistics = {}

        x_subdataset = training_dataset.get_subdataset(train_indices)
        self.fit(x_subdataset, spacy_model_name, epochs)
        logging.info('Done training!\n')
        nlp = self.model

        labels = list(x_subdataset.get_labels())

        y_subdataset = training_dataset.get_subdataset(test_indices)
        y_test = []
        y_pred = []
        for data_file in y_subdataset.get_data_files():
            ann_path = data_file.get_annotation_path()
            annotations = Annotations(ann_path)
            txt_path = data_file.get_text_path()
            with open(txt_path, 'r') as source_text_file:
                text = source_text_file.read()
            doc = nlp(text)

            test_entities = annotations.get_spacy_entities()
            test_entities = self.entities_to_biluo(doc, test_entities)
            y_test.append(test_entities)

            pred_entities = self.predict(text)
            pred_entities = self.entities_to_biluo(doc, pred_entities)
            y_pred.append(pred_entities)

        logging.debug('\n------y_test------')
        logging.debug(y_test)
        logging.debug('\n------y_pred------')
        logging.debug(y_pred)

        # Write the metrics for this fold.
        for label in labels:
            fold_statistics[label] = {}
            recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=[label])
            precision = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=[label])
            f1_score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=[label])
            fold_statistics[label]['precision'] = precision
            fold_statistics[label]['recall'] = recall
            fold_statistics[label]['f1'] = f1_score

        # add averages
        fold_statistics['system'] = {}
        recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=labels)
        precision = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=labels)
        f1_score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
        fold_statistics['system']['precision'] = precision
        fold_statistics['system']['recall'] = recall
        fold_statistics['system']['f1'] = f1_score

        table_data = [[label,
                       format(fold_statistics[label]['precision'], ".3f"),
                       format(fold_statistics[label]['recall'], ".3f"),
                       format(fold_statistics[label]['f1'], ".3f")]
                      for label in labels + ['system']]
        logging.info(tabulate(table_data,
                              headers=['Entity', 'Precision', 'Recall', 'F1'],
                              tablefmt='orgtbl'))

        evaluation_statistics[fold] = fold_statistics
        fold += 1

    if skipped_files:
        logging.info('\nWARNING. SKIPPED THE FOLLOWING ANNOTATIONS:')
        logging.info(skipped_files)

    statistics_all_folds = {}
    for label in labels + ['system']:
        precisions = [evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics]
        recalls = [evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics]
        f1s = [evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics]
        statistics_all_folds[label] = {
            'precision_average': mean(precisions),
            'precision_max': max(precisions),
            'precision_min': min(precisions),
            'recall_average': mean(recalls),
            'recall_max': max(recalls),
            'recall_min': min(recalls),
            'f1_average': mean(f1s),
            'f1_max': max(f1s),
            'f1_min': min(f1s),
        }

    table_data = [[label,
                   format(statistics_all_folds[label]['precision_average'], ".3f"),
                   format(statistics_all_folds[label]['recall_average'], ".3f"),
                   format(statistics_all_folds[label]['f1_average'], ".3f"),
                   format(statistics_all_folds[label]['f1_min'], ".3f"),
                   format(statistics_all_folds[label]['f1_max'], ".3f")]
                  for label in labels + ['system']]
    table_string = '\n' + tabulate(
        table_data,
        headers=['Entity', 'Precision', 'Recall', 'F1', 'F1_Min', 'F1_Max'],
        tablefmt='orgtbl')
    logging.info(table_string)
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

pprint.pprint(X_train[0])
print(len(X_train))
pprint.pprint(y_train[0])
print(len(y_train))

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

labels = list(crf.classes_)
labels.remove('O')
print(labels)

y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))
if __name__ == '__main__':
    # load data
    train_set = []
    for f in ["train.txt", "dev.txt", "test.txt"]:
        file = join(dirname(dirname(dirname(__file__))), "data", "vlsp2016", "corpus", f)
        train_set += load_dataset(file)

    # transformer
    transformer = CustomTransformer(template)
    X, y = transformer.transform(train_set)

    # train
    crf_params = {
        'c1': 1.0,               # coefficient for L1 penalty
        'c2': 1e-3,              # coefficient for L2 penalty
        'max_iterations': 1000,
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    model_path = join(dirname(__file__), "final_model", "model.bin")
    X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.01)
    estimator = CRF(params=crf_params, filename=model_path)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_dev)
    f1_score = metrics.flat_f1_score(y_dev, y_pred, average='weighted')
    print("Dev score: ", f1_score)
    joblib.dump(transformer, "final_model/transformer.bin")
def cross_validate(x_folds, y_folds, params):
    labels = ['per', 'org', 'misc', 'loc', 'notpropn']
    # collect per-fold, per-label scores
    f1 = {label: [] for label in labels}
    precision = {label: [] for label in labels}
    recall = {label: [] for label in labels}

    for i in range(len(x_folds)):
        print('\rWorking on fold {}/{} ...'.format(i + 1, len(x_folds)), end='')
        crf = sklearn_crfsuite.CRF(**params)
        test_x, test_y, train_x, train_y = folds_2_tt(x_folds, y_folds, i)
        crf.fit(train_x, train_y)
        pred_y = crf.predict(test_x)
        for label in labels:
            f1[label].append(metrics.flat_f1_score(test_y, pred_y, average=None, labels=[label]))
            precision[label].append(metrics.flat_precision_score(test_y, pred_y, average=None, labels=[label]))
            recall[label].append(metrics.flat_recall_score(test_y, pred_y, average=None, labels=[label]))
    print()

    # average each metric over the folds; 'notpropn' is reported under the key 'not'
    result = {}
    for label in labels:
        key = 'not' if label == 'notpropn' else label
        result[key] = (sum(precision[label]) / len(precision[label]),
                       sum(recall[label]) / len(recall[label]),
                       sum(f1[label]) / len(f1[label]))
    return result
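# Sketch only (names follow the fold loop in the snippet above): with
# average=None and the full label list, one call returns the per-label scores
# as a single array in the order of `labels`, so separate single-label calls
# are not required.
labels = ['per', 'org', 'misc', 'loc', 'notpropn']
per_label_f1 = metrics.flat_f1_score(test_y, pred_y, average=None, labels=labels)
per_label_precision = metrics.flat_precision_score(test_y, pred_y, average=None, labels=labels)
per_label_recall = metrics.flat_recall_score(test_y, pred_y, average=None, labels=labels)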
# Calculate the features
x_train_features = [Parser().addr2features(address) for address in x_train]
x_test_features = [Parser().addr2features(address) for address in x_test]

# Train the model
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                           max_iterations=100, all_possible_transitions=True)
crf.fit(x_train_features, y_train)

y_pred = crf.predict(x_test_features)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=label_types)

# group B and I results
sorted_labels = sorted(label_types, key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=label_types, digits=3))

# Model fit statistics
"""
                precision    recall  f1-score   support

AddressNumber        1.00      1.00      1.00       119
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None
            outputs = model(**inputs)
            tmp_eval_loss, logits, predicted_tags = outputs

            if args.n_gpu > 1:
                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating

            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        if preds is None:
            # preds = logits.detach().cpu().numpy()
            preds = predicted_tags
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            # preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            preds.extend(predicted_tags)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    # preds_logits = softmax(preds, axis=2)
    # preds = np.argmax(preds, axis=2)

    label_map = {i: label for i, label in enumerate(labels)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
        "flat_f1": metrics.flat_f1_score(out_label_list, preds_list, average='micro',
                                         labels=["B-PROP", "I-PROP"]),
    }

    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results, preds_list