def test_bio_f_score():
    """Check bio_f_score against outputs of the "conlleval" Perl script
    from CoNLL 2002, including one example with typed (PER/LOC) tags."""
    examples = [
        ("OBIO", "OBIO", 1.),
        ("BII", "OBI", 0.),
        ("BB", "BI", 0.),
        ("BBII", "BBBB", 1 / 3.),
        ("BOOBIB", "BOBOOB", 2 / 3.),
        ("B-PER I-PER O B-PER I-PER O O B-LOC O".split(),
         "B-LOC I-LOC O B-PER I-PER O O B-LOC I-LOC".split(),
         1 / 3.),
    ]
    for true_tags, pred_tags, expected in examples:
        # list() turns the plain-string examples into per-tag sequences.
        assert_equal(expected, bio_f_score(list(true_tags), list(pred_tags)))
def test_bio_f_score():
    """Check bio_f_score against outputs of the "conlleval" Perl script
    from CoNLL 2002 (untyped B/I/O tags only)."""
    examples = [
        ("OBIO", "OBIO", 1.),
        ("BII", "OBI", 0.),
        ("BB", "BI", 0.),
        ("BBII", "BBBB", 1 / 3.),
        ("BOOBIB", "BOBOOB", 2 / 3.),
    ]
    for true_tags, pred_tags, expected in examples:
        # list() turns each tag string into a per-character tag sequence.
        assert_equal(expected, bio_f_score(list(true_tags), list(pred_tags)))
def test_bio_f_score():
    """Check bio_f_score against outputs of the "conlleval" Perl script
    from CoNLL 2002, with both untyped and typed (PER/LOC) examples."""
    examples = [
        ("OBIO", "OBIO", 1.),
        ("BII", "OBI", 0.),
        ("BB", "BI", 0.),
        ("BBII", "BBBB", 1 / 3.),
        ("BOOBIB", "BOBOOB", 2 / 3.),
        ("B-PER I-PER O B-PER I-PER O O B-LOC O".split(),
         "B-LOC I-LOC O B-PER I-PER O O B-LOC I-LOC".split(),
         1 / 3.),
    ]
    for true_tags, pred_tags, expected in examples:
        # list() turns the plain-string examples into per-tag sequences.
        assert_equal(expected, bio_f_score(list(true_tags), list(pred_tags)))
def train_model():
    """Train a StructuredPerceptron on the CoNLL training data and report
    token accuracy and CoNLL-style BIO F1 on the test data.

    Returns
    -------
    clf : the fitted StructuredPerceptron.
    """
    # Fix: the original mixed Python 2 print statements with print() calls;
    # single-argument print(...) behaves identically under both versions.
    print("Loading training data...")
    X_train, y_train, lengths_train = load_conll(os.path.join(
        current_dir, "./../data/train.conll"), features)
    clf = StructuredPerceptron(verbose=True, max_iter=10)
    describe(X_train, lengths_train)

    print("Loading test data...")
    X_test, y_test, lengths_test = load_conll(os.path.join(
        current_dir, "./../data/test.conll"), features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
def analyze_results(y_true, y_pred):
    """Print accuracy, CoNLL BIO F1, and crude entity-token ratios.

    Parameters
    ----------
    y_true, y_pred : flat sequences of BIO tags, where 'O' marks a
        non-entity token. Assumed equal length — TODO confirm with callers
        (the original printed both lengths first, suggesting mismatches
        were being debugged).
    """
    # Fix: converted Python 2 print statements to print() calls;
    # %-formatting reproduces the original "a b" comma-separated output.
    print("%d %d" % (len(y_true), len(y_pred)))
    print("Accuracy: %.3f" % (100 * accuracy_score(y_true, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_true, y_pred)))

    # Token-level counts that ignore the exact entity type: a token counts
    # as "got" when both the true and predicted tags are non-O.
    got_id = 0
    tot_true_id = 0   # tokens truly inside an entity
    tot_false_id = 0  # tokens predicted as inside an entity
    for i, lab in enumerate(y_true):
        if lab != 'O':
            tot_true_id += 1
        if y_pred[i] != 'O':
            tot_false_id += 1
        if lab != 'O' and y_pred[i] != 'O':
            got_id += 1
    # Recall-like and precision-like ratios; guarded against division by zero.
    if tot_true_id > 0 and tot_false_id > 0:
        print("%s %s" % (got_id / float(tot_true_id),
                         got_id / float(tot_false_id)))
def train_model():
    """Train a StructuredPerceptron on the CoNLL training data and report
    token accuracy and CoNLL-style BIO F1 on the test data.

    Returns
    -------
    clf : the fitted StructuredPerceptron.
    """
    # Fix: the original mixed Python 2 print statements with print() calls;
    # single-argument print(...) behaves identically under both versions.
    print("Loading training data...")
    X_train, y_train, lengths_train = load_conll(
        os.path.join(current_dir, "./../data/train.conll"), features)
    clf = StructuredPerceptron(verbose=True, max_iter=10)
    describe(X_train, lengths_train)

    print("Loading test data...")
    X_test, y_test, lengths_test = load_conll(
        os.path.join(current_dir, "./../data/test.conll"), features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
# NOTE(review): this chunk starts mid-function — the enclosing "def"
# (presumably a load_data() helper, given the call below) is outside the
# visible source, so the first statements are its tail; indentation here
# is reconstructed and should be checked against the full file.
    # Every 5th file is held out as the test split.
    test_files = [f for i, f in enumerate(files) if i % 5 == 0]
    # fileinput.input chains the selected files into one stream for load_conll.
    test = load_conll(fileinput.input(test_files), features)
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)
    return train, test


if __name__ == "__main__":
    print(__doc__)
    #print("Loading training data...", end=" ")
    #X_train, y_train, lengths_train = load_conll(sys.argv[1], features)
    #describe(X_train, lengths_train)
    # Each split is an (X, y, lengths) triple as produced by load_conll.
    train, test = load_data()
    X_train, y_train, lengths_train = train
    X_test, y_test, lengths_test = test
    #print("Loading test data...", end=" ")
    #X_test, y_test, lengths_test = load_conll(sys.argv[2], features)
    #describe(X_test, lengths_test)
    clf = StructuredPerceptron(verbose=True, max_iter=10)
    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)
    # Evaluate with token accuracy and CoNLL-style BIO F1.
    y_pred = clf.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
# NOTE(review): this chunk starts inside a feature-generator function whose
# "def" line is outside the visible source; the yields below are its tail.
# The nesting of the first yields is ambiguous in the original layout —
# confirm against the full file. nnp appears to be the following token and
# p the current token.
    yield "folUpper"
    if re.search(r"\d", nnp.lower()):
        yield "folNumber"
    yield "folword=" + nnp.lower()
    # Features of the current token p itself.
    if p.isupper() and len(p) == 3:
        yield "Uppercase"
    if re.search(r"\d", p.lower()):
        yield "Number"
    if len(p) > 8:  # check if current word is unusually long
        yield "Long"


if __name__ == '__main__':
    train_path = "../Data/bio-ner/train"
    dev_path = "../Data/bio-ner/dev"
    # create_file(train_path, "train")
    # create_file(dev_path, "dev")
    X_train, y_train, l_train = load_conll("train", features)
    X_test, y_test, l_test = load_conll("dev", features)
    per = StructuredPerceptron(lr_exponent=0.15, max_iter=300, verbose=1)
    per.fit(X_train, y_train, l_train)
    y_p = per.predict(X_test, l_test)
    # for x in zip(y_p, y_test):
    #     print(x)
    # Report CoNLL-style BIO F1 on the dev set.
    print(bio_f_score(y_test, y_p))
def get_bio_f1(y_true, y_pred, label_mapping=None):
    """Map integer labels to BIO tag strings and return their CoNLL F1.

    label_mapping is forwarded unchanged to int_to_str; None selects that
    helper's default mapping.
    """
    labels_true, labels_pred = int_to_str(y_true, y_pred, label_mapping)
    return bio_f_score(labels_true, labels_pred)
def bio_f1_crf(y_true, y_pred):
    """Flatten per-sequence label lists and return the CoNLL BIO F1 score."""
    return bio_f_score(flatten_y(y_true), flatten_y(y_pred))
# NOTE(review): this chunk starts mid-loop — the enclosing loop/def and the
# definitions of s, clf, count, output_list_name, features and predict_seq
# are outside the visible source; indentation is reconstructed.
        if s.isalpha():
            predict_seq.append(s)
    file.close()
    # Re-write the collected tokens, one per line, for load_conll to parse.
    file = open("predict_seq.txt", "w")
    for ch in predict_seq:
        # NOTE(review): ch.join(' ') joins a single-element iterable and so
        # always evaluates to ' ' — every line written is a lone space.
        # Probably ' '.join(ch) or plain ch was intended; confirm.
        c = ch.join(' ')
        file.write(c)
        file.write('\n')
    file.close()
    X_test, y_test, lengths_test = load_conll("predict_seq.txt", features)
    y_pred = clf.predict(X_test, lengths_test)
    # Debug output: score and the loaded data itself.
    print(bio_f_score(y_test, y_pred))
    print(lengths_test)
    print(X_test)
    print(y_test)
    # Dump the predicted labels for this input to the matching output file.
    file = open(output_list_name[count], "w")
    for i in y_pred:
        file.write(i)
        #file.write(np.array2string(i, precision=2, separator=','))
    file.close()
    count += 1
    predict_seq.clear()
# 2-fold cross-validation over sequences. p.L holds per-sequence lengths,
# p.X / p.Y the row-concatenated features and labels for all sequences.
#
# BUG FIX: the original computed row offsets as cumulative sums of the
# *selected* lengths (sum(L_train[:i])), which always starts slicing p.X at
# row 0. For any fold whose sequences are not a prefix of the data this
# gathered the wrong rows. Offsets must come from the full length list.
# (This also replaces the original's quadratic vstack-in-a-loop with a
# single fancy-index per split.)
offsets = np.concatenate(([0], np.cumsum(p.L))).astype(int)

kf = KFold(p.L.shape[0], n_folds=2)
for train_ids, test_ids in kf:
    L_train = p.L[train_ids]
    L_test = p.L[test_ids]

    # Row indices of every token in the selected sequences.
    train_rows = np.concatenate(
        [np.arange(offsets[j], offsets[j + 1]) for j in train_ids])
    test_rows = np.concatenate(
        [np.arange(offsets[j], offsets[j + 1]) for j in test_ids])

    X_train, y_train = p.X[train_rows], p.Y[train_rows]
    X_test, y_test = p.X[test_rows], p.Y[test_rows]

    clf = StructuredPerceptron()
    clf.fit(X_train, y_train, L_train)
    y_pred = clf.predict(X_test, L_test)
    print("The Bio Score for Kfold = %s" % bio_f_score(y_test, y_pred))