Ejemplo n.º 1
0
def test_bio_f_score():
    """Check bio_f_score against outputs of the CoNLL 2002 "conlleval" Perl script."""
    cases = [
        ("OBIO", "OBIO", 1.),
        ("BII", "OBI", 0.),
        ("BB", "BI", 0.),
        ("BBII", "BBBB", 1 / 3.),
        ("BOOBIB", "BOBOOB", 2 / 3.),
        ("B-PER I-PER O B-PER I-PER O O B-LOC O".split(),
         "B-LOC I-LOC O B-PER I-PER O O B-LOC I-LOC".split(),
         1 / 3.),
    ]

    for true_seq, pred_seq, expected in cases:
        assert_equal(expected, bio_f_score(list(true_seq), list(pred_seq)))
Ejemplo n.º 2
0
def test_bio_f_score():
    # Expected values were produced by the CoNLL 2002 "conlleval" Perl script.
    for gold, pred, expected in [("OBIO", "OBIO", 1.),
                                 ("BII", "OBI", 0.),
                                 ("BB", "BI", 0.),
                                 ("BBII", "BBBB", 1 / 3.),
                                 ("BOOBIB", "BOBOOB", 2 / 3.)]:
        assert_equal(expected, bio_f_score(list(gold), list(pred)))
Ejemplo n.º 3
0
def test_bio_f_score():
    """Compare bio_f_score to reference outputs from the CoNLL 2002 "conlleval" script."""
    # Single-character BIO tags...
    string_cases = [("OBIO", "OBIO", 1.),
                    ("BII", "OBI", 0.),
                    ("BB", "BI", 0.),
                    ("BBII", "BBBB", 1 / 3.),
                    ("BOOBIB", "BOBOOB", 2 / 3.)]
    # ...and full typed tags (B-PER / I-LOC etc.).
    token_cases = [("B-PER I-PER O B-PER I-PER O O B-LOC O".split(),
                    "B-LOC I-LOC O B-PER I-PER O O B-LOC I-LOC".split(),
                    1 / 3.)]

    for y_true, y_pred, expected in string_cases + token_cases:
        assert_equal(expected, bio_f_score(list(y_true), list(y_pred)))
Ejemplo n.º 4
0
def train_model():
    """Train a StructuredPerceptron on train.conll and report test metrics.

    Loads the train/test CoNLL files relative to ``current_dir``, fits the
    model, prints accuracy and CoNLL F1 on the test split, and returns the
    fitted classifier.
    """
    # BUG FIX: the original mixed Python-2 ``print "..."`` statements with
    # ``print(...)`` calls, which is a SyntaxError under Python 3.
    print("Loading training data...")
    X_train, y_train, lengths_train = load_conll(
        os.path.join(current_dir, "./../data/train.conll"), features)
    clf = StructuredPerceptron(verbose=True, max_iter=10)
    describe(X_train, lengths_train)

    print("Loading test data...")
    X_test, y_test, lengths_test = load_conll(
        os.path.join(current_dir, "./../data/test.conll"), features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)

    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
Ejemplo n.º 5
0
def analyze_results(y_true, y_pred):
    """Print accuracy, CoNLL F1, and token-level identification ratios.

    The two trailing ratios are recall-like (matched / gold non-O tokens)
    and precision-like (matched / predicted non-O tokens) over tokens whose
    label is not 'O'.
    """
    # BUG FIX: the original used Python-2 print statements alongside
    # ``print(...)`` calls — a SyntaxError under Python 3.
    print(len(y_true), len(y_pred))

    print("Accuracy: %.3f" % (100 * accuracy_score(y_true, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_true, y_pred)))

    got_id = 0        # tokens non-O in both gold and prediction
    tot_true_id = 0   # gold non-O tokens
    tot_pred_id = 0   # predicted non-O tokens
    for i, lab in enumerate(y_true):
        if lab != 'O':
            tot_true_id += 1
        if y_pred[i] != 'O':
            tot_pred_id += 1
        if lab != 'O' and y_pred[i] != 'O':
            got_id += 1

    # Guard against division by zero when either side has no entities.
    if tot_true_id > 0 and tot_pred_id > 0:
        print(got_id / float(tot_true_id), got_id / float(tot_pred_id))
Ejemplo n.º 6
0
def train_model():
    """Fit a StructuredPerceptron on the CoNLL training set and return it.

    Prints accuracy and CoNLL F1 on the test split as a side effect.
    """
    # BUG FIX: converted the Python-2 ``print "..."`` statements to calls so
    # the function parses under Python 3 (the rest of it already used calls).
    print("Loading training data...")
    X_train, y_train, lengths_train = load_conll(
        os.path.join(current_dir, "./../data/train.conll"), features)
    clf = StructuredPerceptron(verbose=True, max_iter=10)
    describe(X_train, lengths_train)

    print("Loading test data...")
    X_test, y_test, lengths_test = load_conll(
        os.path.join(current_dir, "./../data/test.conll"), features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)

    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
Ejemplo n.º 7
0
    test_files = [f for i, f in enumerate(files) if i % 5 == 0]
    test = load_conll(fileinput.input(test_files), features)
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)

    return train, test


if __name__ == "__main__":
    print(__doc__)

    # Each split is an (X, y, lengths) triple from load_data().
    train, test = load_data()
    X_train, y_train, lengths_train = train
    X_test, y_test, lengths_test = test

    model = StructuredPerceptron(verbose=True, max_iter=10)
    print("Training %s" % model)
    model.fit(X_train, y_train, lengths_train)

    predictions = model.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, predictions)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, predictions)))
Ejemplo n.º 8
0
            yield "folUpper"
        if re.search(r"\d", nnp.lower()):
            yield "folNumber"
        yield "folword=" + nnp.lower()
    if p.isupper() and len(p) == 3:
        yield "Uppercase"
    if re.search(r"\d", p.lower()):
        yield "Number"
    if len(p) > 8:  # check if current word is unusually long
        yield "Long"


if __name__ == '__main__':
    train_path = "../Data/bio-ner/train"
    dev_path = "../Data/bio-ner/dev"

    # One-time conversion of the raw corpus to CoNLL files:
    # create_file(train_path, "train")
    # create_file(dev_path, "dev")

    X_train, y_train, l_train = load_conll("train", features)
    X_test, y_test, l_test = load_conll("dev", features)

    model = StructuredPerceptron(lr_exponent=0.15, max_iter=300, verbose=1)
    model.fit(X_train, y_train, l_train)

    predicted = model.predict(X_test, l_test)
    print(bio_f_score(y_test, predicted))
Ejemplo n.º 9
0
    test_files = [f for i, f in enumerate(files) if i % 5 == 0]
    test = load_conll(fileinput.input(test_files), features)
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)

    return train, test


if __name__ == "__main__":
    print(__doc__)

    # load_data() returns (train, test), each an (X, y, lengths) triple.
    train, test = load_data()
    X_train, y_train, lengths_train = train
    X_test, y_test, lengths_test = test

    tagger = StructuredPerceptron(verbose=True, max_iter=10)
    print("Training %s" % tagger)
    tagger.fit(X_train, y_train, lengths_train)

    y_hat = tagger.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_hat)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_hat)))
Ejemplo n.º 10
0
def get_bio_f1(y_true, y_pred, label_mapping=None):
    """Convert integer labels to BIO strings and return their CoNLL F-score."""
    labels_true, labels_pred = int_to_str(y_true, y_pred, label_mapping)
    return bio_f_score(labels_true, labels_pred)
Ejemplo n.º 11
0
def bio_f1_crf(y_true, y_pred):
    """Flatten per-sequence label lists and score them with bio_f_score."""
    return bio_f_score(flatten_y(y_true), flatten_y(y_pred))
Ejemplo n.º 12
0
            # Keep only purely alphabetic tokens for the prediction input.
            # NOTE(review): `s` is defined outside this view — presumably a
            # token of the current input line; confirm against the caller.
            if s.isalpha():
                predict_seq.append(s)

    file.close()

    # Write one token per line.  ch.join('   ') interleaves the token into
    # three spaces, producing ' ' + ch + ' ' + ch + ' ' — presumably a
    # CoNLL-ish placeholder row for load_conll below (TODO confirm format).
    file = open("predict_seq.txt", "w")
    for ch in predict_seq:
        c = ch.join('   ')
        file.write(c)
        file.write('\n')

    file.close()

    # Re-load the freshly written token file and predict with the classifier
    # `clf` (trained outside this view).
    X_test, y_test, lengths_test = load_conll("predict_seq.txt", features)
    y_pred = clf.predict(X_test, lengths_test)
    print(bio_f_score(y_test, y_pred))

    print(lengths_test)

    print(X_test)
    print(y_test)

    # Dump the predicted labels for this input to the matching output file;
    # `output_list_name` and `count` are maintained by the enclosing loop.
    file = open(output_list_name[count], "w")
    for i in y_pred:
        file.write(i)
        #file.write(np.array2string(i, precision=2, separator=','))

    file.close()
    count += 1
    predict_seq.clear()
Ejemplo n.º 13
0
# 2-fold cross-validation over sequences: p.L holds per-sequence lengths,
# p.X / p.Y hold the concatenated token features / labels.
kf = KFold(p.L.shape[0], n_folds=2)

# Absolute row boundaries of each sequence within p.X / p.Y:
# sequence i occupies rows [bounds[i], bounds[i + 1]).
bounds = np.concatenate([[0], np.cumsum(p.L)]).astype(int)


def _stack_fold(ids):
    """Concatenate the feature rows and labels of the sequences in ``ids``."""
    if len(ids) == 0:
        return np.zeros((0, p.X.shape[1])), np.zeros((0,))
    X = np.vstack([p.X[bounds[i]:bounds[i + 1]] for i in ids])
    y = np.concatenate([p.Y[bounds[i]:bounds[i + 1]] for i in ids])
    return X, y


for train_ids, test_ids in kf:

    L_train = p.L[train_ids]
    L_test = p.L[test_ids]

    # BUG FIX: the original computed slice offsets as sum(L_train[:i]) —
    # positions within a train-only concatenation — but then sliced the
    # FULL p.X / p.Y with them, selecting the wrong rows for any fold that
    # is not a prefix of the data.  Offsets must be the absolute
    # per-sequence boundaries derived from p.L (also O(n) instead of the
    # original quadratic prefix sums + repeated vstack).
    X_train, y_train = _stack_fold(train_ids)
    X_test, y_test = _stack_fold(test_ids)

    clf = StructuredPerceptron()
    clf.fit(X_train, y_train, L_train)

    y_pred = clf.predict(X_test, L_test)

    print("The Bio Score for Kfold = %s" % bio_f_score(y_test, y_pred))