Code Example #1
def load_data(trainingPath, testPath):
    print("Loading training data...", end=" ")
    train = load_conll(fileinput.input(trainingPath), features)
    X_train, _, lengths_train = train
    describe(X_train, lengths_train)

    # Pad the test data with a filler third column because load_conll expects
    # three columns; the test data only has two and will not load without it.
    newTest = "newTest.txt"
    postTest = open(newTest, "w+")
    with open(testPath) as file:
        for line in file:
            if line != "\n":
                # strip the trailing newline, then append a filler label
                postTest.write(line.rstrip() + "\t NonApplicable\n")
            else:
                # keep blank lines: load_conll treats them as sequence boundaries
                postTest.write("\n")
    postTest.close()

    print("Loading test data...", end=" ")
    test = load_conll(fileinput.input(newTest), features)
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)

    return train, test
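These load_data variants all return the (X, y, lengths) triples produced by seqlearn's load_conll; the `features` and `describe` helpers they call are defined in later examples on this page (see Examples #23 and #28). A minimal sketch of consuming the returned triples, assuming seqlearn is installed (the file paths are placeholders):

from seqlearn.perceptron import StructuredPerceptron

train, test = load_data("train.conll", "test.conll")  # hypothetical paths
X_train, y_train, lengths_train = train
X_test, _, lengths_test = test

clf = StructuredPerceptron()
clf.fit(X_train, y_train, lengths_train)
y_pred = clf.predict(X_test, lengths_test)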
Code Example #2
def load_con(
    data,
    features,
    tts=False
):  # loads the training data, adds POS tags, and can optionally do a train-test split
    np.random.shuffle(data)

    if tts:  # Code for doing train-test split
        split = int(len(data) * tts)

        train = data[:split]
        test = data[split:]

        if os.path.exists("test.txt"):  # replace files instead of adding on
            os.remove("test.txt")
        if os.path.exists("gs.txt"):
            os.remove("gs.txt")

        with open("test.txt", "a") as f:  # Create teh fhe file for testing
            for s in test:
                pos = nl.pos_tag([w[0] for w in s])
                for ii in range(len(s)):
                    f.write("{}\t{}\t{}\n".format(s[ii][0], pos[ii][1],
                                                  s[ii][1]))
                f.write("\n")

        with open("gs.txt",
                  "a") as f:  #create the gold standard file for comparison
            for s in test:
                for ii in range(len(s)):
                    f.write("{}\t{}\n".format(s[ii][0], s[ii][1]))
                f.write("\n")
        X_test, y_test, l_test = load_conll("test.txt", features)
    else:
        train = data  # if not doing the tts then just use all the data to train

    if os.path.exists("train.txt"):  # replace the training fil
        os.remove("train.txt")

    with open("train.txt",
              "a") as f:  # Create the train file and add in the POS
        for s in train:
            pos = nl.pos_tag([w[0] for w in s])
            for ii in range(len(s)):
                f.write("{}\t{}\t{}\n".format(s[ii][0], pos[ii][1], s[ii][1]))
            f.write("\n")

    X_train, y_train, l_train = load_conll(
        "train.txt",
        features)  # use the load_conll function on the generated file

    if tts:  # the return shape depends on whether a train-test split was requested
        return X_train, X_test, y_train, y_test, l_train, l_test
    else:
        return X_train, y_train, l_train
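A hedged usage sketch of the variable return above: passing a float to `tts` both enables the split and sets the training fraction. The 0.8 and the `data`/`features` arguments here are assumptions, not part of the snippet:

# Hypothetical call: keep 80% of the shuffled sentences for training.
X_tr, X_te, y_tr, y_te, l_tr, l_te = load_con(data, features, tts=0.8)

# Without tts, all sentences go to training and only three values return:
X_tr, y_tr, l_tr = load_con(data, features)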
Code Example #3
File: utils.py Project: esilotesham/HonoursProject
def gesture_to_conll(train_data, test_data, train_labels, test_labels):
    path_to_train_file = write_conll_file("train_gestures.txt", train_data,
                                          train_labels)
    path_to_test_file = write_conll_file("test_gestures.txt", test_data,
                                         test_labels)

    x_train, y_train, train_lengths = load_conll(path_to_train_file,
                                                 extract_features)
    x_test, y_test, test_lengths = load_conll(path_to_test_file,
                                              extract_features)
    return x_train, y_train, x_test, y_test, train_lengths, test_lengths
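write_conll_file is not shown on this page; a minimal sketch of what it plausibly does, inferred from the call sites above and the format load_conll parses (only the name and signature come from the snippet, the body is an assumption):

def write_conll_file(path, data, labels):
    # Hypothetical helper: one "token<TAB>label" line per item, with a
    # blank line between sequences, which is the layout load_conll expects.
    with open(path, "w") as f:
        for seq, seq_labels in zip(data, labels):
            for token, label in zip(seq, seq_labels):
                f.write("{}\t{}\n".format(token, label))
            f.write("\n")
    return path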
Code Example #4
File: _nl_conll_ner.py Project: NLeSC/xtas
def _train_ner_model():
    import sys
    if 'nose' in sys.modules:
        x_train, y_train, lengths_train = load_conll(_load_test_data(),
                                                     _features)
    else:
        x_train, y_train, lengths_train = load_conll(
            _download_training_data(), _features)

    clf = StructuredPerceptron()
    clf.fit(x_train, y_train, lengths_train)
    return clf
Code Example #5
def load_data():
    # Load our data so it can be passed to a machine learning algorithm.
    # Returns a training data set and a test data set.
    print("Loading training data...", end=" ")
    train = load_conll(fileinput.input("gene-trainF18.txt"), features)
    X_train, _, lengths_train = train
    describe(X_train, lengths_train)

    print("Loading test data...", end=" ")
    test = load_conll(fileinput.input("testset.txt"), features)
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)

    return train, test
Code Example #6
def predict_cars(clf, sentence):
    test_f_name = os.path.join(current_dir, './../data/test_ann')

    sentence = preprocessor_text(sentence)
    tokens = word_tokenize(sentence)
    with codecs.open(test_f_name, 'w', 'utf-8') as f:
        for t in tokens:
            f.write(t + u' ' + u'O' + u'\n')
        f.flush()

    X, y, lengths = load_conll(test_f_name, features)
    y_pred = clf.predict(X, lengths)

    found_cars = []

    current_car = []
    found_car = False
    for idx, token in enumerate(y_pred):
        t = str(token)
        if t == 'B':
            current_car.append(tokens[idx])
            found_car = True
        elif t == 'I':
            current_car.append(tokens[idx])
        else:
            if found_car:
                found_car = False
                found_cars.append(u' '.join(current_car))
                current_car = []
    if len(current_car) > 0:
        found_cars.append(u' '.join(current_car))
    return found_cars
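The loop above decodes BIO tags back into spans. A worked example with hypothetical tokens and predictions:

# Hypothetical inputs: tokens and the tags predict() might assign to them.
tokens = ["I", "test", "drove", "a", "Ford", "Focus", "yesterday"]
y_pred = ["O", "O", "O", "O", "B", "I", "O"]
# The loop opens a span at "B", extends it over "I", and closes it at the
# next "O", so found_cars ends up as ["Ford Focus"].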
Code Example #7
def predict_cars(clf, sentence):
    test_f_name = os.path.join(current_dir, './../data/test_ann')

    sentence = preprocessor_text(sentence)
    tokens = word_tokenize(sentence)
    with codecs.open(test_f_name, 'w', 'utf-8') as f:
        for t in tokens:
            f.write(t + u' ' + u'O' + u'\n')
        f.flush()

    X, y, lengths = load_conll(test_f_name, features)
    y_pred = clf.predict(X, lengths)

    found_cars = []

    current_car = []
    found_car = False
    for idx, token in enumerate(y_pred):
        t = str(token)
        if t == 'B':
            current_car.append(tokens[idx])
            found_car = True
        elif t == 'I':
            current_car.append(tokens[idx])
        else:
            if found_car:
                found_car = False
                found_cars.append(u' '.join(current_car))
                current_car = []
    if len(current_car) > 0:
        found_cars.append(u' '.join(current_car))
    return found_cars
Code Example #8
File: _nl_conll_ner.py Project: LourensVeen/xtas
def _train_ner_model():
    x_train, y_train, lengths_train = load_conll(_download_training_data(),
                                                 _features)

    clf = StructuredPerceptron()
    clf.fit(x_train, y_train, lengths_train)
    return clf
Code Example #9
def load_data():
    files = glob("../training data/new training/new ent 2/*.txt")#"../training data/new training/new_file*.txt")

    # 80% training, 20% test
    print("Loading training data...", end=" ")
    """train_files = [f for i, f in enumerate(files) if i % 5 != 0]
    train = load_conll(fileinput.input(train_files), features)#, split=True)
    X_train, _, lengths_train = train
    describe(X_train, lengths_train)"""
    
    
    train_files = [f for i, f in enumerate(files)]# if i % 5 != 0]
    print( train_files)
    train = load_conll(fileinput.input(train_files), features)#, split=True)
    X_train, _, lengths_train = train
    describe(X_train, lengths_train)

    print("Loading test data...", end=" ")
  
    """test_files = [f for i, f in enumerate(glob("../training data/new training/new_test.txt"))]
    test = load_conll(fileinput.input(test_files), features)
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)"""

    return train#, test
Code Example #10
def load_data():
    files = glob('nerdata/*.bio')

    # 80% training, 20% test
    print("Loading training data...", end=" ")
    train_files = [f for i, f in enumerate(files) if i % 5 != 0]
    train = load_conll(fileinput.input(train_files), features)
    X_train, _, lengths_train = train
    describe(X_train, lengths_train)

    print("Loading test data...", end=" ")
    test_files = [f for i, f in enumerate(files) if i % 5 == 0]
    test = load_conll(fileinput.input(test_files), features)
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)

    return train, test
Code Example #11
File: hmm.py Project: ajvish91/CS5340-hwrkwe
def testHMM(clf, data):
    # Validation after training
    X_test, y_test, lengths_test = load_conll(data, features)
    y_pred = clf.predict(X_test, lengths_test)

    print(y_pred)
    # Final score
    print(whole_sequence_accuracy(y_test, y_pred, lengths_test))
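whole_sequence_accuracy counts a sequence as correct only when every label in it matches. A minimal sketch of the same computation done by hand, assuming flat label arrays plus the lengths vector that load_conll returns:

import numpy as np

def whole_seq_acc(y_true, y_pred, lengths):
    # Split the flat label arrays back into per-sequence chunks and count
    # the sequences where every position agrees.
    bounds = np.cumsum(lengths)[:-1]
    true_seqs = np.split(np.asarray(y_true), bounds)
    pred_seqs = np.split(np.asarray(y_pred), bounds)
    hits = sum(np.array_equal(t, p) for t, p in zip(true_seqs, pred_seqs))
    return hits / len(lengths)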
Code Example #12
File: conll.py Project: 1oscar/seqlearn
def load_data():
    files = glob('nerdata/*.bio')

    # 80% training, 20% test
    print("Loading training data...", end=" ")
    train_files = [f for i, f in enumerate(files) if i % 5 != 0]
    train = load_conll(fileinput.input(train_files), features)
    X_train, _, lengths_train = train
    describe(X_train, lengths_train)

    print("Loading test data...", end=" ")
    test_files = [f for i, f in enumerate(files) if i % 5 == 0]
    test = load_conll(fileinput.input(test_files), features)
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)

    return train, test
Code Example #13
def train_model():
    print "Loading training data..."
    X_train, y_train, lengths_train = load_conll(os.path.join( current_dir, "./../data/train.conll"), features)
    clf = StructuredPerceptron(verbose=True,max_iter = 10)
    describe(X_train, lengths_train)

    print "Loading test data..."
    X_test, y_test, lengths_test = load_conll(os.path.join( current_dir, "./../data/test.conll"), features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)

    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
Code Example #14
def test_load_conll():
    n_nonempty = sum(1 for ln in TEST_FILE.splitlines() if ln.strip())

    X, y, lengths = load_conll(six.moves.StringIO(TEST_FILE), features)
    assert_true(sp.isspmatrix(X))
    assert_equal(X.shape[0], n_nonempty)
    assert_equal(list(y),
                 ["Det", "N", "V", "Pre", "Det", "N", "Punc", "Adv", "Punc"])
    assert_array_equal(lengths, [7, 2])
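TEST_FILE itself is defined elsewhere in the test module; the assertions only pin down its shape: nine non-empty lines, two sequences of lengths 7 and 2, and the label column listed above. A hypothetical input consistent with those assertions:

TEST_FILE = """the Det
cat N
sat V
on Pre
the Det
mat N
. Punc

quickly Adv
! Punc
"""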
Code Example #15
File: test_datasets.py Project: 1oscar/seqlearn
def test_load_conll():
    n_nonempty = sum(1 for ln in TEST_FILE.splitlines() if ln.strip())

    X, y, lengths = load_conll(six.moves.StringIO(TEST_FILE), features)
    assert_true(sp.isspmatrix(X))
    assert_equal(X.shape[0], n_nonempty)
    assert_equal(list(y),
                 ["Det", "N", "V", "Pre", "Det", "N", "Punc",
                  "Adv", "Punc"])
    assert_array_equal(lengths, [7, 2])
Code Example #16
def train_model():
    print "Loading training data..."
    X_train, y_train, lengths_train = load_conll(
        os.path.join(current_dir, "./../data/train.conll"), features)
    clf = StructuredPerceptron(verbose=True, max_iter=10)
    describe(X_train, lengths_train)

    print "Loading test data..."
    X_test, y_test, lengths_test = load_conll(
        os.path.join(current_dir, "./../data/test.conll"), features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)

    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
Code Example #17
def load_data(trainingPath, testPath):
    #files = glob('nerdata/*.bio')

    # load the training file and run it through the CoNLL sequencer
    print("Loading training data from {0}".format(trainingPath))
    # glob is used here only to mirror the structure of the original example
    train_files = glob(trainingPath)
    train = load_conll(fileinput.input(train_files), features)
    # training data and description
    X_train, _, lengths_train = train
    describe(X_train, lengths_train)

    # Pad the test data with a filler third column because load_conll expects
    # three columns; the test data only has two and will not load without it.
    newTest = "newTest.txt"
    postTest = open(newTest, 'w+')
    with open(testPath) as file:
        for line in file:
            if line != '\n':
                # strip the trailing newline, then append a filler label
                postTest.write(line.rstrip() + '\t NonApplicable\n')
            else:
                # keep blank lines: load_conll treats them as sequence boundaries
                postTest.write('\n')
    postTest.close()

    # load test data
    print("Loading test data from {0}".format(testPath))
    # again, glob only mirrors the original example
    test_files = glob(newTest)
    test = load_conll(fileinput.input(test_files), features)
    #test data and description
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)

    return train, test
Code Example #18
def main():
    # Load the training data and pass it through our feature function.
    # See the documentation for the exact outputs of load_conll.
    samples, labels, sentence_lengths = load_conll("data/gene-trainF18.txt",
                                                   features,
                                                   split=True)

    # Train the model with our features
    clf = StructuredPerceptron()
    clf.fit(samples, labels, sentence_lengths)

    # Evaluate our model
    test_samples, test_labels, test_sentence_lengths = load_conll(
        "data/F18-assgn4-test.txt", features, split=True)
    prediction = clf.predict(test_samples, test_sentence_lengths)

    # Output results
    i = 0
    j = 1
    output = []
    for line in open("data/F18-assgn4-test.txt"):
        if (line == "\n"):
            output.append("\n")
            j = 1
            continue
        else:
            item = str(
                j) + "\t" + line.split()[1] + "\t" + prediction[i] + "\n"
            output.append(item)
            print(item)
            i += 1
            j += 1

    with open('predictions.txt', 'w') as f:
        for item in output:
            f.write(item)
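The loop restarts the token counter j at every blank line and pairs the word column with the predicted tag, so predictions.txt holds tab-separated (token number, word, predicted label) rows per sentence. A sketch with hypothetical words and tags:

1	BRCA1	B
2	gene	I
3	expression	O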
Code Example #19
def test_model():

    print("Loading test data...", end=" ")
    test_files = [f for i, f in enumerate(glob("prepard_test_1.txt"))]
    test = load_conll(fileinput.input(test_files),
                      sequence_learner_new.features)
    X_test, _, lengths_test = test
    sequence_learner_new.describe(X_test, lengths_test)

    X_test, y_test, lengths_test = test
    Y = sequence_learner_new.get_labels(y_test)
    clf = joblib.load('model/seq_labeler.pkl')
    y_pred = clf.predict(X_test, lengths_test)

    target = codecs.open("test_res.txt", "w", "utf-8")

    for i in range(0, X_test.shape[0]):
        target.write(y_pred[i] + "\n")
    target.close()  # flush the predictions before scoring

    prec = 0
    recall = 0
    count = 0
    entities = [
        "<PER>", "</PER>", "<PER></PER>", "<IPER>", "<ORG>", "</ORG>",
        "<ORG></ORG>", "<IORG>"
    ]
    for i in range(len(y_test)):
        elems = Y[i].partition("<")
        entity = elems[1] + elems[2]
        entity = entity.replace('\n', '')
        yelem = y_pred[i].partition("<")
        yent = yelem[1] + yelem[2]

        if entity in entities and Y[i].replace('\n', '') == y_pred[i]:
            print(entity)
            count += 1
            prec += 1

        elif yent in entities and Y[i] != y_pred[i]:
            count += 1

    if count > 0:
        print(count)
        print(" precision = ", (float(prec) / float(count)) * 100, "%")
    else:
        print(" precision = 0", prec)

    print("Accuracy: %.3f" % (100 * accuracy_score(Y, y_pred)))
Code Example #20
def main():
    print("Loading data")  #Useful messages
    dat = open(sys.argv[1])  # get filename and open the correct file
    addcol(dat, sys.argv[1])
    X_test, y_test, l_test = load_conll(
        "test.txt", features)  # load the test set created by addcol
    data = load_dat()  # yet another file loading function!
    X_train, y_train, l_train = load_con(data,
                                         features)  # the big loading file
    # Trial and error found that a learning-rate exponent of 0.35 and
    # 20 iterations worked best.
    per = StructuredPerceptron(lr_exponent=0.35, max_iter=20, verbose=1)
    print("Fitting")
    per.fit(X_train, y_train, l_train)  # fit and predict
    y_p = per.predict(X_test, l_test)
    create_eval_file(y_p)  # save

    print("Done!")
Code Example #21
def test_model():

    print("Loading test data...", end=" ")
    test_files = glob("../training data/labelled test data/resume0.txt")
    test = load_conll(fileinput.input(test_files), sequence_learner.features)
    X_test, y_test, lengths_test = test
    sequence_learner.describe(X_test, lengths_test)

    clf = joblib.load('model/seq_labeler.pkl')
    y_pred = clf.predict(X_test, lengths_test)

    target = codecs.open("../training data/test data/test results/resume_res",
                         "w", "utf-8")

    for i in range(0, X_test.shape[0]):
        target.write(y_pred[i] + "\n")
    target.close()  # flush the predictions before scoring

    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
Code Example #22
def test_load_conll_split():
    X, y, _ = load_conll(six.moves.StringIO(TEST_SPLIT),
                         features_split,
                         split=True)
    assert_equal(list(y), list("OBI"))
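With split=True, load_conll splits each input line on whitespace before calling the feature function, so features_split sees per-token field lists rather than raw lines; that is my reading of seqlearn's datasets module, and the sketch below (including the function body) is an assumption:

def features_split(sequence, i):
    # Hypothetical feature function for pre-split input: each element of
    # `sequence` is a list of columns rather than an unsplit line.
    word = sequence[i][0]
    yield "word=" + word.lower()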
Code Example #23
File: conll.py Project: AdityaTewari/seqlearn
    if i + 1 < len(sentence):
        yield "word+1:{}".format(sentence[i + 1].lower())


def describe(X, lengths):
    print("{0} sequences, {1} tokens.".format(len(lengths), X.shape[0]))


if __name__ == "__main__":
    print(__doc__)

    if len(sys.argv) < 3:
        print("Usage: {0} training_file test_file".format(sys.argv[0]))
        sys.exit(1)

    print("Loading training data...", end=" ")
    X_train, y_train, lengths_train = load_conll(sys.argv[1], features)
    describe(X_train, lengths_train)

    print("Loading test data...", end=" ")
    X_test, y_test, lengths_test = load_conll(sys.argv[2], features)
    describe(X_test, lengths_test)

    clf = StructuredPerceptron(verbose=True, max_iter=10)
    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
Code Example #24
    if i < len(sequence) - 1:
        next_ = sequence[i + 1].split("\t")[0]
        # next word's length
        yield "next_len=" + str(get_word_len(next_))

        # last letters of the next word
        yield "next_last_letters=" + (next_[-4:] if len(next_) > 4 else next_)
        yield "next_short_word_shape=" + get_short_word_shape(next_)

    if i < len(sequence) - 2:
        nnext = sequence[i + 2].split("\t")[0]
        yield "nnext_short_word_shape=" + get_short_word_shape(nnext)


# read the training set
X_train, y_train, lengths_train = load_conll(
    open("finer-data/data/digitoday.2014.train.csv", "r"), features)

clf = StructuredPerceptron(decode="bestfirst", verbose=1, random_state=0)

print("Fitting model " + str(clf))
clf.fit(X_train, y_train, lengths_train)

print("\nPredictions on dev set")

# read the dev set
X_dev, y_dev, lengths_dev = load_conll(
    open("finer-data/data/digitoday.2014.dev.csv", "r"), features)
y_pred = clf.predict(X_dev, lengths_dev)

print("Whole seq accuracy    ",
      whole_sequence_accuracy(y_dev, y_pred, lengths_dev))
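This is the only example on the page that passes decode="bestfirst" to StructuredPerceptron; the others use Viterbi decoding, which as far as I know is seqlearn's default. Best-first decoding greedily picks the best label position by position (as I understand seqlearn's decode module), so it trades accuracy for speed:

# Greedy approximate decoding, as in the snippet above:
clf_fast = StructuredPerceptron(decode="bestfirst", verbose=1, random_state=0)

# Exact Viterbi decoding, as in Examples #26 and #29:
clf_exact = StructuredPerceptron(decode="viterbi", verbose=1, random_state=0)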
Code Example #25
                f.write(token_tag[1][:-1] + '\t' + 'O' + '\n')
    '''
    f = open('input_keys.txt', 'w')
    for line in open('test-run-test-with-keys.txt', 'r'):
        if line == '\n':
            f.write('\n')
        else:
            token_tag = line.split('\t')
            if len(token_tag) >= 3:
                f.write(token_tag[1] + '\t' + token_tag[2])
    '''
if __name__ == "__main__":
    print(__doc__)
    load_data()
    print("Loading training data...", end=" ")
    X_train, y_train, lengths_train = load_conll('input_train.txt', features)
    describe(X_train, lengths_train)


    print("Loading test data...", end=" ")
    X_test, y_test, lengths_test = load_conll('input_test.txt', features)
    describe(X_test, lengths_test)

    clf = StructuredPerceptron(verbose=True, lr_exponent=0.1, max_iter=30)
    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)
    '''
    f = open('input_test_key.txt', 'w')
    for i in y_pred:
Code Example #26
        yield "prev_word_shape=" + get_word_shape(prev)
        yield "prev_short_word_shape=" + get_short_word_shape(prev)

    if i < len(sequence) - 1:
        next_ = sequence[i + 1].split("\t")[0]
        # next word's length
        yield "next_len=" + str(get_word_len(next_))

        # last letters of the next word
        yield "next_last_letters=" + (next_[-4:] if len(next_) > 4 else next_)
        yield "next_word_shape=" + get_word_shape(next_)
        yield "next_short_word_shape=" + get_short_word_shape(next_)


# read the training set
X_train, y_train, lengths_train = load_conll(
    open("ftb1u-v1/ftb1u_train.tsv", "r"), features)

clf = StructuredPerceptron(decode="viterbi", verbose=1)

print("Fitting model " + str(clf))
clf.fit(X_train, y_train, lengths_train)

print("\nPredictions on test set")

# read the test set
X_test, y_test, lengths_test = load_conll(open("ftb1u-v1/ftb1u_test.tsv", "r"),
                                          features)
y_pred = clf.predict(X_test, lengths_test)
print("Whole seq accuracy    ",
      whole_sequence_accuracy(y_test, y_pred, lengths_test))
print("Element-wise accuracy ", accuracy_score(y_test, y_pred))
Code Example #27
    if i + 1 < len(sentence):
        yield "word+1:{}".format(sentence[i + 1].lower())


def describe(X, lengths):
    print("{0} sequences, {1} tokens.".format(len(lengths), X.shape[0]))


if __name__ == "__main__":
    print(__doc__)

    if len(sys.argv) < 3:
        print("Usage: {0} training_file test_file".format(sys.argv[0]))
        sys.exit(1)

    print("Loading training data...", end=" ")
    X_train, y_train, lengths_train = load_conll(sys.argv[1], features)
    describe(X_train, lengths_train)

    print("Loading test data...", end=" ")
    X_test, y_test, lengths_test = load_conll(sys.argv[2], features)
    describe(X_test, lengths_test)

    clf = StructuredPerceptron(verbose=True, max_iter=10)
    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
Code Example #28
    c = ch.join('   ')
    file.write(c)
    file.write('\n')

file.close()

clf = StructuredPerceptron()


def features(sequence, i):
    yield "word=" + sequence[i].lower()
    if sequence[i].isupper():
        yield "Uppercase"


X_train, y_train, lengths_train = load_conll("training_seq.txt", features)
clf = StructuredPerceptron()
clf.fit(X_train, y_train, lengths_train)
'''
names of the files to run predictions on
'''

predict_list_name = ["seq1.txt", "seq2.txt"]
predict_seq = []
output_list_name = ["prediction1.txt"]
count = 0

for name in predict_list_name:

    file = open(name, "r")
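Feature functions like the one above are generators of plain strings; load_conll hashes whatever they yield into the rows of a sparse matrix (the test in Example #14 asserts the result is a scipy sparse matrix). An illustration of what this particular generator emits:

# features() receives the token sequence and a position index.
list(features(["NASA", "confirms"], 0))  # -> ["word=nasa", "Uppercase"]
list(features(["NASA", "confirms"], 1))  # -> ["word=confirms"]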
Code Example #29
        yield "prev_word_shape=" + get_word_shape(prev)
        yield "prev_short_word_shape=" + get_short_word_shape(prev)

    if i < len(sequence) - 1:
        next_ = sequence[i + 1].split("\t")[0]
        # next word's length
        yield "next_len=" + str(get_word_len(next_))

        # last letters of the next word
        yield "next_last_letters=" + (next_[-4:] if len(next_) > 4 else next_)
        yield "next_word_shape=" + get_word_shape(next_)
        yield "next_short_word_shape=" + get_short_word_shape(next_)


# read the training set
X_train, y_train, lengths_train = load_conll(
    open("resources/talbanken-stanford-1.2/talbanken-stanford-train.tsv", "r"), features)

clf = StructuredPerceptron(decode="viterbi", verbose=1, random_state=0)

print("Fitting model " + str(clf))
clf.fit(X_train, y_train, lengths_train)

print("\nPredictions on test set")

# read the test set
X_test, y_test, lengths_test = load_conll(
    open("resources/talbanken-stanford-1.2/talbanken-stanford-test.tsv", "r"), features)
y_pred = clf.predict(X_test, lengths_test)
print("Whole seq accuracy    ", whole_sequence_accuracy(y_test, y_pred, lengths_test))
print("Element-wise accuracy ", accuracy_score(y_test, y_pred))
print("Mean F1-score macro   ", f1_score(y_test, y_pred, average="macro"))
Code Example #30
    if i < len(sequence) - 1:
        next_ = sequence[i + 1].split("\t")[1]
        # next word's length
        yield "next_len=" + str(get_word_len(next_))

        # last letters of the next word
        yield "next_last_letters=" + (next_[-3:] if len(next_) > 3 else next_)
        yield "next_short_word_shape=" + get_short_word_shape(next_)

# read the training set
X_train, y_train, lengths_train = load_conll(open("../resources/train.data", "r"), features)

clf = StructuredPerceptron(decode="viterbi", lr_exponent=.05, max_iter=30)

print("Fitting model " + str(clf))
clf.fit(X_train, y_train, lengths_train)

print("\nPredictions on dev set")

# read the dev set
X_dev, y_dev, lengths_dev = load_conll(open("../resources/dev.data", "r"), features)
y_pred = clf.predict(X_dev, lengths_dev)

print("Whole seq accuracy    ", whole_sequence_accuracy(y_dev, y_pred, lengths_dev))
print("Element-wise accuracy ", accuracy_score(y_dev, y_pred))
print("Mean F1-score macro   ", f1_score(y_dev, y_pred, average="macro"))
Code Example #31
File: test_datasets.py Project: 1oscar/seqlearn
def test_load_conll_split():
    X, y, _ = load_conll(six.moves.StringIO(TEST_SPLIT), features_split, split=True)
    assert_equal(list(y), list("OBI"))
Code Example #32
            yield "folUpper"
        if re.search(r"\d", nnp.lower()):
            yield "folNumber"
        yield "folword=" + nnp.lower()
    if p.isupper() and len(p) == 3:
        yield "Uppercase"
    if re.search(r"\d", p.lower()):
        yield "Number"
    if len(p) > 8:  # check if current word is unusually long
        yield "Long"


if __name__ == '__main__':
    train_path = "../Data/bio-ner/train"
    dev_path = "../Data/bio-ner/dev"

    # create_file(train_path, "train")
    # create_file(dev_path, "dev")

    X_train, y_train, l_train = load_conll("train", features)
    X_test, y_test, l_test = load_conll("dev", features)

    per = StructuredPerceptron(lr_exponent=0.15, max_iter=300, verbose=1)
    per.fit(X_train, y_train, l_train)

    y_p = per.predict(X_test, l_test)
    # for x in zip(y_p, y_test):
    #     print(x)

    print(bio_f_score(y_test, y_p))