def test_accuracy():
    """Check ``whole_sequence_accuracy`` on five concatenated sequences.

    The reference and predicted sequences are joined into one flat string
    each, and the per-sequence lengths are passed alongside so the
    boundaries can be recovered. Three of the five predicted sequences are
    identical to their references, which matches the expected score of .6.
    """
    references = ["0111001", "1001", "00011111", "010101011", "1110"]
    predictions = ["0010010", "1001", "00011110", "010101011", "1110"]

    flat_true = ''.join(references)
    flat_pred = ''.join(predictions)
    seq_lengths = [len(seq) for seq in references]

    score = whole_sequence_accuracy(flat_true, flat_pred, seq_lengths)
    assert_equal(.6, score)
def test_accuracy():
    """Check ``whole_sequence_accuracy`` on five concatenated sequences.

    The reference and predicted sequences are joined into one flat string
    each, and the per-sequence lengths are passed alongside so the
    boundaries can be recovered. Three of the five predicted sequences are
    identical to their references, which matches the expected score of .6.
    """
    y_true = ["0111001", "1001", "00011111", "010101011", "1110"]
    y_pred = ["0010010", "1001", "00011110", "010101011", "1110"]

    # Pass a concrete list of lengths: on Python 3 ``map`` returns a lazy,
    # single-use iterator rather than a list, so it cannot be treated as a
    # sequence of ints by the callee. A list behaves the same on Python 2.
    lengths = [len(y) for y in y_true]

    assert_equal(
        .6,
        whole_sequence_accuracy(''.join(y_true), ''.join(y_pred), lengths))
def testHMM(clf, X_test, y_test):
    """Predict labels for the test data and print them with their score.

    All of ``X_test`` / ``y_test`` is treated as a single sequence: the
    lengths argument handed to the model and to the metric is
    ``[len(y_test)]``.

    Parameters
    ----------
    clf
        Model object exposing ``predict(X, lengths)``; expected to be
        trained already (this is the validation step after training).
    X_test
        Test samples passed straight through to ``clf.predict``.
    y_test
        Reference labels for ``X_test``; must support ``len()``.

    Returns
    -------
    None
        The predictions and the score are printed, not returned.
    """
    # Validation after training
    lengths = [len(y_test)]
    y_pred = clf.predict(X_test, lengths)
    # Call form of print: valid on Python 3, and prints a single argument
    # identically on Python 2 (the old ``print y_pred`` statement is a
    # SyntaxError on Python 3).
    print(y_pred)

    # Final score
    print(whole_sequence_accuracy(y_test, y_pred, lengths))
def testHMM(clf, data):
    """Load a test set, predict its labels, and print them with their score.

    Parameters
    ----------
    clf
        Model object exposing ``predict(X, lengths)``; expected to be
        trained already (this is the validation step after training).
    data
        Source handed to ``load_conll`` as its first argument.

    Returns
    -------
    None
        The predictions and the score are printed, not returned.

    Notes
    -----
    ``features`` is not a parameter; it is resolved from the enclosing
    module scope when ``load_conll`` is called.
    """
    # Validation after training
    X_test, y_test, lengths_test = load_conll(data, features)
    y_pred = clf.predict(X_test, lengths_test)
    # Call form of print: valid on Python 3, and prints a single argument
    # identically on Python 2 (the old ``print y_pred`` statement is a
    # SyntaxError on Python 3).
    print(y_pred)

    # Final score
    print(whole_sequence_accuracy(y_test, y_pred, lengths_test))
open("finer-data/data/digitoday.2014.train.csv", "r"), features) clf = StructuredPerceptron(decode="bestfirst", verbose=1, random_state=0) print("Fitting model " + str(clf)) clf.fit(X_train, y_train, lengths_train) print("\nPredictions on dev set") # читаем отладочное множество X_dev, y_dev, lengths_dev = load_conll( open("finer-data/data/digitoday.2014.dev.csv", "r"), features) y_pred = clf.predict(X_dev, lengths_dev) print("Whole seq accuracy ", whole_sequence_accuracy(y_dev, y_pred, lengths_dev)) print("Element-wise accuracy ", accuracy_score(y_dev, y_pred)) print("Mean F1-score macro ", f1_score(y_dev, y_pred, average="macro")) print(classification_report(y_dev, y_pred)) print(pd.Series(y_pred).value_counts()) print("\nPredictions on test set") # читаем тестовое множество X_test, y_test, lengths_test = load_conll( open("finer-data/data/digitoday-fixed.2015.test.csv", "r"), features) y_pred = clf.predict(X_test, lengths_test) print("Whole seq accuracy ", whole_sequence_accuracy(y_test, y_pred, lengths_test)) print("Element-wise accuracy ", accuracy_score(y_test, y_pred))
# читаем обучающее множество X_train, y_train, lengths_train = load_conll(open("../resources/train.data", "r"), features) clf = StructuredPerceptron(decode="viterbi", lr_exponent=.05, max_iter=30) print("Fitting model " + str(clf)) clf.fit(X_train, y_train, lengths_train) print("\nPredictions on dev set") # читаем отладочное множество X_dev, y_dev, lengths_dev = load_conll(open("../resources/dev.data", "r"), features) y_pred = clf.predict(X_dev, lengths_dev) print("Whole seq accuracy ", whole_sequence_accuracy(y_dev, y_pred, lengths_dev)) print("Element-wise accuracy ", accuracy_score(y_dev, y_pred)) print("Mean F1-score macro ", f1_score(y_dev, y_pred, average="macro")) print("\nPredictions on test set") # читаем тестовое множество X_test, _, lengths_test = load_conll(open("../resources/test.data", "r"), features) y_pred = clf.predict(X_test, lengths_test) print(pd.Series(y_pred).value_counts()) print("Saving predicted as a submission") with open("submission.csv", "w") as wf: wf.write("id,tag\n")
def test_accuracy():
    """Verify the whole-sequence accuracy of a small hand-made example.

    Five label sequences are concatenated into a single string for the
    references and another for the predictions; the lengths of the
    individual sequences are supplied separately. The first and third
    predictions differ from their references and the other three are exact
    matches, in line with the expected value of .6.
    """
    gold = ["0111001", "1001", "00011111", "010101011", "1110"]
    guess = ["0010010", "1001", "00011110", "010101011", "1110"]

    lengths = list(map(len, gold))
    observed = whole_sequence_accuracy(''.join(gold), ''.join(guess), lengths)

    assert_equal(.6, observed)
# next word's length yield "next_len=" + str(get_word_len(next_)) # last letters of the next word yield "next_last_letters=" + (next_[-4:] if len(next_) > 4 else next_) yield "next_word_shape=" + get_word_shape(next_) yield "next_short_word_shape=" + get_short_word_shape(next_) # читаем обучающее множество X_train, y_train, lengths_train = load_conll( open("resources/talbanken-stanford-1.2/talbanken-stanford-train.tsv", "r"), features) clf = StructuredPerceptron(decode="viterbi", verbose=1, random_state=0) print("Fitting model " + str(clf)) clf.fit(X_train, y_train, lengths_train) print("\nPredictions on test set") # читаем тестовое множество X_test, y_test, lengths_test = load_conll( open("resources/talbanken-stanford-1.2/talbanken-stanford-test.tsv", "r"), features) y_pred = clf.predict(X_test, lengths_test) print("Whole seq accuracy ", whole_sequence_accuracy(y_test, y_pred, lengths_test)) print("Element-wise accuracy ", accuracy_score(y_test, y_pred)) print("Mean F1-score macro ", f1_score(y_test, y_pred, average="macro")) print(classification_report(y_test, y_pred)) print(pd.Series(y_pred).value_counts())
print("Running fold %d for set %d" % (cv, set)) clf=hmm.GMMHMM(n_components=2,n_mix=4,n_iter=100) clf.fit(x, train_lengths) pred = [row for row in clf.predict(tx, test_lengths)] pred_last = [] ty_last = [] length_count = 0 for i in range(0, len(test_lengths)): length_count += test_lengths[i] pred_last.append(pred[length_count - 1]) ty_last.append(ty[length_count-1]) hmm_pred.append(pred) acc_ws_0 = whole_sequence_accuracy(ty, pred, test_lengths) acc_last_0 = accuracy_score(ty_last, pred_last) mcc_ws_0 = matthews_corrcoef(ty, pred) mcc_last_0 = matthews_corrcoef(ty_last, pred_last) acc_ws_1 = whole_sequence_accuracy([(z + 1)%2 for z in ty], pred, test_lengths) acc_last_1 = accuracy_score([(z + 1)%2 for z in ty_last], pred_last) mcc_ws_1 = matthews_corrcoef([(z + 1)%2 for z in ty], pred) mcc_last_1 = matthews_corrcoef([(z + 1)%2 for z in ty_last], pred_last) if acc_last_0 > acc_last_1: acc_ws = acc_ws_0 acc_last = acc_last_0 mcc_ws = mcc_ws_0 mcc_last = mcc_last_0
def test_accuracy():
    """Verify the whole-sequence accuracy of a small hand-made example.

    Five label sequences are concatenated into a single string for the
    references and another for the predictions; the lengths of the
    individual sequences are supplied separately. The first and third
    predictions differ from their references and the other three are exact
    matches, in line with the expected value of .6.
    """
    y_true = ["0111001", "1001", "00011111", "010101011", "1110"]
    y_pred = ["0010010", "1001", "00011110", "010101011", "1110"]

    # Materialize the lengths as a list. A bare ``map(len, y_true)`` is a
    # list only on Python 2; on Python 3 it is a lazy, single-use iterator
    # and cannot stand in for a sequence of ints.
    lengths = [len(y) for y in y_true]

    assert_equal(.6, whole_sequence_accuracy(''.join(y_true),
                                             ''.join(y_pred),
                                             lengths))