Example 1
def test_perceptron():
    X = [[0, 1, 0],
         [0, 1, 0],
         [1, 0, 0],
         [0, 1, 0],
         [1, 0, 0],
         [0, 0, 1],
         [0, 0, 1],
         [0, 1, 0],
         [1, 0, 0],
         [1, 0, 0]]

    y = [0, 0, 0, 0, 0, 1, 1, 0, 2, 2]

    clf = StructuredPerceptron(verbose=False, random_state=37, max_iter=15)
    clf.fit(X, y, [len(y)])
    assert_array_equal(y, clf.predict(X))

    # Try again with string labels and sparse input.
    y_str = np.array(["eggs", "ham", "spam"])[y]

    clf = clone(clf)
    clf.fit(csc_matrix(X), y_str, [len(y_str)])
    assert_array_equal(y_str, clf.predict(coo_matrix(X)))

    X2 = np.vstack([X, X])
    y2 = np.hstack([y_str, y_str])
    assert_array_equal(y2, clf.predict(X2, lengths=[len(y), len(y)]))

    # Train with Viterbi, test with best-first to make StructuredPerceptron
    # behave a bit more like a linear model.
    clf.fit(X, y, [len(y)])
    clf.set_params(decode="bestfirst")
    y_linearmodel = np.dot(X, clf.coef_.T).argmax(axis=1)
    assert_array_equal(clf.predict(X), y_linearmodel)
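The snippets in this listing are shown without their imports. A minimal sketch of the setup the test examples assume (grouping them under one header is my assumption; seqlearn.perceptron is the library's actual module path):

# Sketch of the imports used by the test snippets in this listing.
import numpy as np
from numpy.testing import assert_array_equal, assert_raises
from scipy.sparse import coo_matrix, csc_matrix
from sklearn.base import clone

from seqlearn.perceptron import StructuredPerceptron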
Example 2
def test_perceptron():
    X = [[0, 1, 0],
         [0, 1, 0],
         [1, 0, 0],
         [0, 1, 0],
         [1, 0, 0],
         [0, 0, 1],
         [0, 0, 1],
         [0, 1, 0],
         [1, 0, 0],
         [1, 0, 0]]

    y = [0, 0, 0, 0, 0, 1, 1, 0, 2, 2]

    clf = StructuredPerceptron(verbose=False, random_state=37, max_iter=15)
    clf.fit(X, y, [len(y)])
    assert_array_equal(y, clf.predict(X))

    # Try again with string labels and sparse input.
    y_str = np.array(["eggs", "ham", "spam"])[y]

    clf = clone(clf)
    clf.fit(csc_matrix(X), y_str, [len(y_str)])
    assert_array_equal(y_str, clf.predict(coo_matrix(X)))

    X2 = np.vstack([X, X])
    y2 = np.hstack([y_str, y_str])
    assert_array_equal(y2, clf.predict(X2, lengths=[len(y), len(y)]))
Example 3
def test_perceptron():
    X = [[0, 1, 0], [0, 1, 0], [1, 0, 0], [0, 1, 0], [1, 0, 0], [0, 0, 1],
         [0, 0, 1], [0, 1, 0], [1, 0, 0], [1, 0, 0]]

    y = [0, 0, 0, 0, 0, 1, 1, 0, 2, 2]

    clf = StructuredPerceptron(verbose=False, random_state=37, max_iter=15)
    clf.fit(X, y, [len(y)])
    assert_array_equal(y, clf.predict(X))

    # Try again with string labels and sparse input.
    y_str = np.array(["eggs", "ham", "spam"])[y]

    clf = clone(clf)
    clf.fit(csc_matrix(X), y_str, [len(y_str)])
    assert_array_equal(y_str, clf.predict(coo_matrix(X)))

    X2 = np.vstack([X, X])
    y2 = np.hstack([y_str, y_str])
    assert_array_equal(y2, clf.predict(X2, lengths=[len(y), len(y)]))
Example 4
def test_perceptron_mask():
    X = [[0, 1, 0],
         [0, 1, 0],
         [1, 0, 0],
         [0, 1, 0],
         [1, 0, 0],
         [0, 0, 1],
         [0, 0, 1],
         [0, 1, 0],
         [1, 0, 0],
         [1, 0, 0]]

    y = [0, 0, 0, 0, 0, 1, 1, 0, 2, 2]
    
    trans_constraints = [('spam','eggs'), ('spam', 'ham')]

    clf = StructuredPerceptron(verbose=True, random_state=42, max_iter=15,
                               trans_constraints=trans_constraints)

    # Try again with string labels and sparse input.
    y_str = np.array(["eggs", "ham", "spam"])[y]

    
    clf.fit(csc_matrix(X), y_str, [len(y_str)])
    
    # Still fits
    assert_array_equal(y_str, clf.predict(coo_matrix(X)))
    # Weights are overridden properly
    assert_array_equal([clf.intercept_trans_[2,0], clf.intercept_trans_[2,1]], 
                       [clf.CONSTRAINT_VALUE]*2)
                       
    # Add impossible constraints and the model should fail to converge
    impossible_constraints = [('spam','eggs'), ('eggs', 'ham')]
    clf2 = StructuredPerceptron(verbose=True, random_state=12, max_iter=15,
                               trans_constraints=impossible_constraints)
    
    clf2.fit(csc_matrix(X), y_str, [len(y_str)])
    
    # Should raise error saying that prediction is incorrect
    assert_raises(AssertionError, assert_array_equal, y_str, clf2.predict(coo_matrix(X)))
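The assertions above use attributes that seqlearn's sequence classifiers expose after fitting: classes_ for the label order and intercept_trans_ for the transition score matrix. A self-contained sketch of inspecting them with the plain StructuredPerceptron (trans_constraints belongs to the forked class used above; the toy data here is illustrative):

# Sketch: inspect learned transition scores on a toy problem like the one above.
import numpy as np
from seqlearn.perceptron import StructuredPerceptron

X = np.eye(3)[[0, 0, 1, 1, 2, 2]]            # six one-hot observations
y = ["eggs", "eggs", "ham", "ham", "spam", "spam"]

clf = StructuredPerceptron(random_state=37, max_iter=15)
clf.fit(X, y, [len(y)])
# classes_ gives the label order; intercept_trans_[i, j] is read here as the
# score of a transition from classes_[i] to classes_[j], matching the indexing
# used in test_perceptron_mask above (this interpretation is an assumption).
print(clf.classes_)
print(clf.intercept_trans_)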
Example 5
def test_perceptron():
    X = [[0, 1, 0],
         [0, 1, 0],
         [1, 0, 0],
         [0, 1, 0],
         [0, 0, 1],
         [0, 0, 1],
         [0, 1, 0],
         [1, 0, 0],
         [1, 0, 0]]

    y = [0, 0, 0, 0, 1, 1, 0, 2, 2]

    clf = StructuredPerceptron(verbose=True).fit(X, y, [len(y)])
    assert_array_equal(y, clf.predict(X))
Example 6
def train_model():
    print "Loading training data..."
    X_train, y_train, lengths_train = load_conll(os.path.join( current_dir, "./../data/train.conll"), features)
    clf = StructuredPerceptron(verbose=True,max_iter = 10)
    describe(X_train, lengths_train)

    print "Loading test data..."
    X_test, y_test, lengths_test = load_conll(os.path.join( current_dir, "./../data/test.conll"), features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)

    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
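load_conll needs the features callable alongside the file: for each position i in a sequence of raw CoNLL lines it must yield string feature names. A minimal sketch of such a function (the column layout is an assumption; real feature sets, like the one excerpted in a later example, are richer):

def features(sequence, i):
    """Yield string features for the token at position i (illustrative sketch)."""
    word = sequence[i].split()[0]          # assumes the token is the first column
    yield "word=" + word.lower()
    if word[0].isupper():
        yield "Capitalized"
    if word.isdigit():
        yield "Number"
    if i > 0:
        yield "prev=" + sequence[i - 1].split()[0].lower()
    if i < len(sequence) - 1:
        yield "next=" + sequence[i + 1].split()[0].lower()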
Example 7
def main():
    print("Loading data")  #Useful messages
    dat = open(sys.argv[1])  # get filename and open the correct file
    addcol(dat, sys.argv[1])
    X_test, y_test, l_test = load_conll(
        "test.txt", features)  # load the test set created by addcol
    data = load_dat()  # yet another file loading function!
    X_train, y_train, l_train = load_con(data,
                                         features)  # the big loading file
    per = StructuredPerceptron(lr_exponent=0.35, max_iter=20,
                               verbose=1)  # Some trial and error found that
    # a lr of .35 and 20 iters worked best
    print("Fitting")
    per.fit(X_train, y_train, l_train)  # fit and predict
    y_p = per.predict(X_test, l_test)
    create_eval_file(y_p)  # save

    print("Done!")
Example 8
def train_model():
    print "Loading training data..."
    X_train, y_train, lengths_train = load_conll(
        os.path.join(current_dir, "./../data/train.conll"), features)
    clf = StructuredPerceptron(verbose=True, max_iter=10)
    describe(X_train, lengths_train)

    print "Loading test data..."
    X_test, y_test, lengths_test = load_conll(
        os.path.join(current_dir, "./../data/test.conll"), features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)

    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
Example 9
def main():
    #Load in training data and pass it through our feature function.
    #See documentation exact outputs of load_conll
    samples, labels, sentence_lengths = load_conll("data/gene-trainF18.txt",
                                                   features,
                                                   split=True)

    #Train the model with our features
    clf = StructuredPerceptron()
    clf.fit(samples, labels, sentence_lengths)

    #Evaluate our model
    test_samples, test_labels, test_sentence_lengths = load_conll(
        "data/F18-assgn4-test.txt", features, split=True)
    prediction = clf.predict(test_samples, test_sentence_lengths)

    #Output results
    i = 0
    j = 1
    output = []
    for line in open("data/F18-assgn4-test.txt"):
        if (line == "\n"):
            output.append("\n")
            j = 1
            continue
        else:
            item = str(
                j) + "\t" + line.split()[1] + "\t" + prediction[i] + "\n"
            output.append(item)
            print(item)
            i += 1
            j += 1

    with open('predictions.txt', 'w') as f:
        for item in output:
            f.write(item)
Example 10
def perceptronTest(input_data, test, actual, actual8):
    model = StructuredPerceptron(verbose=False, random_state=37, max_iter=1000)

    l = 9 * len(input_data) // 10 - 1
    scores = cross_val_score(model,
                             input_data.iloc[:, :-1],
                             input_data.iloc[:, -1],
                             cv=10,
                             fit_params={'lengths': [l]})

    print('Cross Validation Accuracy = ' + str(scores.mean()))

    model.fit(input_data.iloc[:, :-1], input_data.iloc[:, -1],
              [len(input_data)])
    pred = model.predict(test)
    accuracy = sum(pred == actual) / float(len(actual))
    spred = smoothening(pred)
    saccuracy = sum(spred == actual) / float(len(actual))
    pred8 = mergePredictions(pred, 8)
    accuracy8 = sum(pred8 == actual8) / float(len(actual8))
    print('Test Accuracy for the subject is = ' + str(accuracy))
    print('Test Accuracy after smoothening for the subject is = ' +
          str(saccuracy))
    print('Test Accuracy for the subject at step 8 is = ' + str(accuracy8))
Example 11
    alldata = [np.array([])] * 63
    for i in range(0, 63):
        col = test_df.iloc[:, i].values
        col_uni = np.unique(col[~np.isnan(col)])
        alldata[i] = np.concatenate((alldata[i], col_uni), axis=0)
    ipdb.set_trace()
    np.savetxt("test_random.txt", alldata)
    assert 0
    """
    '''
        for 5 year prediction
    '''
    # hmm = MultinomialHMM()
    hmm = StructuredPerceptron()
    hmm.fit(train_df, tr_label_5y["label_numbers"], train_seqlength)
    pred = hmm.predict(test_df, test_seqlength)
    # print(roc_auc_score(te_label_5y["label_numbers"], pred, average="macro"))

    onehot_encoder = OneHotEncoder(sparse=False)
    pred = pred.reshape(len(pred), 1)
    pred = onehot_encoder.fit_transform(pred)
    label = te_label_5y["label_numbers"].values.reshape(
        len(te_label_5y["label_numbers"]), 1)
    label = onehot_encoder.fit_transform(label)

    auc_per_class = np.zeros((5, ))
    for i in range(5):
        pred_temp = pred[:, i]
        label_temp = label[:, i]
        score = roc_auc_score(label_temp, pred_temp)
        auc_per_class[i] = score
Example 12
# read the training set
X_train, y_train, lengths_train = load_conll(
    open("finer-data/data/digitoday.2014.train.csv", "r"), features)

clf = StructuredPerceptron(decode="bestfirst", verbose=1, random_state=0)

print("Fitting model " + str(clf))
clf.fit(X_train, y_train, lengths_train)

print("\nPredictions on dev set")

# read the dev set
X_dev, y_dev, lengths_dev = load_conll(
    open("finer-data/data/digitoday.2014.dev.csv", "r"), features)
y_pred = clf.predict(X_dev, lengths_dev)

print("Whole seq accuracy    ",
      whole_sequence_accuracy(y_dev, y_pred, lengths_dev))
print("Element-wise accuracy ", accuracy_score(y_dev, y_pred))
print("Mean F1-score macro   ", f1_score(y_dev, y_pred, average="macro"))
print(classification_report(y_dev, y_pred))

print(pd.Series(y_pred).value_counts())

print("\nPredictions on test set")

# read the test set
X_test, y_test, lengths_test = load_conll(
    open("finer-data/data/digitoday-fixed.2015.test.csv", "r"), features)
y_pred = clf.predict(X_test, lengths_test)
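whole_sequence_accuracy (seqlearn.evaluation) counts a sequence as correct only when every one of its elements is predicted correctly, while the element-wise accuracy printed above is the usual token-level score. A rough equivalent of the per-sequence metric, sketched for clarity (not the library's implementation):

import numpy as np

def whole_seq_acc(y_true, y_pred, lengths):
    # Split the flat label arrays at the sequence boundaries and count
    # sequences that match exactly.
    bounds = np.cumsum(lengths)[:-1]
    true_seqs = np.split(np.asarray(y_true), bounds)
    pred_seqs = np.split(np.asarray(y_pred), bounds)
    return np.mean([np.array_equal(t, p) for t, p in zip(true_seqs, pred_seqs)])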
Example 13
    testing = "test-run-test.txt"
    outputFile = "assign4-output.txt"

    train, test = load_data(training, testing)
    X_train, y_train, lengths_train = train
    X_test, y_test, lengths_test = test
    score = 0
    for i in range(30):

        # train data sequence with perceptron in example
        clf = StructuredPerceptron(verbose=True, max_iter=10)
        print("Training %s" % clf)
        clf.fit(X_train, y_train, lengths_train)

        # predicted IOB tags sequence from test data
        y_pred = clf.predict(X_test, lengths_test)

        outputCopy = open(outputFile, "w+")
        counter = 0
        with open(testing) as file:
            for line in file:
                if line != "\n":
                    line = line.rstrip()
                    outputCopy.write(line + "\t" + y_pred[counter] + "\n")
                    counter += 1
                else:
                    outputCopy.write("\n")
        outputCopy.close()

        # Martin's evalNER score script
        outputReal = open(outputFile, "r")
Example 14
    test_files = [f for i, f in enumerate(files) if i % 5 == 0]
    test = load_conll(fileinput.input(test_files), features)
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)

    return train, test


if __name__ == "__main__":
    print(__doc__)

    #print("Loading training data...", end=" ")
    #X_train, y_train, lengths_train = load_conll(sys.argv[1], features)
    #describe(X_train, lengths_train)

    train, test = load_data()
    X_train, y_train, lengths_train = train
    X_test, y_test, lengths_test = test

    #print("Loading test data...", end=" ")
    #X_test, y_test, lengths_test = load_conll(sys.argv[2], features)
    #describe(X_test, lengths_test)

    clf = StructuredPerceptron(verbose=True, max_iter=10)
    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
Example 15
class MLVSA(object):
    def __init__(self, inst_train, label_train, inst_test, label_test, seq_len,
                 model_option):

        self.seq_len = seq_len
        self.model_option = model_option
        self.X_train, self.Y_train, self.n_class = self.list2np(
            inst_train, label_train, seq_len)
        self.X_test, self.Y_test, _, = self.list2np(inst_test, label_test,
                                                    seq_len)

    def predict_classes(self, proba):
        if proba.shape[-1] > 1:
            return proba.argmax(axis=-1)
        else:
            return (proba > 0.5).astype('int32')

    def list2np(self, inst, label, seq_len):
        label[label == 10] = 0
        label[label == 20] = 1
        label[label == 30] = 2
        label[label == 40] = 3
        n_class = 4

        num_sample = inst.shape[0] // seq_len
        X = inst[0:(num_sample * seq_len), ].reshape(num_sample, seq_len)
        Y = label[0:(num_sample * seq_len), ].reshape(num_sample, seq_len)

        return X, Y, n_class

    def fit(self):
        self.X_train = self.X_train
        self.Y_train = self.Y_train
        self.X_test = self.X_test
        self.Y_test = self.Y_test

        print('================================================')
        print("Data shape...")
        print(self.X_train.shape)
        print(self.Y_train.shape)
        print(self.X_test.shape)
        print(self.Y_test.shape)
        print("Counting the number of data in each category...")
        print(collections.Counter(self.Y_train.flatten()))
        print(collections.Counter(self.Y_test.flatten()))
        print('================================================')
        if self.model_option == 0:
            print("Using SVM >>>>>>>>>>>>>>>>>>>>>>>")
            self.model = svm.SVC(kernel='rbf', decision_function_shape='ovo')
            self.y_pred = np.zeros_like(self.Y_test)
            for i in range(self.seq_len):
                self.model.fit(self.X_train[:, i].reshape(-1, 1),
                               self.Y_train[:, i].reshape(-1, 1))
                self.y_pred[:, i] = self.model.predict(self.X_test[:,
                                                                   i].reshape(
                                                                       -1, 1))

        elif self.model_option == 1:
            print "Using RF >>>>>>>>>>>>>>>>>>>>>>>>"
            self.model = rf(n_estimators=100)
            self.y_pred = np.zeros_like(self.Y_test)
            for i in xrange(self.seq_len):
                self.model.fit(self.X_train[:, i].reshape(-1, 1),
                               self.Y_train[:, i].reshape(-1, 1))
                self.y_pred[:, i] = self.model.predict(self.X_test[:,
                                                                   i].reshape(
                                                                       -1, 1))

        elif self.model_option == 2:
            print "Using HMM >>>>>>>>>>>>>>>>>>>>>>>"
            self.X_train = self.X_train.flatten()
            self.Y_train = self.Y_train.flatten()
            self.X_train = self.X_train.reshape(self.X_train.shape[0], 1)

            self.X_test = self.X_test.flatten()
            self.Y_test = self.Y_test.flatten()
            self.X_test = self.X_test.reshape(self.X_test.shape[0], 1)

            self.model = StructuredPerceptron()
            oh = onehot()
            self.X_train = oh.fit_transform(self.X_train)
            self.X_test = oh.transform(self.X_test)
            num_seq = self.X_train.shape[0] // self.seq_len
            length_train = [self.seq_len] * num_seq
            self.model.fit(self.X_train, self.Y_train, length_train)

            num_seq = self.X_test.shape[0] // self.seq_len
            length_test = [self.seq_len] * num_seq
            self.y_pred = self.model.predict(self.X_test, length_test)

        elif self.model_option == 3:
            print "Using CRF >>>>>>>>>>>>>>>>>>>>>>>"

            trainer = pycrfsuite.Trainer(verbose=True)
            for i in range(self.X_train.shape[0]):
                trainer.append(self.X_train[i, ].astype(str),
                               self.Y_train[i, ].astype(str))

            trainer.set_params({
                'c1': 0.1,
                'c2': 0.01,
                'max_iterations': 2000,
                'feature.possible_transitions': True
            })

            trainer.train('crf.model')

            tagger = pycrfsuite.Tagger()
            tagger.open('crf.model')
            self.y_pred = []
            for i in range(self.X_test.shape[0]):
                self.y_pred.append(
                    np.array(tagger.tag(
                        self.X_test[i, ].astype(str))).astype('int'))
            self.y_pred = np.array(self.y_pred)

        print('Evaluating testing results')
        precision, recall, f1, _ = precision_recall_fscore_support(
            self.Y_test.flatten(),
            self.y_pred.flatten(),
            labels=[0, 1, 2, 3],
            average='weighted')
        print("Precision: %s Recall: %s F1: %s" % (precision, recall, f1))
        print('================================================')

        for i in range(4):
            print('Evaluating testing results of positive labels at region ' +
                  str(i))
            precision, recall, f1, _ = precision_recall_fscore_support(
                self.Y_test.flatten(),
                self.y_pred.flatten(),
                labels=[i],
                average='weighted')
            print("Precision: %s Recall: %s F1: %s" % (precision, recall, f1))
            print('================================================')

        return self.y_pred
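The model_option == 2 branch shows the pattern this class uses for StructuredPerceptron: one-hot encode the integer observations, keep the labels flat, and pass a per-sequence lengths list. A self-contained, condensed sketch of just that pattern with toy data:

# Condensed sketch of the structured-perceptron branch above (toy data).
import numpy as np
from seqlearn.perceptron import StructuredPerceptron
from sklearn.preprocessing import OneHotEncoder

seq_len = 4
obs = np.array([0, 0, 1, 1, 2, 2, 3, 3])        # two sequences of length 4
labels = np.array([0, 0, 1, 1, 2, 2, 3, 3])

enc = OneHotEncoder()
X = enc.fit_transform(obs.reshape(-1, 1))        # sparse one-hot features
lengths = [seq_len] * (len(obs) // seq_len)      # one entry per sequence

clf = StructuredPerceptron(max_iter=10)
clf.fit(X, labels, lengths)
print(clf.predict(X, lengths))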
Example 16
            yield "folUpper"
        if re.search(r"\d", nnp.lower()):
            yield "folNumber"
        yield "folword=" + nnp.lower()
    if p.isupper() and len(p) == 3:
        yield "Uppercase"
    if re.search(r"\d", p.lower()):
        yield "Number"
    if len(p) > 8:  # check if current word is unusually long
        yield "Long"


if __name__ == '__main__':
    train_path = "../Data/bio-ner/train"
    dev_path = "../Data/bio-ner/dev"

    # create_file(train_path, "train")
    # create_file(dev_path, "dev")

    X_train, y_train, l_train = load_conll("train", features)
    X_test, y_test, l_test = load_conll("dev", features)

    per = StructuredPerceptron(lr_exponent=0.15, max_iter=300, verbose=1)
    per.fit(X_train, y_train, l_train)

    y_p = per.predict(X_test, l_test)
    # for x in zip(y_p, y_test):
    #     print(x)

    print(bio_f_score(y_test, y_p))
Example 17
kf = KFold(p.L.shape[0], n_folds=2)

for train_ids, test_ids in kf:

    L_train = p.L[train_ids]
    L_test = p.L[test_ids]
    X_train = np.zeros((0, p.X.shape[1]))
    y_train = np.zeros((0,))
    X_test = np.zeros((0, p.X.shape[1]))
    y_test = np.zeros((0,))

    for i, l in enumerate(L_train):
        start = sum(L_train[:i])
        end = sum(L_train[:i+1])
        X_train = np.vstack([X_train, p.X[start:end]])
        y_train = np.append(y_train, p.Y[start:end])

    for i, l in enumerate(L_test):
        start = sum(L_test[:i])
        end = sum(L_test[:i+1])
        X_test = np.vstack([X_test, p.X[start:end]])
        y_test = np.append(y_test, p.Y[start:end])

    clf = StructuredPerceptron()
    clf.fit(X_train, y_train, L_train)

    y_pred = clf.predict(X_test, L_test)

    print("The Bio Score for Kfold = %s" % bio_f_score(y_test, y_pred))

Example 18
        next = sequence[i + 1].split("\t")[1]
        yield "next_short_word_shape=" + get_short_word_shape(next)

# read the training set
X_train, y_train, lengths_train = load_conll(open("../resources/train.data", "r"), features)

clf = StructuredPerceptron(decode="viterbi", lr_exponent=.05, max_iter=30)

print("Fitting model " + str(clf))
clf.fit(X_train, y_train, lengths_train)

print("\nPredictions on dev set")

# read the dev set
X_dev, y_dev, lengths_dev = load_conll(open("../resources/dev.data", "r"), features)
y_pred = clf.predict(X_dev, lengths_dev)

print("Whole seq accuracy    ", whole_sequence_accuracy(y_dev, y_pred, lengths_dev))
print("Element-wise accuracy ", accuracy_score(y_dev, y_pred))
print("Mean F1-score macro   ", f1_score(y_dev, y_pred, average="macro"))

print("\nPredictions on test set")

# read the test set
X_test, _, lengths_test = load_conll(open("../resources/test.data", "r"), features)
y_pred = clf.predict(X_test, lengths_test)

print(pd.Series(y_pred).value_counts())

print("Saving predicted as a submission")
Example 19
sub1_X2 = sub1_2['X']
sub1_Y2 = sub1_2['Y']

sub1_3 = scipy.io.loadmat('train_subject1_psd03.mat')
sub1_X3 = sub1_3['X']
sub1_Y3 = sub1_3['Y']

sub1_X = np.concatenate((sub1_X1, sub1_X2, sub1_X3), axis=0)
sub1_Y = np.concatenate((sub1_Y1, sub1_Y2, sub1_Y3), axis=0)

sub1_clf = StructuredPerceptron(decode='viterbi', lr_exponent=0.1, max_iter=10000,
                                random_state=None, trans_features=False, verbose=0)
sub1_clf.fit(sub1_X, sub1_Y, [len(sub1_Y)])

sub1_test = scipy.io.loadmat('test_subject1_psd04.mat')
sub1_X4 = sub1_test['X']
sub1_predicted = sub1_clf.predict(sub1_X4)
sub1_Y4 = np.loadtxt('test_subject1_true_label.csv', delimiter=",")

print('subject-1', accuracy_score(sub1_predicted, sub1_Y4))
print(confusion_matrix(sub1_Y4, sub1_predicted))
#3017/3504 : subject 1 accuracy

# start subject-2
sub2_1 = scipy.io.loadmat('train_subject2_psd01.mat')
sub2_X1 = sub2_1['X']
sub2_Y1 = sub2_1['Y']

sub2_2 = scipy.io.loadmat('train_subject2_psd02.mat')
sub2_X2 = sub2_2['X']
sub2_Y2 = sub2_2['Y']
Example 20
class SeqModel():

    def __init__(self, input_encoding, conversion_key, n_iter_seq):
        self.input_encoding = input_encoding
        self.conversion_key = conversion_key
        
        # self.model = MultinomialHMM()
        self.model = StructuredPerceptron(max_iter=n_iter_seq)

    def train(self, trainset):
        print("Training ...")
        start_time = time.time()
        plot_losses = []
        plot_distances = []
        
        X_train, Y_train, lengths_train = trainset.get_set()
        
        # Perform training
        self.model.fit(X_train, Y_train, lengths=lengths_train)
        
        duration = time.time() - start_time
        print("Duration = {0:.2f}".format(duration))
        
        return plot_losses, plot_distances
    
    def _compute_distance(self, X, Y, predictions=None, word_lengths=None):
        distances_t_p = []
        distances_s_t = []
        input_words = []
        target_words = []
        predicted_words = []
        # X, Y and predictions are long lists of characters. Split into words again.
        if predictions is not None:
            predictions = data.split_predictions(predictions, word_lengths)
        X = data.split_predictions(X, word_lengths)
        Y = data.split_predictions(Y, word_lengths)
        for ex in np.arange(len(X)):
            # X is encoded and has to be decoded
            _, input_tokens = data.word_surface(X[ex], self.conversion_key[0], self.input_encoding)
            # Y is already in surface form
            #target_word = "".join(Y[ex])
            target_tokens = Y[ex]
            
            input_cut = [t for t in input_tokens if t != "."]
            target_cut = [t for t in target_tokens if t != "."]
            if predictions is not None:
                # Predictions are already in surface form
                #predicted_word = "".join(predictions[ex])
                predicted_tokens = predictions[ex]
                predicted_cut = [t for t in predicted_tokens if t != "."]
                dist_t_p = utility.calculate_levenshtein(target_cut, predicted_cut)
                distances_t_p.append(dist_t_p)
                predicted_words.append(predicted_cut)
            dist_s_t = utility.calculate_levenshtein(input_cut, target_cut)
            distances_s_t.append(dist_s_t)
            input_words.append(input_cut)
            target_words.append(target_cut)
        if predictions is not None:
            return input_words, target_words, predicted_words, distances_t_p, distances_s_t
        else:
            return np.mean(distances_s_t)

    def predict(self, testset, print_output=True):
        all_distances_t_p = []
        all_distances_s_t = []
        all_input_words = []
        all_target_words = []
        all_predicted_words = []
        
        if print_output:
            text_output = ""
            header_template = "{0:20} {1:20} {2:20} {3:8}"
            template = "{0:20} {1:20} {2:20} {3:.2f}"
            text_output += header_template.format("INPUT", "TARGET", "PREDICTION", "DISTANCE") + "\n"

        # Fetch whole test set in format suitable for seqmodel
        X_test, Y_test, lengths_test = testset.get_set()
        predictions = self.model.predict(X_test, lengths=lengths_test)
        input_words, target_words, predicted_words, distances_t_p, distances_s_t = self._compute_distance(X_test, Y_test, predictions, lengths_test)
        all_distances_t_p += distances_t_p
        all_distances_s_t += distances_s_t
        all_input_words += input_words
        all_target_words += target_words
        all_predicted_words += predicted_words
            
        row_dict = defaultdict(list)
        for i in np.arange(len(all_input_words)):
            input_word = all_input_words[i]
            target_word = all_target_words[i]
            predicted_word = all_predicted_words[i]
            dist_t_p = all_distances_t_p[i]
            dist_s_t = all_distances_s_t[i]
            if print_output:
                text_output += template.format("".join(input_word), "".join(target_word), "".join(predicted_word), dist_t_p) + "\n"
            row_dict["INPUT"].append(" ".join(input_word))
            row_dict["TARGET"].append(" ".join(target_word))
            row_dict["PREDICTION"].append(" ".join(predicted_word))
            row_dict["DISTANCE_T_P"].append(dist_t_p)
            row_dict["DISTANCE_S_T"].append(dist_s_t)
            
            # Get information from datafile
            records = testset.get_datafile_record(i)
            row_dict["CONCEPT"].append(records[0].iloc[0]["CONCEPT"])
            # Add columns for cognate judgments of both word1 and word2
            for ix in [0, 1]:
                if "COGNATES_LEXSTAT" in records[ix]:
                    row_dict["COGNATES_LEXSTAT" + str(ix)].append(records[ix].iloc[0]["COGNATES_LEXSTAT"])
                if "COGNATES_IELEX" in records[ix]:
                    row_dict["COGNATES_IELEX" + str(ix)].append(records[ix].iloc[0]["COGNATES_IELEX"])

        avg_distance = np.average(all_distances_t_p)
        if print_output:
            text_output += "Average distance: " + str(avg_distance) + "\n"
            print(text_output)
        results_table = pd.DataFrame(row_dict)
        return avg_distance, results_table