Example #1
def predict_label(words, masks, chars, predict_fn, alphabet_label):
    predict_list = []
    for batch in utils.iterate_minibatches(words, masks=masks, char_inputs=chars):
        word_inputs, mask_inputs, char_inputs = batch
        predicts = predict_fn(word_inputs, mask_inputs, char_inputs)
        predict_list += utils.output_predictions(predicts, mask_inputs, alphabet_label)
    return predict_list
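A minimal usage sketch for predict_label (everything named here — network, the Theano input variables, X_test, mask_test, C_test and alphabet_label — is an assumption standing in for the caller's own objects, not part of the example). It assumes, as the eval function in Example #2 below suggests, that the prediction returned by utils.crf_accuracy can be computed from the network energies alone:

# hedged sketch, not the original code
energies = lasagne.layers.get_output(network, deterministic=True)
prediction, _ = utils.crf_accuracy(energies, target_var)
predict_fn = theano.function([input_var, mask_var, char_input_var], prediction)
predicted = predict_label(X_test, mask_test, C_test, predict_fn, alphabet_label)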
Example #2
def train_model(num_data, batch_size, learning_rate, patience, decay_rate,
                X_train, Y_train, mask_train, C_train, X_dev, Y_dev, mask_dev,
                C_dev, X_test, Y_test, mask_test, C_test, input_var,
                target_var, mask_var, char_input_var, model, model_name,
                label_alphabet, output_dir):
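    # num_tokens counts the real (unmasked) tokens in a batch; it is used to
    # normalise the token-level accuracy reported during training/evaluation.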
    num_tokens = mask_var.sum(dtype=theano.config.floatX)
    energies_train = lasagne.layers.get_output(model)
    energies_eval = lasagne.layers.get_output(model, deterministic=True)
    loss_train = utils.crf_loss(energies_train, target_var, mask_var).mean()
    loss_eval = utils.crf_loss(energies_eval, target_var, mask_var).mean()
    _, corr_train = utils.crf_accuracy(energies_train, target_var)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = utils.crf_accuracy(energies_eval, target_var)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)
    params = lasagne.layers.get_all_params(model, trainable=True)
    updates = lasagne.updates.momentum(loss_train,
                                       params=params,
                                       learning_rate=learning_rate,
                                       momentum=0.9)
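    # compile one Theano function for a training step (with updates) and one
    # for deterministic evaluation (loss, correct tokens and predictions).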
    train_fn = theano.function(
        [input_var, target_var, mask_var, char_input_var],
        [loss_train, corr_train, num_tokens],
        updates=updates)
    eval_fn = theano.function(
        [input_var, target_var, mask_var, char_input_var],
        [loss_eval, corr_eval, num_tokens, prediction_eval])
    num_batches = num_data / batch_size
    num_epochs = 20
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
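    # main loop: train, evaluate on dev every epoch, evaluate on test whenever
    # dev improves, and stop early after `patience` epochs without improvement.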
    for epoch in range(1, num_epochs + 1):
        print('Epoch %d (learning rate=%.4f, decay rate=%.4f): ' %
              (epoch, lr, decay_rate))
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train,
                                               Y_train,
                                               masks=mask_train,
                                               char_inputs=C_train,
                                               batch_size=batch_size,
                                               shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err, corr, num = train_fn(inputs, targets, masks, char_inputs)
            train_err += err * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data, train_err
                / train_inst, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        assert train_inst == num_data
        sys.stdout.write("\b" * num_back)
        print('train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' %
              (min(train_batches * batch_size,
                   num_data), num_data, train_err / num_data,
               train_corr * 100 / train_total, time.time() - start_time))
        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0
        for batch in utils.iterate_minibatches(X_dev,
                                               Y_dev,
                                               masks=mask_dev,
                                               char_inputs=C_dev,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks,
                                                  char_inputs)
            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            utils.output_predictions(predictions,
                                     targets,
                                     masks,
                                     output_dir + '/dev%d' % epoch,
                                     label_alphabet,
                                     is_flattened=False)
        print('dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
              (dev_err / dev_inst, dev_corr, dev_total,
               dev_corr * 100 / dev_total))
        if model_name != 'pos':
            # score the written predictions with the CoNLL evaluation script
            with open(output_dir + '/dev%d' % epoch) as dev_output:
                p1 = subprocess.Popen(shlex.split("perl conlleval.pl"),
                                      stdin=dev_output)
                p1.wait()
        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch
            # evaluate on test data when better performance detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            test_inst = 0
            for batch in utils.iterate_minibatches(X_test,
                                                   Y_test,
                                                   masks=mask_test,
                                                   char_inputs=C_test,
                                                   batch_size=batch_size):
                inputs, targets, masks, char_inputs = batch
                err, corr, num, predictions = eval_fn(inputs, targets, masks,
                                                      char_inputs)
                test_err += err * inputs.shape[0]
                test_corr += corr
                test_total += num
                test_inst += inputs.shape[0]
                utils.output_predictions(predictions,
                                         targets,
                                         masks,
                                         output_dir + '/test%d' % epoch,
                                         label_alphabet,
                                         is_flattened=False)
            np.savez('pre-trained-model/' + model_name + '/weights',
                     *lasagne.layers.get_all_param_values(model))
            print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
                  (test_err / test_inst, test_corr, test_total,
                   test_corr * 100 / test_total))
            if model_name != 'pos':
                with open(output_dir + '/test%d' % epoch) as test_output:
                    p1 = subprocess.Popen(shlex.split("perl conlleval.pl"),
                                          stdin=test_output)
                    p1.wait()
            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr
        # stop early if dev performance has not improved for `patience` consecutive epochs
        if stop_count == patience:
            break
        # re-compile the training function with the decayed learning rate
        lr = learning_rate / (1.0 + epoch * decay_rate)
        updates = lasagne.updates.momentum(loss_train,
                                           params=params,
                                           learning_rate=lr,
                                           momentum=0.9)
        train_fn = theano.function(
            [input_var, target_var, mask_var, char_input_var],
            [loss_train, corr_train, num_tokens],
            updates=updates)
    # print best performance on test data
    print("final best loss test performance (at epoch %d)" % (best_epoch_loss))
    print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
          (best_loss_test_err / test_inst, best_loss_test_corr, test_total,
           best_loss_test_corr * 100 / test_total))
    print("final best acc test performance (at epoch %d)" % (best_epoch_acc))
    print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
          (best_acc_test_err / test_inst, best_acc_test_corr, test_total,
           best_acc_test_corr * 100 / test_total))
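The learning rate in this example follows an inverse-time decay, lr = learning_rate / (1 + epoch * decay_rate), recomputed after every epoch. A tiny standalone sketch of how that schedule behaves (the starting rate and decay value are illustrative assumptions):

# illustrative values only, not taken from the example
learning_rate, decay_rate = 0.01, 0.05
for epoch in range(1, 6):
    lr = learning_rate / (1.0 + epoch * decay_rate)
    print('epoch %d: lr = %.6f' % (epoch, lr))
# starts at 0.009524 for epoch 1 and keeps shrinking, roughly as 1/epoch for large epoch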
Example #3
import numpy as np
from utils import load_train_data, output_predictions, add_sex_bit, fill_fare, add_title, test_algo
from sklearn.tree import DecisionTreeClassifier


def logarize_fare(X):
    X['FareLog'] = np.log10(X['Fare'] + 1)


df, X, Y = load_train_data([add_sex_bit, fill_fare, add_title, logarize_fare])

features = ['Pclass', 'SibSp', 'Parch', 'FareLog', 'SexBit', 'Title']

test_algo(DecisionTreeClassifier, X[features], Y,
          "Decision Tree with Title instead of Age and log(Fare+1)", {
              'max_depth': 10,
              'min_samples_leaf': 8
          })

classifier = DecisionTreeClassifier(max_depth=10, min_samples_leaf=8)
classifier.fit(X[features], Y)

output_predictions(classifier, "06_submission.csv",
                   [add_sex_bit, fill_fare, add_title, logarize_fare],
                   features)

print
print "Importance of features:"
print features
print classifier.feature_importances_
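A quick standalone check of what logarize_fare adds (the toy DataFrame below is an assumption; in the script above the Fare column comes from the Titanic training data returned by load_train_data):

import numpy as np
import pandas as pd

toy = pd.DataFrame({'Fare': [0.0, 7.25, 512.33]})
toy['FareLog'] = np.log10(toy['Fare'] + 1)
print(toy)
# FareLog is roughly 0.000, 0.916 and 2.710; the +1 keeps zero fares finite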
Example #4
text_field = pickle.load(
    open(f'{PATH}/{pretrained_lang_model_name}/TEXT.pkl', 'rb'))
learner = get_text_classifier_model(
    text_field,
    LEVEL_LABEL,
    model_name=pretrained_lang_model_name + '_classifier',
    pretrained_lang_model_name=pretrained_lang_model_name)

m = learner.model
to_test_mode(m)

# logging.info(f'Accuracy is {accuracy_np(*learner.predict_with_targs())}')

with open(f'data/{pretrained_lang_model_name}/test/contexts.src', 'r') as f:
    counter = 0
    for line in f:
        if counter > 30:
            break
        counter += 1
        print(f'{counter}\n')
        output_predictions(m, text_field, LEVEL_LABEL, line, 3)

back_to_train_mode(m, bs)

#plotting confusion matrix
#preds = np.argmax(probs, axis=1)
# probs = probs[:,1]
# from sklearn.metrics import confusion_matrix
# cm = confusion_matrix(y, preds)
# plot_confusion_matrix(cm, data.classes)
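The commented-out block above sketches a confusion matrix over the classifier's outputs. A self-contained version of that step with scikit-learn (probs and y below are made-up stand-ins for the model's predicted probabilities and the true labels):

import numpy as np
from sklearn.metrics import confusion_matrix

probs = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])  # assumed (n_samples, n_classes) scores
y = np.array([0, 1, 0])                                  # assumed true labels
preds = np.argmax(probs, axis=1)
cm = confusion_matrix(y, preds)
print(cm)  # rows are true classes, columns are predicted classes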
Example #5
for min_samples in range(1, 21):
    for max_depth in range(1, 21):
        test_algo(
            DecisionTreeClassifier, X, Y,
            "Decision Tree with max_depth=%d and min_samples_leaf=%d" %
            (max_depth, min_samples), {
                'max_depth': max_depth,
                'min_samples_leaf': min_samples
            })
print

algo, options, name = best_classifier
print "Best overall: %s with %.5f" % (name, best_score)

# re-train best algo with whole training set
classifier = algo(**options)
classifier.fit(X, Y)

output_predictions(classifier, '04_submission.csv', formatting_functions,
                   features)
plot_learning_curve(name, algo, options, X, Y, min_size=50, n_steps=50)

print '=' * 100
print

# Random forests

best_score = 0.0
best_classifier = ()

test_algo(RandomForestClassifier, X, Y, "Random Forest with 10 trees")
test_algo(RandomForestClassifier, X, Y, "Random Forest with 50 trees",
          {'n_estimators': 50})
test_algo(RandomForestClassifier, X, Y, "Random Forest with 100 trees",
          {'n_estimators': 100})
Example #6
# SVM, searching for the best value of C
kf = cross_validation.KFold(n=len(X), n_folds=8, indices=True)
grid = GridSearchCV(estimator=SVC(random_state=0),
                    cv=kf,
                    verbose=1,
                    param_grid=dict(C=[
                        0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 5.0,
                        10.0, 15.0, 30.0, 100.0
                    ]))
grid.fit(X[features], Y)

svm_score = grid.best_score_
svm_C = grid.best_params_['C']

print "Best parameters for SVM: C=%5g with score=%.5f" % (svm_C, svm_score)

svm_classifier = SVC(C=svm_C, random_state=0)
svm_classifier.fit(X[features], Y)

output_predictions(svm_classifier, "06_submission_svm.csv", formatting,
                   features)
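The grid search in this example uses the pre-0.18 scikit-learn cross_validation API (KFold(n=..., n_folds=..., indices=True)). A sketch of the equivalent call with the current model_selection API, reusing the same C grid and the X[features] / Y data from above:

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVC

kf = KFold(n_splits=8)
grid = GridSearchCV(estimator=SVC(random_state=0),
                    cv=kf,
                    verbose=1,
                    param_grid={'C': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0,
                                      3.0, 5.0, 10.0, 15.0, 30.0, 100.0]})
grid.fit(X[features], Y)
print("Best C: %g (score %.5f)" % (grid.best_params_['C'], grid.best_score_))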

# Tree ("model 3")
tree_classifier = DecisionTreeClassifier(max_depth=6,
                                         min_samples_leaf=6,
                                         random_state=0)
tree_classifier.fit(X[features], Y)

# Random forest with 200 trees
forest_classifier = RandomForestClassifier(n_estimators=200,
                                           max_depth=6,
                                           min_samples_leaf=6,
                                           random_state=0)
forest_classifier.fit(X[features], Y)

Example #7
test_algo(DecisionTreeClassifier, X, Y,
          "Decision Tree with Embarked, Deck and Title", {
              'max_depth': 6,
              'min_samples_leaf': 6,
              'random_state': 2
          })

# classifier = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_leaf=6)
classifier = DecisionTreeClassifier(max_depth=6,
                                    min_samples_leaf=6,
                                    random_state=2)
classifier.fit(X, Y)

print "Importance of features:"
print features1
print classifier.feature_importances_
print metrics.classification_report(Y.values, classifier.predict(X))

output_predictions(classifier, '05_submission_1.csv', formatting_functions1,
                   features1)
print

# title instead of age (no embarked nor deck)
formatting_functions2 = [add_sex_bit, fill_fare, add_title]
features2 = ['Pclass', 'SibSp', 'Parch', 'Fare', 'SexBit', 'Title']

df, X, Y = load_train_data(formatting_functions2)
X = X[features2]

# test_algo(RandomForestClassifier, X, Y, "Random Forest with Title and no Age",
# 			{'n_estimators': 200, 'max_depth': 6, 'min_samples_leaf': 6, 'random_state': 2})
test_algo(DecisionTreeClassifier, X, Y, "Decision Tree with Title and no Age",
          {
              'max_depth': 6,
              'min_samples_leaf': 6,
              'random_state': 2
          })
print
for min_samples in range(1,21):
	for max_depth in range(1,21):
		test_algo(DecisionTreeClassifier, X, Y, "Decision Tree with max_depth=%d and min_samples_leaf=%d" % 
			(max_depth, min_samples), {'max_depth': max_depth, 'min_samples_leaf': min_samples})
print


algo, options, name = best_classifier
print "Best overall: %s with %.5f" % (name, best_score)

# re-train best algo with whole training set
classifier = algo(**options)
classifier.fit(X, Y)

output_predictions(classifier, '04_submission.csv', formatting_functions, features)
plot_learning_curve(name, algo, options, X, Y, min_size=50, n_steps=50)

print '='*100
print

# Random forests

best_score = 0.0
best_classifier = ()

test_algo(RandomForestClassifier, X, Y, "Random Forest with 10 trees")
test_algo(RandomForestClassifier, X, Y, "Random Forest with 50 trees", {'n_estimators': 50})
test_algo(RandomForestClassifier, X, Y, "Random Forest with 100 trees", {'n_estimators': 100})
test_algo(RandomForestClassifier, X, Y, "Random Forest with 10 trees, max_depth=6 and min_samples_leaf=6", 
			{'max_depth': 6, 'min_samples_leaf': 6})
#!/usr/bin/env python
# -*- coding: UTF8 -*-

import numpy as np
from utils import load_train_data, output_predictions, add_sex_bit, fill_fare, add_title, test_algo
from sklearn.tree import DecisionTreeClassifier

def logarize_fare(X):
	X['FareLog'] = np.log10(X['Fare'] + 1)

df, X, Y = load_train_data([add_sex_bit, fill_fare, add_title, logarize_fare])

features = ['Pclass', 'SibSp', 'Parch', 'FareLog', 'SexBit', 'Title']

test_algo(DecisionTreeClassifier, X[features], Y, "Decision Tree with Title instead of Age and log(Fare+1)", {'max_depth': 10, 'min_samples_leaf': 8})

classifier = DecisionTreeClassifier(max_depth = 10, min_samples_leaf = 8)
classifier.fit(X[features], Y)

output_predictions(classifier, "06_submission.csv", [add_sex_bit, fill_fare, add_title, logarize_fare], features)

print
print "Importance of features:"
print features
print classifier.feature_importances_
# 			{'n_estimators': 200, 'max_depth': 6, 'min_samples_leaf': 6, 'random_state': 2})

test_algo(DecisionTreeClassifier, X, Y, "Decision Tree with Embarked, Deck and Title", 
			{'max_depth': 6, 'min_samples_leaf': 6, 'random_state': 2})


# classifier = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_leaf=6)
classifier = DecisionTreeClassifier(max_depth=6, min_samples_leaf=6, random_state=2)
classifier.fit(X, Y)

print "Importance of features:"
print features1
print classifier.feature_importances_
print metrics.classification_report(Y.values, classifier.predict(X))

output_predictions(classifier, '05_submission_1.csv', formatting_functions1, features1)
print

# title instead of age (no embarked nor deck)
formatting_functions2 = [add_sex_bit, fill_fare, add_title]
features2 = ['Pclass', 'SibSp', 'Parch', 'Fare', 'SexBit', 'Title']

df, X, Y = load_train_data(formatting_functions2)
X = X[features2]

# test_algo(RandomForestClassifier, X, Y, "Random Forest with Title and no Age", 
# 			{'n_estimators': 200, 'max_depth': 6, 'min_samples_leaf': 6, 'random_state': 2})
test_algo(DecisionTreeClassifier, X, Y, "Decision Tree with Title and no Age", 
			{'max_depth': 6, 'min_samples_leaf': 6, 'random_state': 2})

# classifier = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_leaf=6)
# SVM, searching for best value for C
kf = cross_validation.KFold(n=len(X), n_folds=8, indices=True)
grid = GridSearchCV(estimator = SVC(random_state=0), cv = kf, verbose = 1,
					param_grid = dict(C=[0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 5.0, 10.0, 15.0, 30.0, 100.0]))
grid.fit(X[features], Y)

svm_score = grid.best_score_
svm_C = grid.best_params_['C']

print "Best parameters for SVM: C=%5g with score=%.5f" % (svm_C, svm_score)

svm_classifier = SVC(C = svm_C, random_state = 0)
svm_classifier.fit(X[features], Y)

output_predictions(svm_classifier, "06_submission_svm.csv", formatting, features)

# Tree ("model 3")
tree_classifier = DecisionTreeClassifier(max_depth=6, min_samples_leaf=6, random_state = 0)
tree_classifier.fit(X[features], Y)

# Random forest with 200 trees
forest_classifier = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_leaf=6, random_state = 0)
forest_classifier.fit(X[features], Y)

# Vote
class VoteClassifier(object):
	def __init__(self, classifiers):
		self.classifiers = classifiers

	def predict(self, X):
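		# NOTE: the original snippet is cut off here; the body below is an
		# assumed, minimal majority-vote completion, not the original code.
		# It assumes numpy is imported as np and binary (0/1) classifiers.
		votes = np.asarray([clf.predict(X) for clf in self.classifiers])
		# majority vote per sample: round the mean of the binary predictions
		return np.round(votes.mean(axis=0)).astype(int)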