def predict_label(words, masks, chars, predict_fn, alphabet_label):
    predict_list = []
    for batch in utils.iterate_minibatches(words, masks=masks, char_inputs=chars):
        word_inputs, mask_inputs, char_inputs = batch
        predicts = predict_fn(word_inputs, mask_inputs, char_inputs)
        predict_list += utils.output_predictions(predicts, mask_inputs, alphabet_label)
    return predict_list
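# Hypothetical usage sketch (not part of the original script): given a compiled
# Theano prediction function whose inputs are (word_inputs, mask_inputs, char_inputs),
# decode a held-out set with predict_label. The variable names X_test, mask_test,
# C_test and label_alphabet are illustrative and mirror those used in train_model below.
#
#   test_labels = predict_label(X_test, mask_test, C_test, predict_fn, label_alphabet)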
def train_model(num_data, batch_size, learning_rate, patience, decay_rate,
                X_train, Y_train, mask_train, C_train,
                X_dev, Y_dev, mask_dev, C_dev,
                X_test, Y_test, mask_test, C_test,
                input_var, target_var, mask_var, char_input_var,
                model, model_name, label_alphabet, output_dir):
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    energies_train = lasagne.layers.get_output(model)
    energies_eval = lasagne.layers.get_output(model, deterministic=True)

    loss_train = utils.crf_loss(energies_train, target_var, mask_var).mean()
    loss_eval = utils.crf_loss(energies_eval, target_var, mask_var).mean()

    _, corr_train = utils.crf_accuracy(energies_train, target_var)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = utils.crf_accuracy(energies_eval, target_var)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    params = lasagne.layers.get_all_params(model, trainable=True)
    updates = lasagne.updates.momentum(loss_train, params=params,
                                       learning_rate=learning_rate, momentum=0.9)

    train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                               [loss_train, corr_train, num_tokens], updates=updates)
    eval_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                              [loss_eval, corr_eval, num_tokens, prediction_eval])

    num_batches = num_data / batch_size
    num_epochs = 20
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate

    for epoch in range(1, num_epochs + 1):
        print('Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate))
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train,
                                               char_inputs=C_train, batch_size=batch_size,
                                               shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err, corr, num = train_fn(inputs, targets, masks, char_inputs)
            train_err += err * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_inst, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)

        # update training log after each epoch
        assert train_inst == num_data
        sys.stdout.write("\b" * num_back)
        print('train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / num_data, train_corr * 100 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev,
                                               char_inputs=C_dev, batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            utils.output_predictions(predictions, targets, masks,
                                     output_dir + '/dev%d' % epoch, label_alphabet,
                                     is_flattened=False)

        print('dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total))

        if model_name != 'pos':
            dev_output = open(output_dir + '/dev%d' % epoch)
            p1 = subprocess.Popen(shlex.split("perl conlleval.pl"), stdin=dev_output)
            p1.wait()

        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

            # evaluate on test data when better performance detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            test_inst = 0
            for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test,
                                                   char_inputs=C_test, batch_size=batch_size):
                inputs, targets, masks, char_inputs = batch
                err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
                test_err += err * inputs.shape[0]
                test_corr += corr
                test_total += num
                test_inst += inputs.shape[0]
                utils.output_predictions(predictions, targets, masks,
                                         output_dir + '/test%d' % epoch, label_alphabet,
                                         is_flattened=False)

            np.savez('pre-trained-model/' + model_name + '/weights',
                     *lasagne.layers.get_all_param_values(model))
            print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
                test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total))

            if model_name != 'pos':
                test_output = open(output_dir + '/test%d' % epoch)
                p1 = subprocess.Popen(shlex.split("perl conlleval.pl"), stdin=test_output)
                p1.wait()

            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr

        # stop if dev accuracy has not improved for `patience` consecutive epochs
        if stop_count == patience:
            break

        # re-compile the training function with the decayed learning rate
        lr = learning_rate / (1.0 + epoch * decay_rate)
        updates = lasagne.updates.momentum(loss_train, params=params,
                                           learning_rate=lr, momentum=0.9)
        train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                                   [loss_train, corr_train, num_tokens], updates=updates)

    # print best performance on test data
    print("final best loss test performance (at epoch %d)" % best_epoch_loss)
    print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_inst, best_loss_test_corr, test_total,
        best_loss_test_corr * 100 / test_total))
    print("final best acc test performance (at epoch %d)" % best_epoch_acc)
    print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_inst, best_acc_test_corr, test_total,
        best_acc_test_corr * 100 / test_total))
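# Hypothetical driver sketch (assumed, not part of the original script): the keyword
# names mirror the train_model signature above; the hyper-parameter values and the
# 'ner'/'tmp' strings are illustrative only.
#
#   train_model(num_data=X_train.shape[0], batch_size=10, learning_rate=0.01,
#               patience=5, decay_rate=0.05,
#               X_train=X_train, Y_train=Y_train, mask_train=mask_train, C_train=C_train,
#               X_dev=X_dev, Y_dev=Y_dev, mask_dev=mask_dev, C_dev=C_dev,
#               X_test=X_test, Y_test=Y_test, mask_test=mask_test, C_test=C_test,
#               input_var=input_var, target_var=target_var, mask_var=mask_var,
#               char_input_var=char_input_var, model=model, model_name='ner',
#               label_alphabet=label_alphabet, output_dir='tmp')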
text_field = pickle.load(open(f'{PATH}/{pretrained_lang_model_name}/TEXT.pkl', 'rb'))
learner = get_text_classifier_model(text_field, LEVEL_LABEL,
                                    model_name=pretrained_lang_model_name + '_classifier',
                                    pretrained_lang_model_name=pretrained_lang_model_name)
m = learner.model
to_test_mode(m)

# logging.info(f'Accuracy is {accuracy_np(*learner.predict_with_targs())}')

with open(f'data/{pretrained_lang_model_name}/test/contexts.src', 'r') as f:
    counter = 0
    for line in f:
        if counter > 30:
            break
        counter += 1
        print(f'{counter}\n')
        output_predictions(m, text_field, LEVEL_LABEL, line, 3)

back_to_train_mode(m, bs)

# plotting confusion matrix
# preds = np.argmax(probs, axis=1)
# probs = probs[:,1]
# from sklearn.metrics import confusion_matrix
# cm = confusion_matrix(y, preds)
# plot_confusion_matrix(cm, data.classes)
print
for min_samples in range(1, 21):
    for max_depth in range(1, 21):
        test_algo(DecisionTreeClassifier, X, Y,
                  "Decision Tree with max_depth=%d and min_samples_leaf=%d" % (max_depth, min_samples),
                  {'max_depth': max_depth, 'min_samples_leaf': min_samples})
print

algo, options, name = best_classifier
print "Best overall: %s with %.5f" % (name, best_score)

# re-train best algo with whole training set
classifier = algo(**options)
classifier.fit(X, Y)
output_predictions(classifier, '04_submission.csv', formatting_functions, features)
plot_learning_curve(name, algo, options, X, Y, min_size=50, n_steps=50)

print '=' * 100
print

# Random forests
best_score = 0.0
best_classifier = ()

test_algo(RandomForestClassifier, X, Y, "Random Forest with 10 trees")
test_algo(RandomForestClassifier, X, Y, "Random Forest with 50 trees", {'n_estimators': 50})
test_algo(RandomForestClassifier, X, Y, "Random Forest with 100 trees", {'n_estimators': 100})
test_algo(RandomForestClassifier, X, Y,
          "Random Forest with 10 trees, max_depth=6 and min_samples_leaf=6",
          {'max_depth': 6, 'min_samples_leaf': 6})
#!/usr/bin/env python
# -*- coding: UTF8 -*-
import numpy as np
from utils import load_train_data, output_predictions, add_sex_bit, fill_fare, add_title, test_algo
from sklearn.tree import DecisionTreeClassifier


def logarize_fare(X):
    X['FareLog'] = np.log10(X['Fare'] + 1)


df, X, Y = load_train_data([add_sex_bit, fill_fare, add_title, logarize_fare])
features = ['Pclass', 'SibSp', 'Parch', 'FareLog', 'SexBit', 'Title']

test_algo(DecisionTreeClassifier, X[features], Y,
          "Decision Tree with Title instead of Age and log(Fare+1)",
          {'max_depth': 10, 'min_samples_leaf': 8})

classifier = DecisionTreeClassifier(max_depth=10, min_samples_leaf=8)
classifier.fit(X[features], Y)
output_predictions(classifier, "06_submission.csv",
                   [add_sex_bit, fill_fare, add_title, logarize_fare], features)

print
print "Importance of features:"
print features
print classifier.feature_importances_
#           {'n_estimators': 200, 'max_depth': 6, 'min_samples_leaf': 6, 'random_state': 2})
test_algo(DecisionTreeClassifier, X, Y, "Decision Tree with Embarked, Deck and Title",
          {'max_depth': 6, 'min_samples_leaf': 6, 'random_state': 2})

# classifier = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_leaf=6)
classifier = DecisionTreeClassifier(max_depth=6, min_samples_leaf=6, random_state=2)
classifier.fit(X, Y)
print "Importance of features:"
print features1
print classifier.feature_importances_
print metrics.classification_report(Y.values, classifier.predict(X))

output_predictions(classifier, '05_submission_1.csv', formatting_functions1, features1)
print

# title instead of age (no embarked nor deck)
formatting_functions2 = [add_sex_bit, fill_fare, add_title]
features2 = ['Pclass', 'SibSp', 'Parch', 'Fare', 'SexBit', 'Title']
df, X, Y = load_train_data(formatting_functions2)
X = X[features2]

# test_algo(RandomForestClassifier, X, Y, "Random Forest with Title and no Age",
#           {'n_estimators': 200, 'max_depth': 6, 'min_samples_leaf': 6, 'random_state': 2})
test_algo(DecisionTreeClassifier, X, Y, "Decision Tree with Title and no Age",
          {'max_depth': 6, 'min_samples_leaf': 6, 'random_state': 2})

# classifier = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_leaf=6)
# SVM, searching for best value for C
kf = cross_validation.KFold(n=len(X), n_folds=8, indices=True)
grid = GridSearchCV(estimator=SVC(random_state=0), cv=kf, verbose=1,
                    param_grid=dict(C=[0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0,
                                       3.0, 5.0, 10.0, 15.0, 30.0, 100.0]))
grid.fit(X[features], Y)
svm_score = grid.best_score_
svm_C = grid.best_params_['C']
print "Best parameters for SVM: C=%5g with score=%.5f" % (svm_C, svm_score)

svm_classifier = SVC(C=svm_C, random_state=0)
svm_classifier.fit(X[features], Y)
output_predictions(svm_classifier, "06_submission_svm.csv", formatting, features)

# "Model 3" decision tree
tree_classifier = DecisionTreeClassifier(max_depth=6, min_samples_leaf=6, random_state=0)
tree_classifier.fit(X[features], Y)

# Random forest with 200 trees
forest_classifier = RandomForestClassifier(n_estimators=200, max_depth=6,
                                           min_samples_leaf=6, random_state=0)
forest_classifier.fit(X[features], Y)

# Vote
class VoteClassifier(object):
    def __init__(self, classifiers):
        self.classifiers = classifiers

    def predict(self, X):
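        # Assumed completion: the original body is not shown in this excerpt.
        # A minimal majority-vote sketch over the wrapped classifiers; assumes
        # numpy is imported as np elsewhere in the file.
        votes = np.asarray([clf.predict(X) for clf in self.classifiers]).astype(int)
        # pick the most frequent predicted class for each sample
        return np.array([np.bincount(sample_votes).argmax() for sample_votes in votes.T])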