def final_clf_training(Xs, ys, X_holdout, y_holdout, scorer_type, sanity_check=False, n_folds=10):
    """
    Train final classifier on all of the data to prepare it for the prediction of the FNC-1's unlabeled data

    :param Xs: All the training data's feature vectors, split in their folds (indexed by 0..n_folds-1)
    :param ys: All the training data's labels, split in their folds (indexed by 0..n_folds-1)
    :param X_holdout: The holdout feature vectors
    :param y_holdout: The holdout labels
    :param scorer_type: the scorer type, e.g. MLB_base (see estimator_definitions.py in utils folder)
    :param sanity_check: If true, the trained classifier predicts the labels of the data it was trained on
                         and prints out the score
    :param n_folds: Number of folds to stack from Xs and ys. Defaults to 10, the count that used to be
                    hard-coded in this function
    :return: the final classifier
    """
    # NOTE(review): this file contains another definition of final_clf_training further down;
    # Python binds the later one - confirm which copy is meant to stay.

    # stack all the feature vectors of all the folds
    fold_ids = range(n_folds)
    X_train = np.vstack(tuple(Xs[i] for i in fold_ids))
    y_train = np.hstack(tuple(ys[i] for i in fold_ids))

    # stack the holdout feature vectors on the feature vectors of all folds
    X_all = np.concatenate([X_train, X_holdout], axis=0)
    y_all = np.concatenate([y_train, y_holdout], axis=0)

    # parent folder for all trained classifiers, located under the parent of this file's directory
    parent_folder = "%s/data/fnc-1/mlp_models/" % (path.dirname(path.dirname(path.abspath(__file__))))

    # save folder for this specific classifier (obtained from get_save_folder)
    scorer_folder_name = scorer_type + "_final"
    save_folder = get_save_folder(parent_folder, scorer_folder_name + "_new")

    # get classifier; the save folder is handed to the estimator factory
    clf = esitmator_definitions.get_estimator(scorer_type, save_folder=save_folder)

    # fit the final classifier on folds + holdout
    clf.fit(X_all, y_all)

    # save the model with filename to the specific folder
    filename = scorer_folder_name + ".sav"
    save_model(clf, save_folder, filename)

    # predict on the data the classifier was trained on => should give near perfect score
    if sanity_check:
        # get predicted and actual labels
        y_predicted = clf.predict(X_all)
        predicted = [LABELS[int(a)] for a in y_predicted]
        actual = [LABELS[int(a)] for a in y_all]

        # calc FNC score, normalised by the score of a perfect prediction
        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)
        score = fold_score / max_fold_score

        # calc accuracy, f1 macro
        accuracy_stance = score_calculation.get_accuracy(y_predicted, y_all, stance=True)
        accuracy_related = score_calculation.get_accuracy(y_predicted, y_all, stance=False)
        f1_stance = score_calculation.get_f1score(y_predicted, y_all, stance=True)
        f1_related = score_calculation.get_f1score(y_predicted, y_all, stance=False)

        # printout results
        printout = printout_manager.get_holdout_printout(save_folder, accuracy_related, accuracy_stance,
                                                         f1_related, f1_stance, score)
        print("SANITY CHECK (predict on train data):")
        print(printout)

    return clf
def validate_holdout(Xs, ys, X_holdout, y_holdout, non_bleeding_features, features_dir, scorer_type,
                     feat_indices, result_string, learning_rate_string, n_folds=10):
    """
    Trains the classifier on all of the train+test data and tests it on the holdout set

    :param Xs: All the training data's feature vectors, split in their folds (indexed by 0..n_folds-1)
    :param ys: All the training data's labels, split in their folds (indexed by 0..n_folds-1)
    :param X_holdout: The holdout feature vectors
    :param y_holdout: The holdout labels
    :param non_bleeding_features: The list of non-bleeding features that has to be concatenated to the
                                  existing feature vectors
    :param features_dir: the directory where the features are stored
    :param scorer_type: the scorer type, e.g. MLB_base (see estimator_definitions.py in utils folder)
    :param feat_indices: indices returned by generate_features() method. They indicate at what index of the
                         feature vector a specific feature starts and where it ends. This is used for printing
                         out the feature importances by the RandomForest classifier. When scorer_type is
                         'randomforest', the holdout feature indices are appended to this list in place
    :param result_string: The current result string in order to add the holdout results
    :param learning_rate_string: The current learning rate string in order to add information about the
                                 learning rate
    :param n_folds: Number of folds to stack from Xs and ys. Defaults to 10, the count that used to be
                    hard-coded in this function
    :return: the updated result_string and learning_rate_string
    """
    # NOTE(review): this file contains another definition of validate_holdout further down;
    # Python binds the later one - confirm which copy is meant to stay.

    # folder holding the saved classifiers, located under the parent of this file's directory
    parent_folder = "%s/data/fnc-1/mlp_models/" % (path.dirname(path.dirname(path.abspath(__file__))))

    # the new save folder for this run (obtained from get_save_folder)
    save_folder = get_save_folder(parent_folder, scorer_type + "_new")

    # the save folder is handed to the estimator factory
    best_clf = esitmator_definitions.get_estimator(scorer_type, save_folder=save_folder)

    # stack all the feature vectors of all the folds
    fold_ids = range(n_folds)
    X_train = np.vstack(tuple(Xs[i] for i in fold_ids))
    y_train = np.hstack(tuple(ys[i] for i in fold_ids))

    # concat non-bleeding features
    X_train, X_holdout, feat_indices_holdout = concat_non_bleeding_features(
        X_train, X_holdout,
        non_bleeding_features, features_dir, 'holdout')

    # test for oversampling: fits the current classifier, oversampled with a given
    # method and checks the score on the holdout set (disabled unless the flag below is flipped)
    # NOTE(review): `SMOTE(kind=...)` and `fit_sample` match older imbalanced-learn releases -
    # confirm against the installed version before enabling.
    use_over_sampling = False
    if use_over_sampling:
        from imblearn.over_sampling import SMOTE
        kind = ['regular', 'borderline1', 'borderline2', 'svm']
        for m in kind:
            sm = SMOTE(kind=m)
            X_res, y_res = sm.fit_sample(X_train, y_train)
            best_clf.fit(X_res, y_res)
            y_predicted = best_clf.predict(X_holdout)
            predicted = [LABELS[int(a)] for a in y_predicted]
            actual = [LABELS[int(a)] for a in y_holdout]
            fold_score, _ = score_submission(actual, predicted)
            max_fold_score, _ = score_submission(actual, actual)
            score = fold_score / max_fold_score
            print("Score " + m + ":" + str(score))

    # fit the classifier
    best_clf.fit(X_train, y_train)

    # predict labels
    y_predicted = best_clf.predict(X_holdout)
    predicted = [LABELS[int(a)] for a in y_predicted]
    actual = [LABELS[int(a)] for a in y_holdout]

    # calc FNC score, normalised by the score of a perfect prediction
    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)
    score = fold_score / max_fold_score

    # calc accuracy for related/unrelated and stances
    accuracy_stance = score_calculation.get_accuracy(y_predicted, y_holdout, stance=True)
    accuracy_related = score_calculation.get_accuracy(y_predicted, y_holdout, stance=False)
    f1_stance = score_calculation.get_f1score(y_predicted, y_holdout, stance=True)
    f1_related = score_calculation.get_f1score(y_predicted, y_holdout, stance=False)

    # prepare printout for final results of holdout set
    printout = printout_manager.get_holdout_printout(save_folder, accuracy_related, accuracy_stance,
                                                     f1_related, f1_stance, score)
    print(printout)  # print holdout results
    result_string += printout  # add results to string that is going to be saved into a file

    # test saving and restoring model: restore from the folder the model was just written to,
    # since a hard-coded "<scorer_type>_new_0/" folder could hold a model from an earlier run.
    # NOTE(review): assumes load_model accepts the same folder value as save_model - confirm.
    filename = scorer_type + ".sav"
    save_model(best_clf, save_folder, filename)
    load_clf = load_model(save_folder, filename)
    print_score_from_restored_model(load_clf, X_holdout, y_holdout)

    # add to special file that shows learning rate and loss of optimizer
    if isinstance(best_clf, MultiThreadingFeedForwardMLP):
        learning_rate_string += best_clf.get_learning_rates('holdout') + "\n"

    # print feature importances
    if scorer_type == 'randomforest':
        result_file_folder = "%s" % (path.dirname(path.dirname(path.abspath(__file__))))
        importances = best_clf.feature_importances_
        # spread of each feature's importance across the individual trees
        std = np.std([tree.feature_importances_ for tree in best_clf.estimators_], axis=0)
        # feature positions ordered from most to least important
        indices = np.argsort(importances)[::-1]
        feat_indices.append(feat_indices_holdout)

        # one "index;importance;std" line per feature, preceded by the feature index table
        feat_importance_string = str(feat_indices) + "\n" + "".join(
            str(i) + ";" + str(importances[i]) + ";" + str(std[i]) + "\n" for i in indices)

        # save feature importances as file
        printout_manager.save_file(feat_importance_string, result_file_folder + "/feat_importance_rf.txt", "a+")

    return result_string, learning_rate_string
def cross_validation(fold_stances, folds, Xs, ys, non_bleeding_features, features_dir, scorer_type,
                     all_accuracies_related, all_accuracies_stance, all_f1_related, all_f1_stance, all_scores,
                     result_string, learning_rate_string):
    """
    Trains and evaluates one classifier per fold and collects the per-fold and overall results

    :param fold_stances: iterated to obtain the fold ids; each id selects the test fold of one round
    :param folds: only its length is used, to enumerate the fold positions 0..len(folds)-1
    :param Xs: the feature vectors, indexed by fold id
    :param ys: the labels, indexed by fold id
    :param non_bleeding_features: passed through to concat_non_bleeding_features
    :param features_dir: passed through to concat_non_bleeding_features
    :param scorer_type: the scorer type handed to esitmator_definitions.get_estimator
    :param all_accuracies_related: list that receives the related/unrelated accuracy of every fold (in place)
    :param all_accuracies_stance: list that receives the stance accuracy of every fold (in place)
    :param all_f1_related: list that receives the related/unrelated f1 score of every fold (in place)
    :param all_f1_stance: list that receives the stance f1 score of every fold (in place)
    :param all_scores: list that receives the relative FNC-1 score of every fold (in place)
    :param result_string: The current result string; fold and cross validation printouts are added to it
    :param learning_rate_string: The current learning rate string; extended for MultiThreadingFeedForwardMLP
    :return: the updated result_string and learning_rate_string
    """
    # NOTE(review): this file contains another definition of cross_validation further down;
    # Python binds the later one - confirm which copy is meant to stay.

    def to_label_names(int_labels):
        # translate integer class ids into their LABELS entries
        return [LABELS[int(value)] for value in int_labels]

    top_score = 0

    for fold in fold_stances:
        # all fold positions except position `fold` form the training set
        train_ids = list(range(len(folds)))
        train_ids.pop(fold)

        train_X = np.vstack(tuple(Xs[k] for k in train_ids))
        train_y = np.hstack(tuple(ys[k] for k in train_ids))

        test_X = Xs[fold]
        test_y = ys[fold]

        # append the non-bleeding features for this fold to train and test vectors
        train_X, test_X, _ = concat_non_bleeding_features(
            train_X, test_X,
            non_bleeding_features, features_dir, fold)

        # a fresh estimator for every fold
        estimator = esitmator_definitions.get_estimator(scorer_type)

        started_at = str(datetime.datetime.now()).split('.')[0]
        print("Begin fitting at: " + started_at + "\n")

        estimator.fit(train_X, train_y)

        # predict the test fold and map both predictions and truth to label names
        predicted_int = estimator.predict(test_X)
        predicted_names = to_label_names(predicted_int)
        actual_names = to_label_names(test_y)

        # FNC-1 score relative to the score of a perfect prediction
        achieved, _ = score_submission(actual_names, predicted_names)
        achievable, _ = score_submission(actual_names, actual_names)
        relative_score = achieved / achievable

        # accuracy and f1 scores for stance and related/unrelated
        acc_stance = score_calculation.get_accuracy(predicted_int, test_y, stance=True)
        acc_related = score_calculation.get_accuracy(predicted_int, test_y, stance=False)
        f1_stance = score_calculation.get_f1score(predicted_int, test_y, stance=True)
        f1_related = score_calculation.get_f1score(predicted_int, test_y, stance=False)

        # record this fold's metrics in the caller-provided lists
        all_accuracies_related.append(acc_related)
        all_accuracies_stance.append(acc_stance)
        all_f1_related.append(f1_related)
        all_f1_stance.append(f1_stance)
        all_scores.append(relative_score)

        # keep track of the best fold score seen so far
        top_score = max(top_score, relative_score)

        # printout for this fold goes to stdout and into the result string
        fold_report = printout_manager.get_foldwise_printout(fold, acc_related, acc_stance, f1_related,
                                                             f1_stance, relative_score)
        print(fold_report)
        result_string += fold_report

        # add to special file that shows learning rate and loss of optimizer
        if isinstance(estimator, MultiThreadingFeedForwardMLP):
            learning_rate_string += estimator.get_learning_rates(fold) + "\n"

    # summary over all folds goes to stdout and into the result string
    summary = printout_manager.get_cross_validation_printout(
        all_accuracies_related, all_accuracies_stance, all_f1_related, all_f1_stance, all_scores, top_score)
    print(summary)
    result_string += summary

    return result_string, learning_rate_string
def final_clf_training(Xs, ys, X_holdout, y_holdout, scorer_type, sanity_check=False, n_folds=10):
    """
    Train final classifier on all of the data to prepare it for the prediction of the FNC-1's unlabeled data

    :param Xs: All the training data's feature vectors, split in their folds (indexed by 0..n_folds-1)
    :param ys: All the training data's labels, split in their folds (indexed by 0..n_folds-1)
    :param X_holdout: The holdout feature vectors
    :param y_holdout: The holdout labels
    :param scorer_type: the scorer type, e.g. MLB_base (see estimator_definitions.py in utils folder)
    :param sanity_check: If true, the trained classifier predicts the labels of the data it was trained on
                         and prints out the score
    :param n_folds: Number of folds to stack from Xs and ys. Defaults to 10, the count that used to be
                    hard-coded in this function
    :return: the final classifier
    """
    # NOTE(review): an earlier definition of final_clf_training exists in this file; this later
    # one is what Python binds - confirm which copy is meant to stay.

    # stack all the feature vectors of all the folds
    fold_ids = range(n_folds)
    X_train = np.vstack(tuple(Xs[i] for i in fold_ids))
    y_train = np.hstack(tuple(ys[i] for i in fold_ids))

    # stack the holdout feature vectors on the feature vectors of all folds
    X_all = np.concatenate([X_train, X_holdout], axis=0)
    y_all = np.concatenate([y_train, y_holdout], axis=0)

    # parent folder for all trained classifiers, located under the parent of this file's directory
    parent_folder = "%s/data/fnc-1/mlp_models/" % (path.dirname(
        path.dirname(path.abspath(__file__))))

    # save folder for this specific classifier (obtained from get_save_folder)
    scorer_folder_name = scorer_type + "_final"
    save_folder = get_save_folder(parent_folder, scorer_folder_name + "_new")

    # get classifier; the save folder is handed to the estimator factory
    clf = esitmator_definitions.get_estimator(scorer_type, save_folder=save_folder)

    # fit the final classifier on folds + holdout
    clf.fit(X_all, y_all)

    # save the model with filename to the specific folder
    filename = scorer_folder_name + ".sav"
    save_model(clf, save_folder, filename)

    # predict on the data the classifier was trained on => should give near perfect score
    if sanity_check:
        # get predicted and actual labels
        y_predicted = clf.predict(X_all)
        predicted = [LABELS[int(a)] for a in y_predicted]
        actual = [LABELS[int(a)] for a in y_all]

        # calc FNC score, normalised by the score of a perfect prediction
        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)
        score = fold_score / max_fold_score

        # calc accuracy, f1 macro
        accuracy_stance = score_calculation.get_accuracy(y_predicted, y_all, stance=True)
        accuracy_related = score_calculation.get_accuracy(y_predicted, y_all, stance=False)
        f1_stance = score_calculation.get_f1score(y_predicted, y_all, stance=True)
        f1_related = score_calculation.get_f1score(y_predicted, y_all, stance=False)

        # printout results
        printout = printout_manager.get_holdout_printout(
            save_folder, accuracy_related, accuracy_stance, f1_related, f1_stance, score)
        print("SANITY CHECK:")
        print(printout)

    return clf
def validate_holdout(Xs, ys, X_holdout, y_holdout, non_bleeding_features, features_dir, scorer_type,
                     feat_indices, result_string, learning_rate_string, n_folds=10):
    """
    Trains the classifier on all of the train+test data and tests it on the holdout set

    :param Xs: All the training data's feature vectors, split in their folds (indexed by 0..n_folds-1)
    :param ys: All the training data's labels, split in their folds (indexed by 0..n_folds-1)
    :param X_holdout: The holdout feature vectors
    :param y_holdout: The holdout labels
    :param non_bleeding_features: The list of non-bleeding features that has to be concatenated to the
                                  existing feature vectors
    :param features_dir: the directory where the features are stored
    :param scorer_type: the scorer type, e.g. MLB_base (see estimator_definitions.py in utils folder)
    :param feat_indices: indices returned by generate_features() method. They indicate at what index of the
                         feature vector a specific feature starts and where it ends. This is used for printing
                         out the feature importances by the RandomForest classifier. When scorer_type is
                         'randomforest', the holdout feature indices are appended to this list in place
    :param result_string: The current result string in order to add the holdout results
    :param learning_rate_string: The current learning rate string in order to add information about the
                                 learning rate
    :param n_folds: Number of folds to stack from Xs and ys. Defaults to 10, the count that used to be
                    hard-coded in this function
    :return: the updated result_string and learning_rate_string
    """
    # NOTE(review): an earlier definition of validate_holdout exists in this file; this later
    # one is what Python binds - confirm which copy is meant to stay.

    # folder holding the saved classifiers, located under the parent of this file's directory
    parent_folder = "%s/data/fnc-1/mlp_models/" % (path.dirname(
        path.dirname(path.abspath(__file__))))

    # the new save folder for this run (obtained from get_save_folder)
    save_folder = get_save_folder(parent_folder, scorer_type + "_new")

    # the save folder is handed to the estimator factory
    best_clf = esitmator_definitions.get_estimator(scorer_type, save_folder=save_folder)

    # stack all the feature vectors of all the folds
    fold_ids = range(n_folds)
    X_train = np.vstack(tuple(Xs[i] for i in fold_ids))
    y_train = np.hstack(tuple(ys[i] for i in fold_ids))

    # concat non-bleeding features
    X_train, X_holdout, feat_indices_holdout = concat_non_bleeding_features(
        X_train, X_holdout,
        non_bleeding_features, features_dir, 'holdout')

    # test for oversampling: fits the current classifier, oversampled with a given
    # method and checks the score on the holdout set (disabled unless the flag below is flipped)
    # NOTE(review): `SMOTE(kind=...)` and `fit_sample` match older imbalanced-learn releases -
    # confirm against the installed version before enabling.
    use_over_sampling = False
    if use_over_sampling:
        from imblearn.over_sampling import SMOTE
        kind = ['regular', 'borderline1', 'borderline2', 'svm']
        for m in kind:
            sm = SMOTE(kind=m)
            X_res, y_res = sm.fit_sample(X_train, y_train)
            best_clf.fit(X_res, y_res)
            y_predicted = best_clf.predict(X_holdout)
            predicted = [LABELS[int(a)] for a in y_predicted]
            actual = [LABELS[int(a)] for a in y_holdout]
            fold_score, _ = score_submission(actual, predicted)
            max_fold_score, _ = score_submission(actual, actual)
            score = fold_score / max_fold_score
            print("Score " + m + ":" + str(score))

    # fit the classifier
    best_clf.fit(X_train, y_train)

    # predict labels
    y_predicted = best_clf.predict(X_holdout)
    predicted = [LABELS[int(a)] for a in y_predicted]
    actual = [LABELS[int(a)] for a in y_holdout]

    # calc FNC score, normalised by the score of a perfect prediction
    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)
    score = fold_score / max_fold_score

    # calc accuracy for related/unrelated and stances
    accuracy_stance = score_calculation.get_accuracy(y_predicted, y_holdout, stance=True)
    accuracy_related = score_calculation.get_accuracy(y_predicted, y_holdout, stance=False)
    f1_stance = score_calculation.get_f1score(y_predicted, y_holdout, stance=True)
    f1_related = score_calculation.get_f1score(y_predicted, y_holdout, stance=False)

    # prepare printout for final results of holdout set
    printout = printout_manager.get_holdout_printout(save_folder, accuracy_related, accuracy_stance,
                                                     f1_related, f1_stance, score)
    print(printout)  # print holdout results
    result_string += printout  # add results to string that is going to be saved into a file

    # test saving and restoring model: restore from the folder the model was just written to.
    # The previous hard-coded "<scorer_type>_0/" folder did not match the "_new" base name used
    # for the save folder and could point at a model from another run.
    # NOTE(review): assumes load_model accepts the same folder value as save_model - confirm.
    filename = scorer_type + ".sav"
    save_model(best_clf, save_folder, filename)
    load_clf = load_model(save_folder, filename)
    print_score_from_restored_model(load_clf, X_holdout, y_holdout)

    # add to special file that shows learning rate and loss of optimizer
    if isinstance(best_clf, MultiThreadingFeedForwardMLP):
        learning_rate_string += best_clf.get_learning_rates('holdout') + "\n"

    # print feature importances
    if scorer_type == 'randomforest':
        result_file_folder = "%s" % (path.dirname(
            path.dirname(path.abspath(__file__))))
        importances = best_clf.feature_importances_
        # spread of each feature's importance across the individual trees
        std = np.std(
            [tree.feature_importances_ for tree in best_clf.estimators_],
            axis=0)
        # feature positions ordered from most to least important
        indices = np.argsort(importances)[::-1]
        feat_indices.append(feat_indices_holdout)

        # one "index;importance;std" line per feature, preceded by the feature index table
        feat_importance_string = str(feat_indices) + "\n" + "".join(
            str(i) + ";" + str(importances[i]) + ";" + str(std[i]) + "\n" for i in indices)

        # save feature importances as file
        printout_manager.save_file(
            feat_importance_string,
            result_file_folder + "/feat_importance_rf.txt", "a+")

    return result_string, learning_rate_string
def cross_validation(fold_stances, folds, Xs, ys, non_bleeding_features, features_dir, scorer_type,
                     all_accuracies_related, all_accuracies_stance, all_f1_related, all_f1_stance, all_scores,
                     result_string, learning_rate_string):
    """
    Trains and evaluates one classifier per fold and collects the per-fold and overall results

    :param fold_stances: iterated to obtain the fold ids; each id selects the test fold of one round
    :param folds: only its length is used, to enumerate the fold positions 0..len(folds)-1
    :param Xs: the feature vectors, indexed by fold id
    :param ys: the labels, indexed by fold id
    :param non_bleeding_features: passed through to concat_non_bleeding_features
    :param features_dir: passed through to concat_non_bleeding_features
    :param scorer_type: the scorer type handed to esitmator_definitions.get_estimator
    :param all_accuracies_related: list that receives the related/unrelated accuracy of every fold (in place)
    :param all_accuracies_stance: list that receives the stance accuracy of every fold (in place)
    :param all_f1_related: list that receives the related/unrelated f1 score of every fold (in place)
    :param all_f1_stance: list that receives the stance f1 score of every fold (in place)
    :param all_scores: list that receives the relative FNC-1 score of every fold (in place)
    :param result_string: The current result string; fold and cross validation printouts are added to it
    :param learning_rate_string: The current learning rate string; extended for MultiThreadingFeedForwardMLP
    :return: the updated result_string and learning_rate_string
    """
    # NOTE(review): an earlier definition of cross_validation exists in this file; this later
    # one is what Python binds - confirm which copy is meant to stay.

    def label_names(class_ids):
        # translate integer class ids into their LABELS entries
        return [LABELS[int(class_id)] for class_id in class_ids]

    highest_score = 0

    for fold in fold_stances:
        # all fold positions except position `fold` form the training set
        remaining = list(range(len(folds)))
        remaining.pop(fold)

        X_fit = np.vstack(tuple(Xs[pos] for pos in remaining))
        y_fit = np.hstack(tuple(ys[pos] for pos in remaining))

        X_eval = Xs[fold]
        y_eval = ys[fold]

        # append the non-bleeding features for this fold to train and test vectors
        X_fit, X_eval, _ = concat_non_bleeding_features(
            X_fit, X_eval,
            non_bleeding_features, features_dir, fold)

        # a fresh estimator for every fold
        model = esitmator_definitions.get_estimator(scorer_type)

        timestamp = str(datetime.datetime.now()).split('.')[0]
        print("Begin fitting at: " + timestamp + "\n")

        model.fit(X_fit, y_fit)

        # predict the test fold and map both predictions and truth to label names
        predicted_int = model.predict(X_eval)
        predicted = label_names(predicted_int)
        actual = label_names(y_eval)

        # FNC-1 score relative to the score of a perfect prediction
        raw_score, _ = score_submission(actual, predicted)
        perfect_score, _ = score_submission(actual, actual)
        fold_result = raw_score / perfect_score

        # accuracy and f1 scores for stance and related/unrelated
        stance_accuracy = score_calculation.get_accuracy(predicted_int, y_eval, stance=True)
        related_accuracy = score_calculation.get_accuracy(predicted_int, y_eval, stance=False)
        stance_f1 = score_calculation.get_f1score(predicted_int, y_eval, stance=True)
        related_f1 = score_calculation.get_f1score(predicted_int, y_eval, stance=False)

        # record this fold's metrics in the caller-provided lists
        all_accuracies_related.append(related_accuracy)
        all_accuracies_stance.append(stance_accuracy)
        all_f1_related.append(related_f1)
        all_f1_stance.append(stance_f1)
        all_scores.append(fold_result)

        # keep track of the best fold score seen so far
        highest_score = max(highest_score, fold_result)

        # printout for this fold goes to stdout and into the result string
        fold_printout = printout_manager.get_foldwise_printout(
            fold, related_accuracy, stance_accuracy, related_f1, stance_f1, fold_result)
        print(fold_printout)
        result_string += fold_printout

        # add to special file that shows learning rate and loss of optimizer
        if isinstance(model, MultiThreadingFeedForwardMLP):
            learning_rate_string += model.get_learning_rates(fold) + "\n"

    # summary over all folds goes to stdout and into the result string
    overall_printout = printout_manager.get_cross_validation_printout(
        all_accuracies_related, all_accuracies_stance, all_f1_related,
        all_f1_stance, all_scores, highest_score)
    print(overall_printout)
    result_string += overall_printout

    return result_string, learning_rate_string