# Example no. 1
def final_clf_training(Xs, ys, X_holdout, y_holdout, scorer_type, sanity_check=False):
    """
    Train the final classifier on all of the data (every fold plus the holdout set)
    to prepare it for the prediction of the FNC-1's unlabeled data, then save it.

    :param Xs: All the training data's feature vectors, split in their folds
               (indexed by the integer fold number, starting at 0)
    :param ys: All the training data's labels, split in their folds
               (indexed the same way as Xs)
    :param X_holdout: The holdout feature vectors
    :param y_holdout: The holdout labels
    :param scorer_type: the scorer type, e.g. MLB_base (see in utils folder)
    :param sanity_check: If true, the trained classifier predicts the labels of the
                         data it was trained on and prints out the score
    :return: the final (fitted) classifier
    """
    # Stack the feature vectors and labels of every fold. The number of folds is
    # taken from Xs instead of being hard-coded, so any fold count is supported.
    n_folds = len(Xs)
    X_train = np.vstack([Xs[i] for i in range(n_folds)])
    y_train = np.hstack([ys[i] for i in range(n_folds)])

    # Append the holdout data so the final model is trained on everything available.
    X_all = np.concatenate([X_train, X_holdout], axis=0)
    y_all = np.concatenate([y_train, y_holdout], axis=0)

    # Parent folder that all trained classifiers are saved into; it is resolved
    # relative to the directory two levels above this file.
    parent_folder = "%s/data/fnc-1/mlp_models/" % (path.dirname(path.dirname(path.abspath(__file__))))

    # Obtain the save folder for this specific classifier.
    scorer_folder_name = scorer_type + "_final"
    save_folder = get_save_folder(parent_folder, scorer_folder_name + "_new")

    # Build the classifier for the requested scorer type, handing it the save folder.
    clf = esitmator_definitions.get_estimator(scorer_type, save_folder=save_folder)

    # Fit the final classifier on all data. This must happen before saving,
    # otherwise an untrained model is persisted and returned.
    clf.fit(X_all, y_all)

    # Save the fitted model under its file name in the classifier's save folder.
    filename = scorer_folder_name + ".sav"
    save_model(clf, save_folder, filename)

    # Predict on the data the classifier was trained on => should give a near perfect score.
    if sanity_check:
        # Map predicted and actual label indices to their label names.
        y_predicted = clf.predict(X_all)
        predicted = [LABELS[int(a)] for a in y_predicted]
        actual = [LABELS[int(a)] for a in y_all]

        # FNC score relative to the best achievable score (actual vs. actual).
        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)
        score = fold_score / max_fold_score

        # Accuracy and F1 for the stance and related settings.
        accuracy_stance = score_calculation.get_accuracy(y_predicted, y_all, stance=True)
        accuracy_related = score_calculation.get_accuracy(y_predicted, y_all, stance=False)
        f1_stance = score_calculation.get_f1score(y_predicted, y_all, stance=True)
        f1_related = score_calculation.get_f1score(y_predicted, y_all, stance=False)

        # Build the report and actually print it.
        # NOTE(review): assumes get_holdout_printout returns printable text - confirm.
        printout = printout_manager.get_holdout_printout(save_folder, accuracy_related, accuracy_stance, f1_related,
                                                         f1_stance, score)
        print("SANITY CHECK (predict on train data):")
        print(printout)
    return clf
