コード例 #1
0
def score_chunks_tertiary(X, id_num):
    """
    Score the input with the tertiary model stored under ``id_num``.

    The input is first run through the secondary scorer. The stored tertiary
    classifier list and feature pipeline are then loaded and applied, and the
    ensemble prediction and score are written to the
    'predicted_tertiary_class' and 'predicted_tertiary_score' columns.

    Returns the result of add_final_tertiary_classes on the scored data.
    """

    # tertiary scoring works on the output of the secondary stage
    scored = secondary_multiple_events.score_chunks_secondary(X, id_num)

    # the model is persisted as a list of classifiers plus their names
    classifiers, classifier_names = ml_utils.load_classifier_list(
        id_num, 'tertiary')

    # fetch the feature pipeline saved for this model id
    pipeline_path = FEATURE_DIR + "tertiary_" + str(id_num)
    pipeline = ml_utils.load_classifier(pipeline_path)

    # run the scored data through the feature pipeline
    features = tertiary_train_model.get_tertiary_testing_features(
        scored, pipeline)

    # last argument is None -- presumably the true-labels slot, which is
    # not available when scoring (TODO confirm against ml_utils)
    results, classes = ml_utils.get_classifier_results(
        classifiers, classifier_names, features, None)

    predicted_class, predicted_score = (
        tertiary_train_model.get_ensemble_prediction(results, classes))
    scored['predicted_tertiary_class'] = predicted_class
    scored['predicted_tertiary_score'] = predicted_score

    finalised = tertiary_train_model.add_final_tertiary_classes(scored)
    return finalised
コード例 #2
0
def main():
    """
    Train a new tertiary model for the id given on the command line.

    Reads the secondary dataset pickle for that id, splits it 60/40 into
    train and test portions, builds the tertiary feature pipeline, runs the
    classifier grid search, and evaluates the ensemble on the test portion.
    The feature pipeline, the classifiers and the decorated test set are
    saved; accuracy is logged and the precision/recall matrix is printed.
    """

    started_at = time.time()
    logger.info("Training new tertiary model...")

    cli_args = parser.parse_args()
    id_num = cli_args.id

    dataset_path = OUTPUT_DIR + "_secondary_dataset_" + str(id_num) + ".pkl"
    df = pd.read_pickle(dataset_path)

    # target column and the input columns handed to the feature pipeline
    labels = df['true_tertiary_class']
    input_columns = [
        'subject',
        'text',
        'event_type',
        'final_primary',
        'final_secondary',
        'predicted_primary_score',
        'predicted_secondary_score',
    ]
    inputs = df[input_columns]

    logger.info("Size of X input: %s", inputs.shape)

    # hold out 40% of the rows for evaluation
    train_inputs, test_inputs, train_labels, test_labels = train_test_split(
        inputs, labels, test_size=0.4)

    train_matrix, feature_pipeline = get_tertiary_training_features(
        train_inputs, train_labels, TERTIARY_FEATURE_PIPELINE)
    test_matrix = get_tertiary_testing_features(test_inputs, feature_pipeline)

    save_feature_pipeline(feature_pipeline, 'tertiary', id_num)

    classifiers, classifier_names = grid_search(
        train_matrix, train_labels, CLASSIFIER_PIPELINE)

    results, classes = get_classifier_results(
        classifiers, classifier_names, test_matrix, test_labels)
    save_classifier(classifiers, 'tertiary', id_num)

    predicted, scores = get_ensemble_prediction(results, classes)
    _check_prediction_dimensions(test_labels, predicted, scores)

    # attach truth, prediction and score to the held-out rows, then derive
    # the final tertiary class column used for evaluation below
    decorated = _decorate_with_tertiary(
        test_inputs, test_labels, predicted, scores)
    decorated = add_final_tertiary_classes(decorated)

    # Report how well the final tertiary classes match the true ones
    logger.info("------------------------------------------------")
    accuracy = accuracy_score(decorated.true_tertiary_class,
                              decorated.final_tertiary)
    logger.info("Overall Accuracy Tertiary: %s", accuracy)
    report = precision_recall_matrix(decorated.true_tertiary_class,
                                     decorated.final_tertiary, classes)
    print(report)

    logger.info("Storing Tertiary model output and saving...")
    # the saved test set carries the true tertiary classes
    save_dataset(decorated, 'tertiary', id_num)

    logger.info('Completed training of tertiary model %s', id_num)
    elapsed = time.time() - started_at
    logger.debug("Elapsed time: %s", elapsed)
コード例 #3
0
def main():
    """
    Train a new primary model.

    Splits the primary inputs 60/40 into train and test portions, builds the
    primary feature pipeline, runs the classifier grid search, and evaluates
    the ensemble on the test portion. The feature pipeline, the classifiers
    and the decorated test set are saved under the id returned by
    save_feature_pipeline; accuracy is logged and the precision/recall
    matrix is printed.
    """

    started_at = time.time()
    logger.info("Training new primary model...")

    X, y = get_primary_inputs()

    # X.index is split alongside the data so the test rows can be used to
    # look up their secondary and tertiary labels afterwards
    split = train_test_split(X, y.p_class, X.index, test_size=0.4)
    train_inputs, test_inputs, train_labels, test_labels, _, test_index = split

    # kept for the later (secondary / tertiary) models
    secondary_labels = y.s_class[test_index]
    tertiary_labels = y.t_class[test_index]

    train_matrix, feature_pipeline = get_primary_training_features(
        train_inputs, train_labels, PRIMARY_FEATURE_PIPELINE)
    test_matrix = get_primary_testing_features(test_inputs, feature_pipeline)

    # passing None as the id: the id used from here on is whatever
    # save_feature_pipeline returns
    id_num = save_feature_pipeline(feature_pipeline, 'primary', None)

    classifiers, classifier_names = grid_search(
        train_matrix, train_labels, CLASSIFIER_PIPELINE)

    results, classes = get_classifier_results(
        classifiers, classifier_names, test_matrix, test_labels)
    save_classifier(classifiers, 'primary', id_num)

    predicted, scores = get_ensemble_prediction(results, classes)
    _check_prediction_dimensions(test_labels, predicted, scores)

    # Report how well the ensemble predictions match the true classes
    logger.info("------------------------------------------------")
    accuracy = accuracy_score(test_labels, predicted)
    logger.info("Overall Accuracy Primary: %s", accuracy)
    report = precision_recall_matrix(test_labels, predicted, classes)
    print(report)

    logger.info("Storing Primary model output and saving...")
    # the saved test set also carries the true secondary/tertiary classes
    decorated = decorate_with_primary(test_inputs, test_labels, predicted,
                                      scores, secondary_labels,
                                      tertiary_labels)
    save_dataset(decorated, 'primary', id_num)

    logger.info('Completed training of primary model %s', id_num)
    elapsed = time.time() - started_at
    logger.debug("Elapsed time: %s", elapsed)
コード例 #4
0
def score_chunks_primary(X, id_num):
    """
    Score the input with the primary model stored under ``id_num``.

    Loads the stored primary classifier list and feature pipeline, runs the
    input through the pipeline and collects the classifier results.

    Returns a (prediction, score) pair taken from the ensemble prediction.
    """

    # the model is persisted as a list of classifiers plus their names
    classifiers, classifier_names = ml_utils.load_classifier_list(
        id_num, 'primary')

    # fetch the feature pipeline saved for this model id
    pipeline_path = FEATURE_DIR + "primary_" + str(id_num)
    pipeline = ml_utils.load_classifier(pipeline_path)

    # run the input through the feature pipeline
    features = primary_train_model.get_primary_testing_features(X, pipeline)

    # last argument is None -- presumably the true-labels slot, which is
    # not available when scoring (TODO confirm against ml_utils)
    results, classes = ml_utils.get_classifier_results(
        classifiers, classifier_names, features, None)

    prediction, score = primary_train_model.get_ensemble_prediction(
        results, classes)

    return prediction, score