def score_chunks_tertiary(X, id_num):
    """
    Score events with the stored tertiary model identified by ``id_num``.

    The input is first run through the secondary scoring stage
    (``secondary_multiple_events.score_chunks_secondary``). The result is
    then transformed with the saved tertiary feature pipeline and scored
    by the saved list of tertiary classifiers, whose outputs are combined
    into one ensemble prediction.

    Args:
        X: events to score. Assumed to be a pandas DataFrame, since
            columns are assigned by name on the secondary-stage output
            (TODO confirm against callers).
        id_num: identifier of the stored model; used to locate both the
            classifier list and the feature pipeline.

    Returns:
        The output of ``tertiary_train_model.add_final_tertiary_classes``
        applied to the scored data, which carries the added columns
        ``predicted_tertiary_class`` and ``predicted_tertiary_score``.
    """
    # The tertiary stage works on the output of the secondary stage.
    scored = secondary_multiple_events.score_chunks_secondary(X, id_num)

    # The classifier is persisted as a list of classifiers plus their names.
    classifiers, classifier_names = ml_utils.load_classifier_list(
        id_num, 'tertiary')

    # The feature pipeline is stored separately, keyed by the same id.
    pipeline_path = FEATURE_DIR + "tertiary_" + str(id_num)
    pipeline = ml_utils.load_classifier(pipeline_path)

    # Turn the scored data into the feature representation.
    features = tertiary_train_model.get_tertiary_testing_features(
        scored, pipeline)

    # No true labels are passed at scoring time, hence the trailing None.
    results, classes = ml_utils.get_classifier_results(
        classifiers, classifier_names, features, None)
    predicted_class, predicted_score = (
        tertiary_train_model.get_ensemble_prediction(results, classes))

    scored['predicted_tertiary_class'] = predicted_class
    scored['predicted_tertiary_score'] = predicted_score

    return tertiary_train_model.add_final_tertiary_classes(scored)
def main():
    """
    Train, evaluate and persist a new tertiary model.

    Steps, in order:
      1. Read the model id from the command-line arguments (``args.id``).
      2. Load the pickled secondary dataset for that id.
      3. Split it into train/test partitions (``test_size=0.4``).
      4. Build training and testing features and save the feature pipeline.
      5. Grid-search the classifiers, collect their results on the test
         partition and save the classifier list.
      6. Combine the results into an ensemble prediction, attach it to the
         test partition and derive the final tertiary classes.
      7. Log the overall accuracy, print the precision/recall matrix and
         save the decorated test partition.
    """
    began_at = time.time()
    logger.info("Training new tertiary model...")

    id_num = parser.parse_args().id

    # NOTE(review): read_pickle should only ever be pointed at trusted files.
    dataset_path = OUTPUT_DIR + "_secondary_dataset_" + str(id_num) + ".pkl"
    df = pd.read_pickle(dataset_path)

    y = df['true_tertiary_class']
    input_columns = [
        'subject',
        'text',
        'event_type',
        'final_primary',
        'final_secondary',
        'predicted_primary_score',
        'predicted_secondary_score',
    ]
    X = df[input_columns]
    logger.info("Size of X input: %s", X.shape)

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # Build the training features first; the same pipeline is then reused
    # for the test partition.
    train_features, feature_pipeline = get_tertiary_training_features(
        x_train, y_train, TERTIARY_FEATURE_PIPELINE)
    test_features = get_tertiary_testing_features(x_test, feature_pipeline)
    save_feature_pipeline(feature_pipeline, 'tertiary', id_num)

    classifiers, classifier_names = grid_search(
        train_features, y_train, CLASSIFIER_PIPELINE)
    results, classes = get_classifier_results(
        classifiers, classifier_names, test_features, y_test)
    save_classifier(classifiers, 'tertiary', id_num)

    y_pred, y_score = get_ensemble_prediction(results, classes)
    _check_prediction_dimensions(y_test, y_pred, y_score)

    x_test = _decorate_with_tertiary(x_test, y_test, y_pred, y_score)
    x_test = add_final_tertiary_classes(x_test)

    # Report the accuracy of the model on the test partition.
    logger.info("------------------------------------------------")
    overall_accuracy = accuracy_score(x_test.true_tertiary_class,
                                      x_test.final_tertiary)
    logger.info("Overall Accuracy Tertiary: %s", overall_accuracy)
    report = precision_recall_matrix(x_test.true_tertiary_class,
                                     x_test.final_tertiary, classes)
    print(report)

    logger.info("Storing Tertiary model output and saving...")
    # The saved dataset includes the true tertiary classes.
    save_dataset(x_test, 'tertiary', id_num)

    logger.info('Completed training of tertiary model %s', id_num)
    elapsed = time.time() - began_at
    logger.debug("Elapsed time: %s", elapsed)
def main():
    """
    Train, evaluate and persist a new primary model.

    Steps, in order:
      1. Fetch the inputs and labels via ``get_primary_inputs``.
      2. Split into train/test partitions (``test_size=0.4``), also
         splitting the index so the secondary and tertiary labels of the
         test rows can be looked up.
      3. Build training and testing features and save the feature
         pipeline; saving with ``None`` returns the id used for all
         later artefacts.
      4. Grid-search the classifiers, collect their results on the test
         partition and save the classifier list.
      5. Combine the results into an ensemble prediction, log the overall
         accuracy and print the precision/recall matrix.
      6. Decorate the test partition with predictions and the true
         secondary/tertiary labels, then save it.
    """
    began_at = time.time()
    logger.info("Training new primary model...")

    X, y = get_primary_inputs()

    # Only the test-side index is needed; the train-side index is discarded.
    x_train, x_test, y_train, y_test, _, test_index = train_test_split(
        X, y.p_class, X.index, test_size=0.4)

    # Keep the lower-level labels of the test rows for the future models.
    y_secondary = y.s_class[test_index]
    y_tertiary = y.t_class[test_index]

    train_features, feature_pipeline = get_primary_training_features(
        x_train, y_train, PRIMARY_FEATURE_PIPELINE)
    test_features = get_primary_testing_features(x_test, feature_pipeline)
    id_num = save_feature_pipeline(feature_pipeline, 'primary', None)

    classifiers, classifier_names = grid_search(
        train_features, y_train, CLASSIFIER_PIPELINE)
    results, classes = get_classifier_results(
        classifiers, classifier_names, test_features, y_test)
    save_classifier(classifiers, 'primary', id_num)

    y_pred, y_score = get_ensemble_prediction(results, classes)
    _check_prediction_dimensions(y_test, y_pred, y_score)

    # Report the accuracy of the model on the test partition.
    logger.info("------------------------------------------------")
    overall_accuracy = accuracy_score(y_test, y_pred)
    logger.info("Overall Accuracy Primary: %s", overall_accuracy)
    report = precision_recall_matrix(y_test, y_pred, classes)
    print(report)

    logger.info("Storing Primary model output and saving...")
    # The saved dataset carries the true secondary and tertiary classes.
    x_test = decorate_with_primary(
        x_test, y_test, y_pred, y_score, y_secondary, y_tertiary)
    save_dataset(x_test, 'primary', id_num)

    logger.info('Completed training of primary model %s', id_num)
    elapsed = time.time() - began_at
    logger.debug("Elapsed time: %s", elapsed)
def score_chunks_primary(X, id_num):
    """
    Score events with the stored primary model identified by ``id_num``.

    The saved primary feature pipeline transforms ``X`` into features,
    the saved list of primary classifiers scores them, and the individual
    results are combined into one ensemble prediction.

    Args:
        X: events to score, in whatever form
            ``primary_train_model.get_primary_testing_features`` accepts.
        id_num: identifier of the stored model; used to locate both the
            classifier list and the feature pipeline.

    Returns:
        A ``(y_pred, y_score)`` tuple as produced by
        ``primary_train_model.get_ensemble_prediction``.
    """
    # The classifier is persisted as a list of classifiers plus their names.
    classifiers, classifier_names = ml_utils.load_classifier_list(
        id_num, 'primary')

    # The feature pipeline is stored separately, keyed by the same id.
    pipeline_path = FEATURE_DIR + "primary_" + str(id_num)
    pipeline = ml_utils.load_classifier(pipeline_path)

    # Turn the raw input into the feature representation.
    features = primary_train_model.get_primary_testing_features(X, pipeline)

    # No true labels are passed at scoring time, hence the trailing None.
    results, classes = ml_utils.get_classifier_results(
        classifiers, classifier_names, features, None)

    predicted_class, predicted_score = (
        primary_train_model.get_ensemble_prediction(results, classes))
    return predicted_class, predicted_score