def main():
    df = pandas.read_csv(args.input_filename, index_col=False, header=0)
    data = df.values
    column_names = df.columns.values.tolist()

    # Extract features/labels and their names from the raw data.
    feature_names = column_names[0:args.label_column]
    label_name = column_names[args.label_column]
    features = data[:, 0:args.label_column]
    labels = data[:, args.label_column].astype(int)

    orig_train_features, orig_test_features, train_labels, test_labels = (
        model_selection.train_test_split(features, labels, test_size=args.test_size))

    (model, train_features, train_labels, test_features) = models.train_knn(
        orig_train_features, train_labels, orig_test_features, args.imbalanced_data,
        args.train_size, args.scaling_method, args.minmax_min, args.minmax_max,
        args.skip_feature_selection, args.skip_grid_search, args.num_neighbors,
        args.weights, args.algorithm, args.metric, args.num_jobs)

    # Report accuracy.
    y_true, y_pred = test_labels, model.predict(test_features)
    predicted_probabilities = model.predict_proba(test_features)
    print("Test Accuracy: %0.2f%%" % (model.score(test_features, test_labels) * 100.))
    print('AUC score: %0.5f' % roc_auc_score(y_true, predicted_probabilities[:, 1]))

    # Full report.
    print("\n*****************************\n")
    class_labels = [0, 1]
    target_names = ["female", "male"]
    print(classification_report(y_true, y_pred, labels=class_labels,
                                target_names=target_names))

    # Now perform the evaluation on the test data at different probability thresholds.
    # The idea is to report the accuracy only for points whose predicted probability
    # for either label is above the specified threshold.
    utils.print_threshold_metrics(predicted_probabilities, y_true, class_labels)

    # Write the test features along with a last bit indicating whether the prediction
    # was correct.
    if args.output_filename_prefix:
        utils.write_data_predictions(args.output_filename_prefix, orig_test_features,
                                     feature_names, y_true, y_pred)
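# The helper below is a hedged sketch, not the project's actual utils.print_threshold_metrics.
# Based on the comment above, it reports accuracy and coverage when predictions are kept only
# if the winning class probability clears each threshold. The function name, the threshold
# list, and the assumption that predict_proba columns follow class_labels order are all
# illustrative assumptions.
import numpy


def print_threshold_metrics_sketch(predicted_probabilities, y_true, class_labels):
    """Report accuracy over the points whose top predicted probability exceeds each
    threshold, along with the fraction of test points retained."""
    predicted_probabilities = numpy.asarray(predicted_probabilities)
    y_true = numpy.asarray(y_true)
    # Assumes the probability columns are ordered the same way as class_labels.
    y_pred = numpy.array(class_labels)[predicted_probabilities.argmax(axis=1)]
    top_probability = predicted_probabilities.max(axis=1)
    for threshold in [0.5, 0.6, 0.7, 0.8, 0.9]:
        confident = top_probability >= threshold
        if not confident.any():
            print('threshold %.2f: no points retained' % threshold)
            continue
        accuracy = 100.0 * numpy.mean(y_pred[confident] == y_true[confident])
        coverage = 100.0 * numpy.mean(confident)
        print('threshold %.2f: accuracy %.2f%% on %.2f%% of test points'
              % (threshold, accuracy, coverage))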
def compute_trial_metrics(data):
    features = data[0]
    labels = data[1]
    train_sizes = data[2]
    random_seed = data[3]
    random.seed(random_seed)

    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features, labels, test_size=args.test_size,
                                         random_state=random.randint(1, 99999999)))

    # Mapping from train size to any of "accuracy", "precision", ... to a value.
    trial_metrics = defaultdict(dict)
    for train_size in train_sizes:
        if args.learning_algorithm == 'random-forest':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_random_forest(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 train_size, args.skip_feature_selection, args.skip_grid_search,
                 args.rf_max_features, args.rf_num_trees, args.rf_criterion,
                 args.rf_min_samples_split, args.rf_min_samples_leaf, 1)
        elif args.learning_algorithm == 'svm':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_svm(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 train_size, 'minmax', 0, 1, args.skip_feature_selection,
                 args.skip_grid_search, args.svm_kernel, args.svm_gamma,
                 args.svm_cost, args.svm_degree, 1)
        elif args.learning_algorithm == 'logistic':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_logistic(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 train_size, args.skip_feature_selection, args.skip_grid_search,
                 args.logistic_penalty, args.logistic_cost, args.logistic_dual,
                 args.logistic_tolerance, 1)
        elif args.learning_algorithm == 'knn':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_knn(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 train_size, 'minmax', 0, 1, args.skip_feature_selection,
                 args.skip_grid_search, args.knn_num_neighbors, args.knn_weights,
                 args.knn_algorithm, args.knn_metric, 1)
        else:
            sys.exit('bad algorithm name.')

        y_true, y_pred = test_labels, model.predict(transformed_test_features)

        # Size of labels in train/test.
        test_size = transformed_test_features.shape[0]
        test_female_size = sum(test_labels[:] == utils.FEMALE)
        test_male_size = sum(test_labels[:] == utils.MALE)

        # Compute evaluation metrics.
        test_accuracy = model.score(transformed_test_features, test_labels) * 100.
        (precisions, recalls, fscores, supports) = precision_recall_fscore_support(
            y_true=y_true, y_pred=y_pred, labels=[0, 1])

        # Get true/false positive/negative.
        confusion = confusion_matrix(y_true, y_pred)
        test_true_female = confusion[utils.FEMALE][utils.FEMALE]
        test_false_female = confusion[utils.MALE][utils.FEMALE]
        test_true_male = confusion[utils.MALE][utils.MALE]
        test_false_male = confusion[utils.FEMALE][utils.MALE]

        trial_metrics[train_size]["train_size"] = train_size
        trial_metrics[train_size]["test_size"] = test_size
        trial_metrics[train_size]["test_female_size"] = test_female_size
        trial_metrics[train_size]["test_male_size"] = test_male_size
        trial_metrics[train_size]["test_true_female"] = test_true_female
        trial_metrics[train_size]["test_false_female"] = test_false_female
        trial_metrics[train_size]["test_true_male"] = test_true_male
        trial_metrics[train_size]["test_false_male"] = test_false_male
        trial_metrics[train_size]["test_accuracy"] = test_accuracy
        trial_metrics[train_size]["test_female_precision"] = precisions[utils.FEMALE]
        trial_metrics[train_size]["test_male_precision"] = precisions[utils.MALE]
        trial_metrics[train_size]["test_female_recall"] = recalls[utils.FEMALE]
        trial_metrics[train_size]["test_male_recall"] = recalls[utils.MALE]

    return trial_metrics
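# Hedged sketch (an assumption, not code from this repo): compute_trial_metrics takes a single
# packed `data` tuple and seeds its own RNG, which suggests it is meant to be mapped over
# worker processes, one call per trial. A plausible driver, with hypothetical `num_trials` and
# `num_processes` parameters, could look like this:
import multiprocessing


def run_trials_sketch(features, labels, train_sizes, num_trials, num_processes):
    # Give every trial its own seed so subprocesses do not draw identical splits.
    trial_inputs = [(features, labels, train_sizes, seed) for seed in range(num_trials)]
    pool = multiprocessing.Pool(processes=num_processes)
    per_trial_metrics = pool.map(compute_trial_metrics, trial_inputs)
    pool.close()
    pool.join()
    return per_trial_metrics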
def compute_trial_metrics(data):
    female_features = data[0]
    female_labels = data[1]
    male_features = data[2]
    male_labels = data[3]
    test_actual_ratios = data[4]
    random_seed = data[5]
    random.seed(random_seed)

    num_males = male_features.shape[0]
    num_females = female_features.shape[0]

    # Make sure you seed the random state: each subprocess receives the same state, so
    # without reseeding all random numbers would become identical!
    numpy.random.seed(random_seed)

    # Mapping from test ratio to any of "accuracy", "precision", ... to a value.
    trial_metrics = defaultdict(dict)
    for test_actual_ratio in test_actual_ratios:
        # Construct the test set with the given ratio of females to test size.
        test_female_size = int(1.0 * args.test_size * test_actual_ratio)
        test_male_size = args.test_size - test_female_size
        if num_females < test_female_size:
            sys.exit('Not enough female samples: ' + str(test_female_size) +
                     ' for ratio: ' + str(test_actual_ratio))
        if num_males < test_male_size:
            sys.exit('Not enough male samples: ' + str(test_male_size) +
                     ' for ratio: ' + str(test_actual_ratio))

        test_female_indices = numpy.random.choice(num_females, test_female_size,
                                                  replace=False)
        test_male_indices = numpy.random.choice(num_males, test_male_size,
                                                replace=False)
        female_mask = numpy.zeros(num_females, dtype=bool)
        female_mask[test_female_indices] = True
        male_mask = numpy.zeros(num_males, dtype=bool)
        male_mask[test_male_indices] = True

        test_female_features = female_features[female_mask, :]
        test_female_labels = female_labels[female_mask]
        test_male_features = male_features[male_mask, :]
        test_male_labels = male_labels[male_mask]
        train_female_features = female_features[~female_mask, :]
        train_female_labels = female_labels[~female_mask]
        train_male_features = male_features[~male_mask, :]
        train_male_labels = male_labels[~male_mask]

        test_features = numpy.concatenate((test_female_features, test_male_features))
        test_labels = numpy.concatenate((test_female_labels, test_male_labels))
        train_features = numpy.concatenate((train_female_features, train_male_features))
        train_labels = numpy.concatenate((train_female_labels, train_male_labels))

        if args.learning_algorithm == 'random-forest':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_random_forest(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 args.train_size, args.skip_feature_selection, args.skip_grid_search,
                 args.rf_max_features, args.rf_num_trees, args.rf_criterion,
                 args.rf_min_samples_split, args.rf_min_samples_leaf, 1)
        elif args.learning_algorithm == 'svm':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_svm(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 args.train_size, 'minmax', 0, 1, args.skip_feature_selection,
                 args.skip_grid_search, args.svm_kernel, args.svm_gamma,
                 args.svm_cost, args.svm_degree, 1)
        elif args.learning_algorithm == 'logistic':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_logistic(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 args.train_size, args.skip_feature_selection, args.skip_grid_search,
                 args.logistic_penalty, args.logistic_cost, args.logistic_dual,
                 args.logistic_tolerance, 1)
        elif args.learning_algorithm == 'knn':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_knn(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 args.train_size, 'minmax', 0, 1, args.skip_feature_selection,
                 args.skip_grid_search, args.knn_num_neighbors, args.knn_weights,
                 args.knn_algorithm, args.knn_metric, 1)
        else:
            sys.exit('bad algorithm name.')

        # Size of labels in train/test.
        train_size = transformed_train_features.shape[0]
        train_female_size = sum(transformed_train_labels[:] == utils.FEMALE)
        train_male_size = sum(transformed_train_labels[:] == utils.MALE)
        test_size = transformed_test_features.shape[0]
        test_actual_female_size = sum(test_labels[:] == utils.FEMALE)
        test_actual_male_size = sum(test_labels[:] == utils.MALE)

        # Train performance.
        y_true, y_pred = transformed_train_labels, model.predict(
            transformed_train_features)
        train_predicted_female_size = sum(y_pred[:] == utils.FEMALE)
        train_predicted_male_size = sum(y_pred[:] == utils.MALE)
        confusion = confusion_matrix(y_true, y_pred)
        train_true_female = confusion[utils.FEMALE][utils.FEMALE]
        train_false_female = confusion[utils.MALE][utils.FEMALE]
        train_true_male = confusion[utils.MALE][utils.MALE]
        train_false_male = confusion[utils.FEMALE][utils.MALE]
        train_accuracy = model.score(transformed_train_features,
                                     transformed_train_labels) * 100.
        (train_precisions, train_recalls, fscores, supports) = (
            precision_recall_fscore_support(y_true=y_true, y_pred=y_pred,
                                            labels=[0, 1]))

        # Test performance.
        y_true, y_pred = test_labels, model.predict(transformed_test_features)
        test_predicted_female_size = sum(y_pred[:] == utils.FEMALE)
        test_predicted_male_size = sum(y_pred[:] == utils.MALE)
        test_predicted_ratio = (1.0 * test_predicted_female_size) / test_size
        confusion = confusion_matrix(y_true, y_pred)
        test_true_female = confusion[utils.FEMALE][utils.FEMALE]
        test_false_female = confusion[utils.MALE][utils.FEMALE]
        test_true_male = confusion[utils.MALE][utils.MALE]
        test_false_male = confusion[utils.FEMALE][utils.MALE]
        test_accuracy = model.score(transformed_test_features, test_labels) * 100.
        (test_precisions, test_recalls, fscores, supports) = (
            precision_recall_fscore_support(y_true=y_true, y_pred=y_pred,
                                            labels=[0, 1]))

        trial_metrics[test_actual_ratio]["train_size"] = train_size
        trial_metrics[test_actual_ratio]["train_female_size"] = train_female_size
        trial_metrics[test_actual_ratio]["train_male_size"] = train_male_size
        trial_metrics[test_actual_ratio]["train_predicted_female_size"] = train_predicted_female_size
        trial_metrics[test_actual_ratio]["train_predicted_male_size"] = train_predicted_male_size
        trial_metrics[test_actual_ratio]["train_true_female"] = train_true_female
        trial_metrics[test_actual_ratio]["train_false_female"] = train_false_female
        trial_metrics[test_actual_ratio]["train_true_male"] = train_true_male
        trial_metrics[test_actual_ratio]["train_false_male"] = train_false_male
        trial_metrics[test_actual_ratio]["train_accuracy"] = train_accuracy
        trial_metrics[test_actual_ratio]["train_female_precision"] = train_precisions[utils.FEMALE]
        trial_metrics[test_actual_ratio]["train_male_precision"] = train_precisions[utils.MALE]
        trial_metrics[test_actual_ratio]["train_female_recall"] = train_recalls[utils.FEMALE]
        trial_metrics[test_actual_ratio]["train_male_recall"] = train_recalls[utils.MALE]
        trial_metrics[test_actual_ratio]["test_size"] = test_size
        trial_metrics[test_actual_ratio]["test_actual_ratio"] = test_actual_ratio
        trial_metrics[test_actual_ratio]["test_actual_female_size"] = test_actual_female_size
        trial_metrics[test_actual_ratio]["test_actual_male_size"] = test_actual_male_size
        trial_metrics[test_actual_ratio]["test_predicted_ratio"] = test_predicted_ratio
        trial_metrics[test_actual_ratio]["test_predicted_female_size"] = test_predicted_female_size
        trial_metrics[test_actual_ratio]["test_predicted_male_size"] = test_predicted_male_size
        trial_metrics[test_actual_ratio]["test_true_female"] = test_true_female
        trial_metrics[test_actual_ratio]["test_false_female"] = test_false_female
        trial_metrics[test_actual_ratio]["test_true_male"] = test_true_male
        trial_metrics[test_actual_ratio]["test_false_male"] = test_false_male
        trial_metrics[test_actual_ratio]["test_accuracy"] = test_accuracy
        trial_metrics[test_actual_ratio]["test_female_precision"] = test_precisions[utils.FEMALE]
        trial_metrics[test_actual_ratio]["test_male_precision"] = test_precisions[utils.MALE]
        trial_metrics[test_actual_ratio]["test_female_recall"] = test_recalls[utils.FEMALE]
        trial_metrics[test_actual_ratio]["test_male_recall"] = test_recalls[utils.MALE]

    return trial_metrics
def compute_trial_metrics(df, filtering_thresholds, filtering_column, random_seed):
    random.seed(random_seed)

    # Mapping from filtering threshold to any of "accuracy", "precision", ... to a value.
    trial_metrics = defaultdict(dict)
    for filtering_threshold in filtering_thresholds:
        # First extract the piece of data satisfying the requested threshold and then
        # split into test/train.
        filtered_df = df[df[filtering_column] >= filtering_threshold]
        percentage_data = (100.0 * len(filtered_df.index)) / len(df.index)
        data = filtered_df.values
        features = data[:, 0:args.label_column]
        labels = data[:, args.label_column].astype(int)
        train_features, test_features, train_labels, test_labels = (
            model_selection.train_test_split(features, labels,
                                             test_size=args.test_size,
                                             random_state=random.randint(1, 99999999)))
        assert train_features.shape[0] >= args.train_size

        if args.learning_algorithm == 'random-forest':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_random_forest(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 args.train_size, args.skip_feature_selection, args.skip_grid_search,
                 args.rf_max_features, args.rf_num_trees, args.rf_criterion,
                 args.rf_min_samples_split, args.rf_min_samples_leaf,
                 args.num_processes)
        elif args.learning_algorithm == 'svm':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_svm(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 args.train_size, 'minmax', 0, 1, args.skip_feature_selection,
                 args.skip_grid_search, args.svm_kernel, args.svm_gamma,
                 args.svm_cost, args.svm_degree, args.num_processes)
        elif args.learning_algorithm == 'logistic':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_logistic(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 args.train_size, args.skip_feature_selection, args.skip_grid_search,
                 args.logistic_penalty, args.logistic_cost, args.logistic_dual,
                 args.logistic_tolerance, args.num_processes)
        elif args.learning_algorithm == 'knn':
            (model, transformed_train_features, transformed_train_labels,
             transformed_test_features) = models.train_knn(
                 train_features, train_labels, test_features, args.scikit_balancing,
                 args.train_size, 'minmax', 0, 1, args.skip_feature_selection,
                 args.skip_grid_search, args.knn_num_neighbors, args.knn_weights,
                 args.knn_algorithm, args.knn_metric, args.num_processes)
        else:
            sys.exit('bad algorithm name.')

        y_true, y_pred = test_labels, model.predict(transformed_test_features)
        predicted_probabilities = model.predict_proba(transformed_test_features)

        # Size of labels in train/test.
        train_size = transformed_train_features.shape[0]
        train_female_size = sum(transformed_train_labels[:] == utils.FEMALE)
        train_male_size = sum(transformed_train_labels[:] == utils.MALE)
        test_size = transformed_test_features.shape[0]
        test_female_size = sum(test_labels[:] == utils.FEMALE)
        test_male_size = sum(test_labels[:] == utils.MALE)

        # Compute evaluation metrics.
        test_accuracy = model.score(transformed_test_features, test_labels) * 100.
        test_AUC = roc_auc_score(y_true, predicted_probabilities[:, 1])
        (precisions, recalls, f1scores, supports) = precision_recall_fscore_support(
            y_true=y_true, y_pred=y_pred, labels=[0, 1])
        (ave_precision, ave_recall, ave_f1score, ave_support) = (
            precision_recall_fscore_support(y_true=y_true, y_pred=y_pred,
                                            labels=[0, 1], average='macro'))

        # Get true/false positive/negative.
        confusion = confusion_matrix(y_true, y_pred)
        test_true_female = confusion[utils.FEMALE][utils.FEMALE]
        test_false_female = confusion[utils.MALE][utils.FEMALE]
        test_true_male = confusion[utils.MALE][utils.MALE]
        test_false_male = confusion[utils.FEMALE][utils.MALE]

        trial_metrics[filtering_threshold]["min_active_days"] = filtering_threshold
        trial_metrics[filtering_threshold]["percentage_data"] = percentage_data
        trial_metrics[filtering_threshold]["train_size"] = train_size
        trial_metrics[filtering_threshold]["train_female_size"] = train_female_size
        trial_metrics[filtering_threshold]["train_male_size"] = train_male_size
        trial_metrics[filtering_threshold]["test_size"] = test_size
        trial_metrics[filtering_threshold]["test_female_size"] = test_female_size
        trial_metrics[filtering_threshold]["test_male_size"] = test_male_size
        trial_metrics[filtering_threshold]["test_true_female"] = test_true_female
        trial_metrics[filtering_threshold]["test_false_female"] = test_false_female
        trial_metrics[filtering_threshold]["test_true_male"] = test_true_male
        trial_metrics[filtering_threshold]["test_false_male"] = test_false_male
        trial_metrics[filtering_threshold]["test_accuracy"] = test_accuracy
        trial_metrics[filtering_threshold]["test_AUC"] = test_AUC
        trial_metrics[filtering_threshold]["test_average_precision"] = ave_precision
        trial_metrics[filtering_threshold]["test_female_precision"] = precisions[utils.FEMALE]
        trial_metrics[filtering_threshold]["test_male_precision"] = precisions[utils.MALE]
        trial_metrics[filtering_threshold]["test_average_recall"] = ave_recall
        trial_metrics[filtering_threshold]["test_female_recall"] = recalls[utils.FEMALE]
        trial_metrics[filtering_threshold]["test_male_recall"] = recalls[utils.MALE]
        trial_metrics[filtering_threshold]["test_average_f1score"] = ave_f1score
        trial_metrics[filtering_threshold]["test_female_f1score"] = f1scores[utils.FEMALE]
        trial_metrics[filtering_threshold]["test_male_f1score"] = f1scores[utils.MALE]

    return trial_metrics
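# Hedged sketch (assumption, not from the source): the nested dicts returned by
# compute_trial_metrics flatten naturally into a pandas DataFrame, for example to average
# metrics across random seeds before writing them out. The function name, the `output_csv`
# parameter, and the choice to group on "min_active_days" are illustrative assumptions.
import pandas


def summarize_trials_sketch(per_trial_metrics, output_csv):
    rows = []
    for trial_metrics in per_trial_metrics:
        rows.extend(trial_metrics.values())
    # Average every metric across trials for each filtering threshold.
    summary = pandas.DataFrame(rows).groupby('min_active_days').mean()
    summary.to_csv(output_csv)
    return summary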
def main(config):
    num_iters = 1
    batch_size = 4096
    data_dir = "data"
    data_limits = [0.2, 0.4, 0.6, 0.8, 1.0]
    iter_train_files, iter_test_files = split_train_test(data_dir, num_iters)
    model_names = ["rf", "mlp", "knn", "svm"]

    for limit in data_limits:
        model_bench_marks = {
            model: {
                "accuracies": [],
                "prediction_times": [],
                "training_times": []
            }
            for model in model_names
        }
        print("Training and testing with limit {}".format(limit))
        for i in range(num_iters):
            train_files, test_files = iter_train_files[i], iter_test_files[i]
            utils.save_list_as_text(test_files, "test_files.txt")
            x_train, y_train, x_dev, y_dev = preprocess_data(train_files, config)
            # Reshuffle the training data; the near-zero test split is discarded.
            x_train, _, y_train, _ = train_test_split(x_train, y_train,
                                                      test_size=0.0001)

            # Keep only the requested fraction of the train/dev data.
            limited_train_size = int(len(x_train) * limit)
            limited_test_size = int(len(x_dev) * limit)
            x_train, y_train = x_train[:limited_train_size], y_train[:limited_train_size]
            x_dev, y_dev = x_dev[:limited_test_size], y_dev[:limited_test_size]
            x_train_batches, y_train_batches = divide_into_batches(x_train, y_train,
                                                                   batch_size)
            x_dev_batches, y_dev_batches = divide_into_batches(x_dev, y_dev, batch_size)

            for model_name in model_names:
                start_train_time = time.time()
                if model_name == "rf":
                    trained_model, scaler = train_rf_batches(
                        x_train_batches, y_train_batches, x_dev_batches, y_dev_batches)
                elif model_name == "mlp":
                    trained_model, scaler = train_mlp(x_train, y_train, x_dev, y_dev)
                elif model_name == "knn":
                    trained_model, scaler = train_knn(x_train, y_train, x_dev, y_dev)
                elif model_name == "svm":
                    trained_model, scaler = train_svm(x_train, y_train, x_dev, y_dev)
                else:
                    raise ValueError("Unsupported model {}".format(model_name))
                train_time = time.time() - start_train_time

                iter_accuracy, iter_prediction_time = test(trained_model, scaler,
                                                           test_files, config)
                model_bench_marks[model_name]["accuracies"].append(iter_accuracy)
                model_bench_marks[model_name]["prediction_times"].append(
                    iter_prediction_time)
                model_bench_marks[model_name]["training_times"].append(train_time)
                print("Total training time {}".format(train_time))

        # Average the collected metrics over all iterations for this limit.
        for model, results in model_bench_marks.items():
            results["accuracies"] = np.mean(results["accuracies"])
            results["prediction_times"] = np.mean(results["prediction_times"])
            results["training_times"] = np.mean(results["training_times"])

        with open("benchmark_{}.json".format(limit), "w") as f:
            json.dump(model_bench_marks, f)
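# Hedged sketch (assumption): divide_into_batches is used above but not shown here. One
# plausible implementation simply slices the arrays into consecutive chunks of at most
# batch_size elements; the function name with the _sketch suffix is illustrative.
def divide_into_batches_sketch(x, y, batch_size):
    x_batches, y_batches = [], []
    for start in range(0, len(x), batch_size):
        x_batches.append(x[start:start + batch_size])
        y_batches.append(y[start:start + batch_size])
    return x_batches, y_batches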
def main():
    (train_features, train_labels, test_features, test_labels, class_values,
     class_names, feature_label_names) = utils.prepare_data(
         args.input_filename, args.label_column, args.train_size, args.test_size,
         args.imbalanced_data)

    # Now that we have limited the data to the requested train size, scale it, since knn
    # needs scaled features.
    (train_features, test_features) = utils.scale_data(train_features, test_features,
                                                       args.scaling_method)

    # Feature selection, if requested.
    if args.feature_selection_algo:
        feature_selector_obj = feature_selection.feature_selector(
            args.evaluation, train_features, train_labels, feature_label_names, -1,
            penalty_weights, args.feature_selection_algo, args.num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected " + str(len(feature_selector_obj.get_selected_features())) +
              " features")
        print("Top 10 features: " + str(feature_selector_obj.get_top_features(10)))

    model = models.train_knn(train_features, train_labels, args.skip_grid_search,
                             args.evaluation, args.num_jobs, args.num_neighbors,
                             args.weights, args.algorithm, args.metric)

    # Predict on the test set and report full stats.
    y_true, y_pred = test_labels, model.predict(test_features)
    print("\n*****************************\n")
    print('MAE: ' + str(metrics.mean_absolute_error(y_true, y_pred,
                                                    multioutput='uniform_average')))
    print('MSE: ' + str(metrics.mean_squared_error(y_true, y_pred,
                                                   multioutput='uniform_average')))
    print('Classification report:')
    print(metrics.classification_report(y_true, y_pred, labels=class_values,
                                        target_names=class_names))
    print('Precision Recall')
    print(metrics.precision_recall_fscore_support(y_true, y_pred, labels=class_values,
                                                  pos_label=None, average='weighted'))

    # Print and plot the confusion matrix.
    print('Confusion Matrix Without Normalization')
    numpy.set_printoptions(precision=2)
    cm = metrics.confusion_matrix(y_true, y_pred, labels=class_values)
    print(cm)
    print('Confusion Matrix With Normalization')
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
    print(cm_normalized)

    plt.figure()
    plt.subplot(2, 1, 1)
    utils.plot_confusion_matrix(cm, class_names, 'Unnormalized confusion matrix')
    # Normalize the confusion matrix by row (i.e. by the number of samples in each class).
    plt.subplot(2, 1, 2)
    utils.plot_confusion_matrix(cm_normalized, class_names,
                                'Normalized confusion matrix')

    pdf = PdfPages(args.output_figure + '.pdf')
    pdf.savefig()
    pdf.close()
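# Hedged sketch (assumption, not the repo's utils.plot_confusion_matrix): a minimal
# matplotlib rendering of a confusion matrix onto the current axes, matching the
# three-argument call used above. The function name with the _sketch suffix is illustrative.
import numpy
import matplotlib.pyplot as plt


def plot_confusion_matrix_sketch(cm, class_names, title):
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    tick_marks = numpy.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')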