def test_balanced_accuracy():
    """Test to ensure the correctness of the balanced accuracy calculation."""

    num_trials = 10
    for num_classes in np.random.randint(2, 100, num_trials):

        # no errors at all! class sizes are imbalanced
        cm_100 = np.zeros((num_classes, num_classes), int)
        np.fill_diagonal(cm_100, np.random.randint(10, 100, num_classes))
        if not np.isclose(balanced_accuracy(cm_100), 1.0):
            raise ArithmeticError('accuracy calculation on a perfect classifier '
                                  'does not return 100% accuracy!')

        # ALL errors! class sizes are imbalanced
        cm_100perc_wrong = np.random.randint(10, 100, (num_classes, num_classes))
        np.fill_diagonal(cm_100perc_wrong, 0)
        if not np.isclose(balanced_accuracy(cm_100perc_wrong), 0.0):
            raise ArithmeticError('accuracy calculation on a 100% wrong classifier '
                                  'does not return 0% accuracy!')

        # random off-diagonal errors, with the diagonal filled so that each
        # class reaches a chosen level of accuracy
        cm = np.random.randint(10, 100, (num_classes, num_classes)).astype('float64')
        np.fill_diagonal(cm, 0)
        cls_sizes_wout_diag_elem = cm.sum(axis=1)
        chosen_accuracy = np.random.rand(num_classes)
        factor = chosen_accuracy / (1.0 - chosen_accuracy)
        diag_values = np.around(cls_sizes_wout_diag_elem * factor).astype('float64')
        np.fill_diagonal(cm, diag_values)
        computed_acc = balanced_accuracy(cm)
        expected_acc = np.mean(chosen_accuracy)
        if not np.isclose(computed_acc, expected_acc, atol=1e-2):
            raise ArithmeticError(
                'accuracy calculations do not match the expected value!\n'
                '  Expected : {:.8f}\n'
                '  Estimated: {:.8f}\n'
                '  Differ by: {:.8f}\n'
                ''.format(expected_acc, computed_acc, expected_acc - computed_acc))
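
# For reference, a minimal sketch of the metric exercised by the test above:
# balanced accuracy as the mean of per-class recall, i.e. the mean of the
# confusion-matrix diagonal divided by the corresponding row sums. This is an
# illustrative implementation only; the name _balanced_accuracy_reference is
# hypothetical and not part of this module, which uses balanced_accuracy().
def _balanced_accuracy_reference(confmat):
    """Illustrative sketch: mean of per-class recall from a confusion matrix."""
    confmat = np.asarray(confmat, dtype='float64')
    per_class_recall = np.diag(confmat) / confmat.sum(axis=1)
    return np.mean(per_class_recall)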
def holdout_trial_compare_datasets(datasets,
                                   impute_strategy,
                                   train_size_common,
                                   feat_sel_size,
                                   train_perc,
                                   total_test_samples,
                                   num_classes,
                                   num_features_per_dataset,
                                   class_set,
                                   method_names,
                                   pos_class_index,
                                   out_results_dir,
                                   grid_search_level,
                                   classifier_name,
                                   feat_select_method,
                                   rep_id=None):
    """
    Runs a single iteration of optimizing the chosen pipeline on the chosen
    training set, and evaluating it on the given test set.

    Parameters
    ----------
    datasets

    impute_strategy : str
        Strategy to handle missing data: whether to raise an error when data
        is missing, or to impute it using the method chosen here.

    train_size_common

    feat_sel_size

    train_perc

    total_test_samples

    num_classes

    num_features_per_dataset

    class_set

    method_names

    pos_class_index

    out_results_dir

    rep_id

    grid_search_level : str
        If 'light', grid search resolution will be reduced to speed up
        optimization. If 'exhaustive', the broadest range of values for most
        parameters will be used for optimization.

    Returns
    -------
    results_list : list
        Results for this iteration: predicted probabilities and labels,
        true test labels, balanced accuracies, confusion matrices,
        weighted AUCs, feature importances, best parameters,
        misclassified sample IDs and the test set.
    """

    common_ds = datasets[cfg.COMMON_DATASET_INDEX]
    num_datasets = len(datasets)

    # multi-class metrics
    confusion_matrix = np.full([num_classes, num_classes, num_datasets], np.nan)
    accuracy_balanced = np.full(num_datasets, np.nan)
    auc_weighted = np.full(num_datasets, np.nan)
    best_params = [None] * num_datasets
    misclsfd_ids_this_run = [None] * num_datasets

    feature_importances = [None] * num_datasets
    for idx in range(num_datasets):
        feature_importances[idx] = np.full(num_features_per_dataset[idx], np.nan)

    # set of subjects for training and testing, common to all datasets
    train_set, test_set = common_ds.train_test_split_ids(
        count_per_class=train_size_common)

    # NOTE test labels are the same for all datasets - each feature/model
    #   combination is being evaluated against the same set of test samplets
    true_test_labels = np.array([common_ds.targets[sid]
                                 for sid in test_set if sid in common_ds.targets])

    pred_prob_per_class = np.full(
        [num_datasets, total_test_samples, num_classes], np.nan)
    pred_labels_per_rep_fs = np.empty([num_datasets, total_test_samples],
                                      dtype=true_test_labels.dtype)

    # to uniquely identify this iteration
    if rep_id is None:
        rep_proc_id = 'process{}'.format(os.getpid())
    else:
        rep_proc_id = str(rep_id)

    print_options = get_pretty_print_options(method_names, num_datasets)

    # evaluating each feature set/dataset
    for dd in range(num_datasets):
        print("CV trial {rep:6} "
              "feature {index:{nd}} "
              "{name:>{namewidth}} : "
              "".format(rep=rep_proc_id, index=dd, name=method_names[dd],
                        nd=print_options.num_digits,
                        namewidth=print_options.str_width), end='')

        # using the same train/test sets for all feature sets
        train_fs = datasets[dd].get_subset(train_set)
        test_fs = datasets[dd].get_subset(test_set)

        pred_prob_per_class[dd, :, :], pred_labels_per_rep_fs[dd, :], \
            _ignored_true_test_labels, conf_mat, misclsfd_ids_this_run[dd], \
            feature_importances[dd], best_params[dd] = \
            eval_optimized_model_on_testset(train_fs, test_fs,
                                            impute_strategy=impute_strategy,
                                            train_perc=train_perc,
                                            feat_sel_size=feat_sel_size,
                                            label_order_in_conf_matrix=class_set,
                                            grid_search_level=grid_search_level,
                                            classifier_name=classifier_name,
                                            feat_select_method=feat_select_method)

        # TODO new feature: add additional metrics such as PPV
        accuracy_balanced[dd] = balanced_accuracy(conf_mat)
        confusion_matrix[:, :, dd] = conf_mat
        print('balanced accuracy: {:.4f} '.format(accuracy_balanced[dd]), end='')

        if num_classes == 2:
            auc_weighted[dd] = roc_auc_score(
                true_test_labels,
                pred_prob_per_class[dd, :, pos_class_index],
                average='weighted')
            print('\t weighted AUC: {:.4f}'.format(auc_weighted[dd]), end='')

        print('', flush=True)
        sys.stdout.flush()
        sys.stderr.flush()

    results_list = [pred_prob_per_class, pred_labels_per_rep_fs,
                    true_test_labels,
                    accuracy_balanced, confusion_matrix, auc_weighted,
                    feature_importances, best_params,
                    misclsfd_ids_this_run, test_set]

    tmp_dir = get_temp_dir(out_results_dir)
    out_path = pjoin(tmp_dir,
                     '{}_{}.pkl'.format(cfg.temp_prefix_rhst, rep_proc_id))
    logging.info('results from rep {} saved to {}'.format(rep_proc_id, out_path))
    with open(out_path, 'bw') as of:
        pickle.dump(results_list, of)

    return results_list
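
# Illustrative sketch of how one of the per-rep result files written above
# could be read back, e.g. when gathering results across reps. The helper name
# _load_trial_results is hypothetical and not part of this module; the key
# order simply mirrors the order of results_list above, and pickle is assumed
# to be the same module-level import used for saving.
def _load_trial_results(results_path):
    """Illustrative sketch: load one per-rep results pickle into a dict."""
    keys = ('pred_prob_per_class', 'pred_labels_per_rep_fs', 'true_test_labels',
            'accuracy_balanced', 'confusion_matrix', 'auc_weighted',
            'feature_importances', 'best_params',
            'misclsfd_ids_this_run', 'test_set')
    with open(results_path, 'rb') as rf:
        values = pickle.load(rf)
    return dict(zip(keys, values))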