def evaluate_no_cv(self, train_index, test_index):
    """Fit and score a random forest on one split, without any grid search.

    The values stored in ``self._n_estimators_range``,
    ``self._max_depth_range``, ``self._min_samples_split_range`` and
    ``self._max_features_range`` are handed to
    ``self._launch_random_forest`` unchanged.

    Args:
        train_index: indexer selecting the training rows of ``self._x`` /
            ``self._y``.
        test_index: indexer selecting the test rows.

    Returns:
        dict with keys ``best_parameter`` (the parameters used plus the test
        ``balanced_accuracy``), ``evaluation``, ``evaluation_train``,
        ``y_hat``, ``y_hat_train``, ``y``, ``y_train``, ``y_index``,
        ``x_index`` and ``auc``.
    """
    train_features, train_labels = self._x[train_index], self._y[train_index]
    test_features, test_labels = self._x[test_index], self._y[test_index]

    used_parameters = {
        'n_estimators': self._n_estimators_range,
        'max_depth': self._max_depth_range,
        'min_samples_split': self._min_samples_split_range,
        'max_features': self._max_features_range,
    }

    _, predicted, auc, predicted_train = self._launch_random_forest(
        train_features, test_features, train_labels, test_labels,
        self._n_estimators_range,
        self._max_depth_range,
        self._min_samples_split_range,
        self._max_features_range)

    test_metrics = utils.evaluate_prediction(test_labels, predicted)
    # The parameter record also carries the score obtained with it.
    used_parameters['balanced_accuracy'] = test_metrics['balanced_accuracy']
    train_metrics = utils.evaluate_prediction(train_labels, predicted_train)

    return {
        'best_parameter': used_parameters,
        'evaluation': test_metrics,
        'evaluation_train': train_metrics,
        'y_hat': predicted,
        'y_hat_train': predicted_train,
        'y': test_labels,
        'y_train': train_labels,
        'y_index': test_index,
        'x_index': train_index,
        'auc': auc,
    }
def evaluate(self, train_index, test_index):
    """Run one outer fold: inner grid search, then refit and score a random forest.

    The training rows are split into ``self._grid_search_folds`` stratified
    inner folds. Every combination of the four hyper-parameter ranges is
    scored on every inner fold through ``self._grid_search``, submitted to a
    thread pool of ``self._n_threads`` workers. The combination chosen by
    ``self._select_best_parameter`` is then used to fit on the whole training
    split and predict the test split.

    Note: the inner ``StratifiedKFold`` shuffles without a fixed
    ``random_state``, so inner splits differ between calls.

    Args:
        train_index: indexer selecting the training rows of ``self._x`` /
            ``self._y``.
        test_index: indexer selecting the test rows.

    Returns:
        dict with keys ``best_parameter``, ``evaluation``,
        ``evaluation_train``, ``y_hat``, ``y_hat_train``, ``y``, ``y_train``,
        ``y_index``, ``x_index`` and ``auc``.
    """
    x_train = self._x[train_index]
    y_train = self._y[train_index]

    skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True)
    inner_cv = list(skf.split(np.zeros(len(y_train)), y_train))

    parameters_combinations = list(
        itertools.product(self._n_estimators_range,
                          self._max_depth_range,
                          self._min_samples_split_range,
                          self._max_features_range))

    # async_result[fold][parameters] -> AsyncResult of the inner-fold score.
    async_result = {fold: {} for fold in range(self._grid_search_folds)}

    # The pool is created only once the splits exist, and is always closed
    # and joined, so a failure while preparing or submitting work does not
    # leave worker threads behind.
    inner_pool = ThreadPool(self._n_threads)
    try:
        for fold, (inner_train_index, inner_test_index) in enumerate(inner_cv):
            x_train_inner = x_train[inner_train_index]
            x_test_inner = x_train[inner_test_index]
            y_train_inner = y_train[inner_train_index]
            y_test_inner = y_train[inner_test_index]

            for parameters in parameters_combinations:
                async_result[fold][parameters] = inner_pool.apply_async(
                    self._grid_search,
                    (x_train_inner, x_test_inner,
                     y_train_inner, y_test_inner,
                     parameters[0], parameters[1],
                     parameters[2], parameters[3]))
    finally:
        inner_pool.close()
        inner_pool.join()

    best_parameter = self._select_best_parameter(async_result)

    x_test = self._x[test_index]
    y_test = self._y[test_index]

    _, y_hat, auc, y_hat_train = self._launch_random_forest(
        x_train, x_test, y_train, y_test,
        best_parameter['n_estimators'],
        best_parameter['max_depth'],
        best_parameter['min_samples_split'],
        best_parameter['max_features'])

    result = dict()
    result['best_parameter'] = best_parameter
    result['evaluation'] = utils.evaluate_prediction(y_test, y_hat)
    result['evaluation_train'] = utils.evaluate_prediction(
        y_train, y_hat_train)
    result['y_hat'] = y_hat
    result['y_hat_train'] = y_hat_train
    result['y'] = y_test
    result['y_train'] = y_train
    result['y_index'] = test_index
    result['x_index'] = train_index
    result['auc'] = auc
    return result
def evaluate(self, train_index, test_index):
    """Run one outer fold of nested CV for an SVC on a precomputed kernel.

    ``self._kernel`` is sliced to the training rows/columns, the training
    labels are split into ``self._grid_search_folds`` stratified inner folds,
    and every value of ``self._c_range`` is scored on every inner fold through
    ``self._grid_search`` on a thread pool of ``self._n_threads`` workers.
    The parameter chosen by ``self._select_best_parameter`` is then used by
    ``self._launch_svc`` on the full training kernel.

    Note: the inner ``StratifiedKFold`` shuffles without a fixed
    ``random_state``, so inner splits differ between calls.

    Args:
        train_index: indexer selecting the training subjects.
        test_index: indexer selecting the test subjects.

    Returns:
        dict with keys ``best_parameter``, ``evaluation``,
        ``evaluation_train``, ``y_hat``, ``y_hat_train``, ``y``, ``y_train``,
        ``y_index``, ``x_index`` and ``auc``.
    """
    # Train-vs-train block of the kernel matrix.
    outer_kernel = self._kernel[train_index, :][:, train_index]
    y_train = self._y[train_index]

    skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True)
    inner_cv = list(skf.split(np.zeros(len(y_train)), y_train))

    # async_result[fold][c] -> AsyncResult of the inner-fold score.
    async_result = {fold: {} for fold in range(self._grid_search_folds)}

    # Always close and join the pool, even if preparing or submitting a task
    # fails, so worker threads are not leaked.
    inner_pool = ThreadPool(self._n_threads)
    try:
        for fold, (inner_train_index, inner_test_index) in enumerate(inner_cv):
            inner_kernel = outer_kernel[inner_train_index, :][:, inner_train_index]
            # Rows: inner test subjects; columns: inner training subjects.
            x_test_inner = outer_kernel[inner_test_index, :][:, inner_train_index]
            y_train_inner = y_train[inner_train_index]
            y_test_inner = y_train[inner_test_index]

            for c in self._c_range:
                print("Inner CV for C=%f... \n" % c)
                async_result[fold][c] = inner_pool.apply_async(
                    self._grid_search,
                    args=(inner_kernel, x_test_inner,
                          y_train_inner, y_test_inner, c))
    finally:
        inner_pool.close()
        inner_pool.join()

    best_parameter = self._select_best_parameter(async_result)

    # Rows: test subjects; columns: training subjects.
    x_test = self._kernel[test_index, :][:, train_index]
    y_test = self._y[test_index]

    _, y_hat, auc, y_hat_train = self._launch_svc(
        outer_kernel, x_test, y_train, y_test, best_parameter['c'])

    result = dict()
    result['best_parameter'] = best_parameter
    result['evaluation'] = utils.evaluate_prediction(y_test, y_hat)
    result['evaluation_train'] = utils.evaluate_prediction(
        y_train, y_hat_train)
    result['y_hat'] = y_hat
    result['y_hat_train'] = y_hat_train
    result['y'] = y_test
    result['y_train'] = y_train
    result['y_index'] = test_index
    result['x_index'] = train_index
    result['auc'] = auc
    return result
def _grid_search(self, x_train, x_test, y_train, y_test, max_depth,
                 learning_rate, n_estimators, colsample_bytree):
    """Score one XGBoost hyper-parameter combination on one inner fold.

    Delegates training and prediction to ``self._launch_xgboost`` and returns
    the ``balanced_accuracy`` entry of ``utils.evaluate_prediction`` computed
    on the test labels.
    """
    _model, predicted, _auc, _predicted_train = self._launch_xgboost(
        x_train, x_test, y_train, y_test,
        max_depth, learning_rate, n_estimators, colsample_bytree)
    return utils.evaluate_prediction(y_test, predicted)['balanced_accuracy']
def _grid_search(self, kernel_train, x_test, y_train, y_test, c):
    """Score one value of ``c`` for the SVC on one inner fold.

    Delegates training and prediction to ``self._launch_svc`` and returns the
    ``balanced_accuracy`` entry of ``utils.evaluate_prediction`` computed on
    the test labels.
    """
    _model, predicted, _auc, _predicted_train = self._launch_svc(
        kernel_train, x_test, y_train, y_test, c)
    return utils.evaluate_prediction(y_test, predicted)['balanced_accuracy']
def _grid_search(self, x_train, x_test, y_train, y_test, c):
    """Score one value of ``c`` for the logistic regression on one inner fold.

    Delegates training and prediction to ``self._launch_logistic_reg`` (which
    returns three values here) and returns the ``balanced_accuracy`` entry of
    ``utils.evaluate_prediction`` computed on the test labels.
    """
    _model, predicted, _auc = self._launch_logistic_reg(
        x_train, x_test, y_train, y_test, c)
    return utils.evaluate_prediction(y_test, predicted)['balanced_accuracy']
def _grid_search(self, x_train, x_test, y_train, y_test, n_estimators,
                 max_depth, min_samples_split, max_features):
    """Score one random-forest hyper-parameter combination on one inner fold.

    Delegates training and prediction to ``self._launch_random_forest`` and
    returns the ``balanced_accuracy`` entry of ``utils.evaluate_prediction``
    computed on the test labels.
    """
    _model, predicted, _auc, _predicted_train = self._launch_random_forest(
        x_train, x_test, y_train, y_test,
        n_estimators, max_depth, min_samples_split, max_features)
    return utils.evaluate_prediction(y_test, predicted)['balanced_accuracy']
def inner_grid_search(kernel_train, x_test, y_train, y_test, c, balanced=False):
    """Score one value of ``c`` on one inner fold.

    Calls ``launch_svc`` (which returns two values here; only the predictions
    are used) and returns the ``balanced_accuracy`` entry of
    ``evaluate_prediction`` computed on the test labels.
    """
    predicted, _auc = launch_svc(
        kernel_train, x_test, y_train, y_test, c, balanced)
    return evaluate_prediction(y_test, predicted)['balanced_accuracy']
def evaluate(self, train_index, test_index):
    """Run one outer fold of nested CV for the logistic regression.

    The training rows are split into ``self._grid_search_folds`` stratified
    inner folds, and every value of ``self._c_range`` is scored on every inner
    fold through ``self._grid_search`` on a thread pool of ``self._n_threads``
    workers. The parameter chosen by ``self._select_best_parameter`` is then
    used by ``self._launch_logistic_reg`` on the full training split.

    Note: the inner ``StratifiedKFold`` shuffles without a fixed
    ``random_state``, so inner splits differ between calls.

    Args:
        train_index: indexer selecting the training rows of ``self._x`` /
            ``self._y``.
        test_index: indexer selecting the test rows.

    Returns:
        dict with keys ``best_parameter``, ``evaluation``, ``y_hat``, ``y``,
        ``y_index`` and ``auc``.
    """
    x_train = self._x[train_index]
    y_train = self._y[train_index]

    skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True)
    inner_cv = list(skf.split(np.zeros(len(y_train)), y_train))

    # async_result[fold][c] -> AsyncResult of the inner-fold score.
    async_result = {fold: {} for fold in range(self._grid_search_folds)}

    # Always close and join the pool, even if preparing or submitting a task
    # fails, so worker threads are not leaked.
    inner_pool = ThreadPool(self._n_threads)
    try:
        for fold, (inner_train_index, inner_test_index) in enumerate(inner_cv):
            x_train_inner = x_train[inner_train_index]
            x_test_inner = x_train[inner_test_index]
            y_train_inner = y_train[inner_train_index]
            y_test_inner = y_train[inner_test_index]

            for c in self._c_range:
                async_result[fold][c] = inner_pool.apply_async(
                    self._grid_search,
                    (x_train_inner, x_test_inner,
                     y_train_inner, y_test_inner, c))
    finally:
        inner_pool.close()
        inner_pool.join()

    best_parameter = self._select_best_parameter(async_result)

    x_test = self._x[test_index]
    y_test = self._y[test_index]

    _, y_hat, auc = self._launch_logistic_reg(
        x_train, x_test, y_train, y_test, best_parameter['c'])

    result = dict()
    result['best_parameter'] = best_parameter
    result['evaluation'] = utils.evaluate_prediction(y_test, y_hat)
    result['y_hat'] = y_hat
    result['y'] = y_test
    result['y_index'] = test_index
    result['auc'] = auc
    return result
def outer_cross_validation(kernel_train, shared_x, train_indices, test_indices,
                           x_test, y_train, y_test, async_res, balanced=False):
    """Fit the final SVC of an outer fold with the best ``c`` and score it.

    ``select_best_c`` picks the parameter (and its accuracy) from the inner
    grid-search results in ``async_res``; ``launch_svc`` then trains with that
    value and predicts the test samples.

    Returns:
        dict with keys ``best_c``, ``best_acc``, ``evaluation``, ``y_hat``
        and ``auc``.
    """
    chosen_c, chosen_acc = select_best_c(async_res)
    predicted, auc = launch_svc(
        kernel_train, x_test, y_train, y_test, chosen_c, balanced,
        shared_x, train_indices, test_indices)

    return {
        'best_c': chosen_c,
        'best_acc': chosen_acc,
        'evaluation': evaluate_prediction(y_test, predicted),
        'y_hat': predicted,
        'auc': auc,
    }
def _compute_average_test_accuracy(self, y_list, yhat_list):
    """Return the balanced accuracy of ``yhat_list`` against ``y_list``.

    The value is the ``balanced_accuracy`` entry of clinica's
    ``evaluate_prediction``; the import is kept local to this method.
    """
    from clinica.pipelines.machine_learning.svm_utils import evaluate_prediction

    metrics = evaluate_prediction(y_list, yhat_list)
    return metrics['balanced_accuracy']
tasks_dir, '%s_vs_%s_field_3T.tsv' % (task[0], task[1])), sep='\t') results = pd.io.parsers.read_csv(path.join(output_dir, '%s_vs_%s' % (task[0], task[1]), 'test_subjects.tsv'), sep='\t') results['Field_Strength'] = list( field.Field_Strength[results.subject_index]) print task results15 = results[results.Field_Strength == 1.5] print 'Mean accuracy 1.5T: ' print utils.evaluate_prediction(list(results15.y), list(results15.y_hat)) results3 = results[results.Field_Strength == 3] print 'Mean accuracy 3T: ' print utils.evaluate_prediction(list(results3.y), list(results3.y_hat)) # Population stats print 'ADNI 1.5T vs 3T population stats' path_bids = '/ADNI/BIDS' tasks_dir = '/ADNI/SUBJECTS/lists_by_task' for task in tasks: print task print 'Subjects 1.5T' dx15 = path.join(tasks_dir, '%s_vs_%s_field_1.5T.tsv' % (task[0], task[1])) population_stats(path_bids, dx15, 'ADNI')
def evaluate(self, train_index, test_index, top_k=50):
    """Run one outer fold: rescale, select features, then nested CV for an SVC.

    Steps:
      1. Optional feature rescaling (``self._feature_rescaling_method`` is
         ``'zscore'``, ``'minmax'`` or ``None``). The scaler is fitted on the
         training rows only and applied to all rows.
      2. Optional feature selection (``self._feature_selection_method`` is
         ``'ANOVA'``, ``'RF'``, ``'PCA'`` or ``'RFE'``), also fitted on the
         training rows only. Any other value leaves the features unchanged.
      3. A linear Gram matrix of the resulting features is stored in
         ``self._kernel``.
      4. An inner stratified grid search over ``self._c_range`` on a thread
         pool, followed by a final fit via ``self._launch_svc``.

    NOTE(review): this overwrites ``self._kernel`` on every call, so
    concurrent calls on the same instance would interfere with each other.
    Confirm that callers run outer folds sequentially.

    Args:
        train_index: indexer selecting the training subjects.
        test_index: indexer selecting the test subjects.
        top_k: selection strength, interpreted per method: ``percentile`` for
            ANOVA, ``threshold`` for RF (``SelectFromModel``),
            ``n_components`` for PCA, and the percentage of features to keep
            for RFE.

    Returns:
        dict with keys ``best_parameter``, ``evaluation``,
        ``evaluation_train``, ``y_hat``, ``y_hat_train``, ``y``, ``y_train``,
        ``y_index``, ``x_index`` and ``auc``.

    Raises:
        ValueError: if ``self._feature_rescaling_method`` is not one of the
            supported values.
    """
    # --- feature rescaling (fitted on training rows only) ---
    rescaling = self._feature_rescaling_method
    if rescaling == 'zscore':
        scaler = StandardScaler(with_std=self._with_std)
        scaler.fit(self._x[train_index])
        x_after = scaler.transform(self._x)
    elif rescaling == 'minmax':
        scaler = MinMaxScaler()
        scaler.fit(self._x[train_index])
        x_after = scaler.transform(self._x)
    elif rescaling is None:
        x_after = self._x
    else:
        raise ValueError('Method has not been implemented')

    # --- feature selection (fitted on training rows only) ---
    selection = self._feature_selection_method
    if selection == 'ANOVA':
        selector = SelectPercentile(f_classif, percentile=top_k)
        selector.fit(x_after[train_index], self._y[train_index])
        x_after = selector.transform(x_after)
    elif selection == 'RF':
        clf = RandomForestClassifier(n_estimators=250, random_state=0,
                                     n_jobs=-1)
        clf.fit(x_after[train_index], self._y[train_index])
        selector = SelectFromModel(clf, threshold=top_k)
        selector.fit(x_after[train_index], self._y[train_index])
        x_after = selector.transform(x_after)
    elif selection == 'PCA':
        selector = PCA(n_components=top_k)
        selector.fit(x_after[train_index])
        x_after = selector.transform(x_after)
    elif selection == 'RFE':
        svc = SVR(kernel="linear")
        # top_k is a percentage of the available features here.
        selector = RFE(
            estimator=svc,
            n_features_to_select=int(
                0.01 * top_k * x_after[train_index].shape[1]),
            step=0.5)
        selector.fit(x_after[train_index], self._y[train_index])
        x_after = selector.transform(x_after)

    self._kernel = utils.gram_matrix_linear(x_after)

    # Train-vs-train block of the kernel matrix.
    outer_kernel = self._kernel[train_index, :][:, train_index]
    y_train = self._y[train_index]

    skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True)
    inner_cv = list(skf.split(np.zeros(len(y_train)), y_train))

    # async_result[fold][c] -> AsyncResult of the inner-fold score.
    async_result = {fold: {} for fold in range(self._grid_search_folds)}

    # The pool is created only once all preprocessing has succeeded, and is
    # always closed and joined so worker threads are not leaked on failure.
    inner_pool = ThreadPool(self._n_threads)
    try:
        for fold, (inner_train_index, inner_test_index) in enumerate(inner_cv):
            inner_kernel = outer_kernel[inner_train_index, :][:, inner_train_index]
            x_test_inner = outer_kernel[inner_test_index, :][:, inner_train_index]
            y_train_inner = y_train[inner_train_index]
            y_test_inner = y_train[inner_test_index]

            for c in self._c_range:
                async_result[fold][c] = inner_pool.apply_async(
                    self._grid_search,
                    (inner_kernel, x_test_inner,
                     y_train_inner, y_test_inner, c))
    finally:
        inner_pool.close()
        inner_pool.join()

    best_parameter = self._select_best_parameter(async_result)

    # Rows: test subjects; columns: training subjects.
    x_test = self._kernel[test_index, :][:, train_index]
    y_test = self._y[test_index]

    _, y_hat, auc, y_hat_train = self._launch_svc(
        outer_kernel, x_test, y_train, y_test, best_parameter['c'])

    result = dict()
    result['best_parameter'] = best_parameter
    result['evaluation'] = utils.evaluate_prediction(y_test, y_hat)
    result['evaluation_train'] = utils.evaluate_prediction(
        y_train, y_hat_train)
    result['y_hat'] = y_hat
    result['y_hat_train'] = y_hat_train
    result['y'] = y_test
    result['y_train'] = y_train
    result['y_index'] = test_index
    result['x_index'] = train_index
    result['auc'] = auc
    return result
adni_images.get_images(), mask=True) weights = np.loadtxt( path.join(adni_classifier_dir, 'weights.txt')) w = vbio.revert_mask(weights, adni_data_mask, adni_orig_shape).flatten() b = np.loadtxt(path.join(adni_classifier_dir, 'intersect.txt')) x = input_images.get_x() y = input_images.get_y() y_hat = np.dot(w, x.transpose()) + b y_binary = (y_hat > 0) * 1.0 evaluation = utils.evaluate_prediction(y, y_binary) auc = roc_auc_score(y, y_hat) evaluation['AUC'] = auc print evaluation del evaluation['confusion_matrix'] res_df = pd.DataFrame(evaluation, index=[ 'i', ]) res_df.to_csv(path.join(classification_dir, 'results_auc.tsv'), sep='\t')
def svm_binary_classification(input_image_atlas,
                              subjects_visits_tsv,
                              image_list,
                              diagnosis_list,
                              output_directory,
                              kernel_function=None,
                              existing_gram_matrix=None,
                              mask_zeros=True,
                              scale_data=False,
                              balanced=False,
                              outer_folds=10,
                              inner_folds=10,
                              n_threads=10,
                              c_range=np.logspace(-10, 2, 1000),
                              save_gram_matrix=False,
                              save_subject_classification=False,
                              save_dual_coefficients=False,
                              scaler=None,
                              data_mask=None,
                              save_original_weights=False,
                              save_features_image=True):
    """Run a cross-validated SVM for every pair of diagnoses and save results.

    For each unordered pair of distinct values in ``diagnosis_list`` the
    matching subjects are selected, the Gram matrix is restricted to them,
    ``cv_svm`` is run, metrics are printed, and the requested artefacts are
    written under ``output_directory``. A summary of all pairs is written with
    ``results_to_tsv`` at the end.

    Args:
        input_image_atlas: passed to ``weights_to_nifti`` when
            ``save_features_image`` is set.
        subjects_visits_tsv: passed to ``load_data`` along with
            ``image_list``.
        image_list: images to load, one per subject.
        diagnosis_list: diagnosis label of each entry of ``image_list``; the
            labels are concatenated into file names, so they are expected to
            be strings.
        output_directory: directory receiving all output files.
        kernel_function: callable computing the Gram matrix from the loaded
            data. Mutually exclusive with ``existing_gram_matrix``.
        existing_gram_matrix: precomputed square Gram matrix with one row per
            image. Mutually exclusive with ``kernel_function``.
        mask_zeros: not used by this function.
        scale_data: if true, the loaded data is passed through ``scale``.
        balanced, outer_folds, inner_folds, n_threads, c_range: forwarded to
            ``cv_svm``.
        save_gram_matrix: write the Gram matrix to ``gram_matrix.txt``.
        save_subject_classification: write per-subject predictions.
        save_dual_coefficients: save dual coefficients, support-vector
            indices and intercept.
        scaler, data_mask: forwarded to ``features_weights``.
        save_original_weights: save the feature weights as an array.
        save_features_image: save the feature weights as a NIfTI image.

    Raises:
        ValueError: if neither or both of ``kernel_function`` and
            ``existing_gram_matrix`` are given, or if the provided Gram
            matrix is not square with one row per image.
    """
    # Exactly one of the two Gram-matrix sources must be provided.
    if (kernel_function is None) == (existing_gram_matrix is None):
        raise ValueError(
            'Kernel_function and existing_gram_matrix are mutually exclusive parameters.'
        )

    results = dict()
    dx_filter = np.unique(diagnosis_list)

    print('Loading ' + str(len(image_list)) + ' subjects')
    x0 = load_data(image_list, subjects_visits_tsv)
    print('Subjects loaded')

    x_all = scale(x0) if scale_data else x0

    if existing_gram_matrix is None:
        # kernel_function is guaranteed to be set by the check above.
        print('Calculating Gram matrix')
        gram_matrix = kernel_function(x_all)
        print('Gram matrix calculated')
    else:
        gram_matrix = existing_gram_matrix
        if (gram_matrix.shape[0] != gram_matrix.shape[1]
                or gram_matrix.shape[0] != len(image_list)):
            raise ValueError(
                'The existing Gram matrix must be a square matrix with number of rows and columns equal to the number of images.'
            )

    if save_gram_matrix:
        np.savetxt(join(output_directory, 'gram_matrix.txt'), gram_matrix)

    shared_x = sharedmem.copy(x_all)
    x_all = None
    gc.collect()

    balance_suffix = '_balanced' if balanced else '_not_balanced'

    for i in range(len(dx_filter)):
        for j in range(i + 1, len(dx_filter)):
            print(j)
            dx1 = dx_filter[i]
            dx2 = dx_filter[j]

            # Positions of the subjects belonging to each diagnosis.
            ind1 = [k for k, dx in enumerate(diagnosis_list) if dx == dx1]
            ind2 = [k for k, dx in enumerate(diagnosis_list) if dx == dx2]
            indices = ind1 + ind2

            current_subjects = [image_list[k] for k in indices]
            current_diagnosis = [diagnosis_list[k] for k in indices]

            # dx1 subjects are labelled 0, dx2 subjects 1.
            y = np.array([0] * len(ind1) + [1] * len(ind2))
            gm = gram_matrix[indices, :][:, indices]

            classification_str = dx1 + '_vs_' + dx2 + balance_suffix
            print('Running ' + dx1 + ' vs ' + dx2 + ' classification')

            y_hat, dual_coefficients, sv_indices, intersect, c, auc = cv_svm(
                gm, shared_x, np.array(indices), y, c_range,
                balanced=balanced,
                outer_folds=outer_folds,
                inner_folds=inner_folds,
                n_threads=n_threads)

            evaluation = evaluate_prediction(y, y_hat)
            evaluation['auc'] = auc

            print('\nTrue positive %0.2f' % len(evaluation['predictions'][0]))
            print('True negative %0.2f' % len(evaluation['predictions'][1]))
            print('False positive %0.2f' % len(evaluation['predictions'][2]))
            print('False negative %0.2f' % len(evaluation['predictions'][3]))
            print('AUC %0.2f' % auc)
            print('Accuracy %0.2f' % evaluation['accuracy'])
            print('Balanced accuracy %0.2f' % evaluation['balanced_accuracy'])
            print('Sensitivity %0.2f' % evaluation['sensitivity'])
            print('Specificity %0.2f' % evaluation['specificity'])
            print('Positive predictive value %0.2f' % evaluation['ppv'])
            print('Negative predictive value %0.2f \n' % evaluation['npv'])

            if save_dual_coefficients:
                np.save(
                    join(output_directory,
                         classification_str + '__dual_coefficients'),
                    dual_coefficients[0])
                np.save(
                    join(output_directory,
                         classification_str + '__sv_indices'), sv_indices)
                np.save(
                    join(output_directory,
                         classification_str + '__intersect'), intersect)

            # The weights are needed by both of the following outputs.
            if save_original_weights or save_features_image:
                weights_orig = features_weights(current_subjects,
                                                dual_coefficients[0],
                                                sv_indices, scaler, data_mask)

            if save_original_weights:
                np.save(
                    join(output_directory, classification_str + '__weights'),
                    weights_orig)

            if save_features_image:
                output_image = weights_to_nifti(input_image_atlas,
                                                weights_orig)
                output_image.to_filename(
                    join(output_directory,
                         classification_str + '__weights.nii'))

            if save_subject_classification:
                save_subjects_prediction(
                    current_subjects, current_diagnosis, y, y_hat,
                    join(output_directory,
                         classification_str + '__subjects.tsv'))

            results[(dx1, dx2)] = evaluation

    results_to_tsv(
        results, dx_filter,
        join(output_directory, 'resume' + balance_suffix + '.tsv'))

    shared_x = None
    gc.collect()
def evaluate(self, train_index, test_index, top_k):
    """Run one outer fold: select features, then nested CV for an SVC.

    A feature selector chosen by ``self._feature_selection_method``
    (``'ANOVA'``, ``'RF'``, ``'PCA'`` or ``'RFE'``) is fitted on the training
    rows only and applied to all rows. A linear Gram matrix of the selected
    features is stored in ``self._kernel``, an inner stratified grid search
    over ``self._c_range`` runs on a thread pool, and the final model is
    fitted via ``self._launch_svc``.

    NOTE(review): this overwrites ``self._kernel`` on every call, so
    concurrent calls on the same instance would interfere with each other.
    Confirm that callers run outer folds sequentially.

    Args:
        train_index: indexer selecting the training subjects.
        test_index: indexer selecting the test subjects.
        top_k: selection strength, interpreted per method: ``percentile`` for
            ANOVA, ``threshold`` for RF (``SelectFromModel``),
            ``n_components`` for PCA, and the percentage of features to keep
            for RFE.

    Returns:
        dict with keys ``best_parameter``, ``evaluation``,
        ``evaluation_train``, ``y_hat``, ``y_hat_train``, ``y``, ``y_train``,
        ``y_index``, ``x_index`` and ``auc``.

    Raises:
        ValueError: if ``self._feature_selection_method`` is not one of the
            supported values.
    """
    method = self._feature_selection_method
    if method == 'ANOVA':
        selector = SelectPercentile(f_classif, percentile=top_k)
        selector.fit(self._x[train_index], self._y[train_index])
    elif method == 'RF':
        clf = RandomForestClassifier(n_estimators=250, random_state=0,
                                     n_jobs=-1)
        clf.fit(self._x[train_index], self._y[train_index])
        selector = SelectFromModel(clf, threshold=top_k)
        selector.fit(self._x[train_index], self._y[train_index])
    elif method == 'PCA':
        selector = PCA(n_components=top_k)
        selector.fit(self._x[train_index])
    elif method == 'RFE':
        svc = SVR(kernel="linear")
        # top_k is a percentage of the available features here.
        selector = RFE(
            estimator=svc,
            n_features_to_select=int(
                0.01 * top_k * self._x[train_index].shape[1]),
            step=0.5)
        selector.fit(self._x[train_index], self._y[train_index])
    else:
        # Fail with a clear error instead of continuing to the transform
        # below with no selector defined.
        raise ValueError('Method has not been implemented')

    x_after_fs = selector.transform(self._x)

    print('In total, there are %d voxels in this task' % self._x[train_index].shape[1])
    print('The threshold is %s' % str(top_k))
    print('We select the %d most discriminative voxels' % x_after_fs.shape[1])

    self._kernel = utils.gram_matrix_linear(x_after_fs)

    # Train-vs-train block of the kernel matrix.
    outer_kernel = self._kernel[train_index, :][:, train_index]
    y_train = self._y[train_index]

    skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True)
    inner_cv = list(skf.split(np.zeros(len(y_train)), y_train))

    # async_result[fold][c] -> AsyncResult of the inner-fold score.
    async_result = {fold: {} for fold in range(self._grid_search_folds)}

    # The pool is created only once feature selection has succeeded, and is
    # always closed and joined so worker threads are not leaked on failure.
    inner_pool = ThreadPool(self._n_threads)
    try:
        for fold, (inner_train_index, inner_test_index) in enumerate(inner_cv):
            inner_kernel = outer_kernel[inner_train_index, :][:, inner_train_index]
            x_test_inner = outer_kernel[inner_test_index, :][:, inner_train_index]
            y_train_inner = y_train[inner_train_index]
            y_test_inner = y_train[inner_test_index]

            for c in self._c_range:
                async_result[fold][c] = inner_pool.apply_async(
                    self._grid_search,
                    (inner_kernel, x_test_inner,
                     y_train_inner, y_test_inner, c))
    finally:
        inner_pool.close()
        inner_pool.join()

    best_parameter = self._select_best_parameter(async_result)

    # Rows: test subjects; columns: training subjects.
    x_test = self._kernel[test_index, :][:, train_index]
    y_test = self._y[test_index]

    _, y_hat, auc, y_hat_train = self._launch_svc(
        outer_kernel, x_test, y_train, y_test, best_parameter['c'])

    result = dict()
    result['best_parameter'] = best_parameter
    result['evaluation'] = utils.evaluate_prediction(y_test, y_hat)
    result['evaluation_train'] = utils.evaluate_prediction(y_train, y_hat_train)
    result['y_hat'] = y_hat
    result['y_hat_train'] = y_hat_train
    result['y'] = y_test
    result['y_train'] = y_train
    result['y_index'] = test_index
    result['x_index'] = train_index
    result['auc'] = auc
    return result