Example #1
    def evaluate_no_cv(self, train_index, test_index):

        x_train = self._x[train_index]
        y_train = self._y[train_index]
        x_test = self._x[test_index]
        y_test = self._y[test_index]

        best_parameter = dict()
        best_parameter['n_estimators'] = self._n_estimators_range
        best_parameter['max_depth'] = self._max_depth_range
        best_parameter['min_samples_split'] = self._min_samples_split_range
        best_parameter['max_features'] = self._max_features_range

        _, y_hat, auc, y_hat_train = self._launch_random_forest(
            x_train, x_test, y_train, y_test, self._n_estimators_range,
            self._max_depth_range, self._min_samples_split_range,
            self._max_features_range)
        result = dict()
        result['best_parameter'] = best_parameter
        result['evaluation'] = utils.evaluate_prediction(y_test, y_hat)
        best_parameter['balanced_accuracy'] = result['evaluation'][
            'balanced_accuracy']
        result['evaluation_train'] = utils.evaluate_prediction(
            y_train, y_hat_train)
        result['y_hat'] = y_hat
        result['y_hat_train'] = y_hat_train
        result['y'] = y_test
        result['y_train'] = y_train
        result['y_index'] = test_index
        result['x_index'] = train_index
        result['auc'] = auc

        return result
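
Every example in this listing calls utils.evaluate_prediction(y, y_hat) and reads keys such as 'balanced_accuracy'. The helper itself is not reproduced here; the following is only a minimal sketch of what it presumably computes, assuming binary labels coded 0/1 (the real helper's exact keys and edge-case handling may differ):

import numpy as np

def evaluate_prediction_sketch(y, y_hat):
    # Illustrative re-implementation only; assumes labels are 0/1.
    y, y_hat = np.asarray(y), np.asarray(y_hat)
    tp = int(np.sum((y == 1) & (y_hat == 1)))
    tn = int(np.sum((y == 0) & (y_hat == 0)))
    fp = int(np.sum((y == 0) & (y_hat == 1)))
    fn = int(np.sum((y == 1) & (y_hat == 0)))
    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0
    return {'accuracy': (tp + tn) / len(y),
            'balanced_accuracy': (sensitivity + specificity) / 2,
            'sensitivity': sensitivity,
            'specificity': specificity}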
Example #2
    def evaluate(self, train_index, test_index):

        inner_pool = ThreadPool(self._n_threads)
        async_result = {}
        for i in range(self._grid_search_folds):
            async_result[i] = {}

        x_train = self._x[train_index]
        y_train = self._y[train_index]

        skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True)
        inner_cv = list(skf.split(np.zeros(len(y_train)), y_train))

        parameters_combinations = list(
            itertools.product(self._n_estimators_range, self._max_depth_range,
                              self._min_samples_split_range,
                              self._max_features_range))

        for i in range(len(inner_cv)):
            inner_train_index, inner_test_index = inner_cv[i]

            x_train_inner = x_train[inner_train_index]
            x_test_inner = x_train[inner_test_index]
            y_train_inner = y_train[inner_train_index]
            y_test_inner = y_train[inner_test_index]

            for parameters in parameters_combinations:
                async_result[i][parameters] = inner_pool.apply_async(
                    self._grid_search,
                    (x_train_inner, x_test_inner, y_train_inner, y_test_inner,
                     parameters[0], parameters[1], parameters[2],
                     parameters[3]))
        inner_pool.close()
        inner_pool.join()
        best_parameter = self._select_best_parameter(async_result)
        x_test = self._x[test_index]
        y_test = self._y[test_index]

        _, y_hat, auc, y_hat_train = self._launch_random_forest(
            x_train, x_test, y_train, y_test, best_parameter['n_estimators'],
            best_parameter['max_depth'], best_parameter['min_samples_split'],
            best_parameter['max_features'])

        result = dict()
        result['best_parameter'] = best_parameter
        result['evaluation'] = utils.evaluate_prediction(y_test, y_hat)
        result['evaluation_train'] = utils.evaluate_prediction(
            y_train, y_hat_train)
        result['y_hat'] = y_hat
        result['y_hat_train'] = y_hat_train
        result['y'] = y_test
        result['y_train'] = y_train
        result['y_index'] = test_index
        result['x_index'] = train_index
        result['auc'] = auc

        return result
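
Neither _grid_search nor _select_best_parameter appears alongside this example. Given the async_result layout built above ({fold: {parameter_tuple: AsyncResult}}) and the fact that _grid_search returns a balanced accuracy (see Examples #4-#7), a plausible sketch of the selection step, with illustrative names only:

def select_best_parameter_sketch(async_result):
    # Average each parameter combination's balanced accuracy across
    # the inner folds, then keep the best-scoring combination.
    accuracies = {}
    for fold_results in async_result.values():
        for params, async_acc in fold_results.items():
            accuracies.setdefault(params, []).append(async_acc.get())
    best_params, accs = max(accuracies.items(),
                            key=lambda item: sum(item[1]) / len(item[1]))
    n_estimators, max_depth, min_samples_split, max_features = best_params
    return {'n_estimators': n_estimators, 'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'max_features': max_features,
            'balanced_accuracy': sum(accs) / len(accs)}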
Example #3
    def evaluate(self, train_index, test_index):

        inner_pool = ThreadPool(self._n_threads)
        async_result = {}
        for i in range(self._grid_search_folds):
            async_result[i] = {}

        outer_kernel = self._kernel[train_index, :][:, train_index]
        y_train = self._y[train_index]

        skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True)
        inner_cv = list(skf.split(np.zeros(len(y_train)), y_train))

        for i in range(len(inner_cv)):
            inner_train_index, inner_test_index = inner_cv[i]

            inner_kernel = outer_kernel[
                inner_train_index, :][:, inner_train_index]
            x_test_inner = outer_kernel[inner_test_index, :][:,
                                                             inner_train_index]
            y_train_inner, y_test_inner = y_train[inner_train_index], y_train[
                inner_test_index]

            for c in self._c_range:
                print("Inner CV for C=%f... \n" % c)
                async_result[i][c] = inner_pool.apply_async(
                    self._grid_search,
                    args=(inner_kernel, x_test_inner, y_train_inner,
                          y_test_inner, c))
                #print i, c, async_result[i][c]
        inner_pool.close()
        inner_pool.join()

        best_parameter = self._select_best_parameter(async_result)
        x_test = self._kernel[test_index, :][:, train_index]
        y_train, y_test = self._y[train_index], self._y[test_index]

        _, y_hat, auc, y_hat_train = self._launch_svc(outer_kernel, x_test,
                                                      y_train, y_test,
                                                      best_parameter['c'])

        result = dict()
        result['best_parameter'] = best_parameter
        result['evaluation'] = utils.evaluate_prediction(y_test, y_hat)
        result['evaluation_train'] = utils.evaluate_prediction(
            y_train, y_hat_train)
        result['y_hat'] = y_hat
        result['y_hat_train'] = y_hat_train
        result['y'] = y_test
        result['y_train'] = y_train
        result['y_index'] = test_index
        result['x_index'] = train_index
        result['auc'] = auc

        return result
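
Example #3 passes kernel blocks rather than raw features: outer_kernel is the train/train block and x_test the test/train block of a precomputed Gram matrix. That double-indexing pattern (kernel[rows, :][:, cols]) matches scikit-learn's precomputed-kernel interface, so _launch_svc could plausibly look like the sketch below (the function name and AUC handling are assumptions):

from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

def launch_svc_sketch(kernel_train, x_test, y_train, y_test, c):
    # kernel_train: (n_train, n_train) Gram matrix of the training samples;
    # x_test: (n_test, n_train) kernel between test and training samples.
    svc = SVC(C=c, kernel='precomputed')
    svc.fit(kernel_train, y_train)
    y_hat = svc.predict(x_test)
    y_hat_train = svc.predict(kernel_train)
    auc = roc_auc_score(y_test, svc.decision_function(x_test))
    return svc, y_hat, auc, y_hat_train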
Example #4
    def _grid_search(self, x_train, x_test, y_train, y_test, max_depth, learning_rate, n_estimators, colsample_bytree):

        _, y_hat, _, _ = self._launch_xgboost(x_train, x_test, y_train, y_test, max_depth, learning_rate, n_estimators,
                                              colsample_bytree)
        res = utils.evaluate_prediction(y_test, y_hat)

        return res['balanced_accuracy']
Example #5
    def _grid_search(self, kernel_train, x_test, y_train, y_test, c):

        _, y_hat, _, _ = self._launch_svc(kernel_train, x_test, y_train,
                                          y_test, c)
        res = utils.evaluate_prediction(y_test, y_hat)

        return res['balanced_accuracy']
Example #6
    def _grid_search(self, x_train, x_test, y_train, y_test, c):

        _, y_hat, _ = self._launch_logistic_reg(x_train, x_test, y_train,
                                                y_test, c)
        res = utils.evaluate_prediction(y_test, y_hat)

        return res['balanced_accuracy']
Example #7
    def _grid_search(self, x_train, x_test, y_train, y_test, n_estimators, max_depth, min_samples_split, max_features):

        _, y_hat, _, _ = self._launch_random_forest(x_train, x_test, y_train, y_test,
                                                    n_estimators, max_depth,
                                                    min_samples_split, max_features)
        res = utils.evaluate_prediction(y_test, y_hat)

        return res['balanced_accuracy']
Example #8
def inner_grid_search(kernel_train,
                      x_test,
                      y_train,
                      y_test,
                      c,
                      balanced=False):

    y_hat, _ = launch_svc(kernel_train, x_test, y_train, y_test, c, balanced)
    res = evaluate_prediction(y_test, y_hat)

    return res['balanced_accuracy']
Example #9
    def evaluate(self, train_index, test_index):

        inner_pool = ThreadPool(self._n_threads)
        async_result = {}
        for i in range(self._grid_search_folds):
            async_result[i] = {}

        x_train = self._x[train_index]
        y_train = self._y[train_index]

        skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True)
        inner_cv = list(skf.split(np.zeros(len(y_train)), y_train))

        for i in range(len(inner_cv)):
            inner_train_index, inner_test_index = inner_cv[i]

            x_train_inner = x_train[inner_train_index]
            x_test_inner = x_train[inner_test_index]
            y_train_inner = y_train[inner_train_index]
            y_test_inner = y_train[inner_test_index]

            for c in self._c_range:
                async_result[i][c] = inner_pool.apply_async(
                    self._grid_search, (x_train_inner, x_test_inner,
                                        y_train_inner, y_test_inner, c))
        inner_pool.close()
        inner_pool.join()

        best_parameter = self._select_best_parameter(async_result)
        x_test = self._x[test_index]
        y_test = self._y[test_index]

        _, y_hat, auc = self._launch_logistic_reg(x_train, x_test, y_train,
                                                  y_test, best_parameter['c'])

        result = dict()
        result['best_parameter'] = best_parameter
        result['evaluation'] = utils.evaluate_prediction(y_test, y_hat)
        result['y_hat'] = y_hat
        result['y'] = y_test
        result['y_index'] = test_index
        result['auc'] = auc

        return result
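
Note that _launch_logistic_reg unpacks into three values here (classifier, predictions, AUC), one fewer than the random-forest and SVC helpers, and the result stores no training predictions. A hedged sketch consistent with that signature, using scikit-learn (solver settings are assumptions, not the actual implementation):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def launch_logistic_reg_sketch(x_train, x_test, y_train, y_test, c):
    # Illustrative only; the real helper may configure the solver differently.
    clf = LogisticRegression(C=c, max_iter=1000)
    clf.fit(x_train, y_train)
    y_hat = clf.predict(x_test)
    auc = roc_auc_score(y_test, clf.decision_function(x_test))
    return clf, y_hat, auc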
Example #10
def outer_cross_validation(kernel_train,
                           shared_x,
                           train_indices,
                           test_indices,
                           x_test,
                           y_train,
                           y_test,
                           async_res,
                           balanced=False):

    best_c, best_acc = select_best_c(async_res)

    y_hat, auc = launch_svc(kernel_train, x_test, y_train, y_test, best_c,
                            balanced, shared_x, train_indices, test_indices)

    result = dict()
    result['best_c'] = best_c
    result['best_acc'] = best_acc
    result['evaluation'] = evaluate_prediction(y_test, y_hat)
    result['y_hat'] = y_hat
    result['auc'] = auc

    return result
Example #11
    def _compute_average_test_accuracy(self, y_list, yhat_list):

        from clinica.pipelines.machine_learning.svm_utils import evaluate_prediction

        return evaluate_prediction(y_list, yhat_list)['balanced_accuracy']
Example #12
    field = pd.io.parsers.read_csv(
        path.join(tasks_dir, '%s_vs_%s_field_3T.tsv' % (task[0], task[1])),
        sep='\t')

    results = pd.io.parsers.read_csv(path.join(output_dir,
                                               '%s_vs_%s' % (task[0], task[1]),
                                               'test_subjects.tsv'),
                                     sep='\t')

    results['Field_Strength'] = list(
        field.Field_Strength[results.subject_index])

    print(task)
    results15 = results[results.Field_Strength == 1.5]

    print('Mean accuracy 1.5T: ')
    print(utils.evaluate_prediction(list(results15.y), list(results15.y_hat)))

    results3 = results[results.Field_Strength == 3]
    print('Mean accuracy 3T: ')
    print(utils.evaluate_prediction(list(results3.y), list(results3.y_hat)))

# Population stats
print('ADNI 1.5T vs 3T population stats')
path_bids = '/ADNI/BIDS'
tasks_dir = '/ADNI/SUBJECTS/lists_by_task'

for task in tasks:
    print(task)
    print('Subjects 1.5T')
    dx15 = path.join(tasks_dir, '%s_vs_%s_field_1.5T.tsv' % (task[0], task[1]))
    population_stats(path_bids, dx15, 'ADNI')
Example #13
    def evaluate(self, train_index, test_index, top_k=50):

        inner_pool = ThreadPool(self._n_threads)
        async_result = {}
        for i in range(self._grid_search_folds):
            async_result[i] = {}

        ### feature rescaling
        if self._feature_rescaling_method == 'zscore':
            selector = StandardScaler(with_std=self._with_std)
            selector.fit(self._x[train_index])
            x_after = selector.transform(self._x)
        elif self._feature_rescaling_method == 'minmax':
            selector = MinMaxScaler()
            selector.fit(self._x[train_index])
            x_after = selector.transform(self._x)
        elif self._feature_rescaling_method is None:
            x_after = self._x
        else:
            raise ValueError('Feature rescaling method has not been implemented')

        # then do feature selection
        if self._feature_selection_method == 'ANOVA':
            selector = SelectPercentile(f_classif, percentile=top_k)
            selector.fit(x_after[train_index], self._y[train_index])
            x_after = selector.transform(x_after)
        elif self._feature_selection_method == 'RF':
            clf = RandomForestClassifier(n_estimators=250,
                                         random_state=0,
                                         n_jobs=-1)
            clf.fit(x_after[train_index], self._y[train_index])
            selector = SelectFromModel(clf, threshold=top_k)
            selector.fit(x_after[train_index], self._y[train_index])
            x_after = selector.transform(x_after)
        elif self._feature_selection_method == 'PCA':
            selector = PCA(n_components=top_k)
            selector.fit(x_after[train_index])
            x_after = selector.transform(x_after)
        elif self._feature_selection_method == 'RFE':
            svc = SVR(kernel="linear")
            selector = RFE(estimator=svc,
                           n_features_to_select=int(
                               0.01 * top_k * x_after[train_index].shape[1]),
                           step=0.5)
            selector.fit(x_after[train_index], self._y[train_index])
            x_after = selector.transform(x_after)

        self._kernel = utils.gram_matrix_linear(x_after)

        outer_kernel = self._kernel[train_index, :][:, train_index]
        y_train = self._y[train_index]

        skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True)
        inner_cv = list(skf.split(np.zeros(len(y_train)), y_train))

        for i in range(len(inner_cv)):
            inner_train_index, inner_test_index = inner_cv[i]

            inner_kernel = outer_kernel[
                inner_train_index, :][:, inner_train_index]
            x_test_inner = outer_kernel[inner_test_index, :][:,
                                                             inner_train_index]
            y_train_inner, y_test_inner = y_train[inner_train_index], y_train[
                inner_test_index]

            for c in self._c_range:
                async_result[i][c] = inner_pool.apply_async(
                    self._grid_search, (inner_kernel, x_test_inner,
                                        y_train_inner, y_test_inner, c))
                #print i, c, async_result[i][c]
        inner_pool.close()
        inner_pool.join()

        best_parameter = self._select_best_parameter(async_result)
        x_test = self._kernel[test_index, :][:, train_index]
        y_train, y_test = self._y[train_index], self._y[test_index]

        _, y_hat, auc, y_hat_train = self._launch_svc(outer_kernel, x_test,
                                                      y_train, y_test,
                                                      best_parameter['c'])

        result = dict()
        result['best_parameter'] = best_parameter
        result['evaluation'] = utils.evaluate_prediction(y_test, y_hat)
        result['evaluation_train'] = utils.evaluate_prediction(
            y_train, y_hat_train)
        result['y_hat'] = y_hat
        result['y_hat_train'] = y_hat_train
        result['y'] = y_test
        result['y_train'] = y_train
        result['y_index'] = test_index
        result['x_index'] = train_index
        result['auc'] = auc

        return result
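
After rescaling and feature selection, Example #13 rebuilds the kernel on the transformed features with utils.gram_matrix_linear. For a linear kernel the Gram matrix is just the matrix of pairwise dot products, so a one-line sketch (assuming that is indeed what the helper computes):

import numpy as np

def gram_matrix_linear_sketch(x):
    # K[i, j] = <x_i, x_j> for a linear kernel.
    return np.dot(x, x.T)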
Example #14
                    adni_images.get_images(), mask=True)

                weights = np.loadtxt(
                    path.join(adni_classifier_dir, 'weights.txt'))
                w = vbio.revert_mask(weights, adni_data_mask,
                                     adni_orig_shape).flatten()

                b = np.loadtxt(path.join(adni_classifier_dir, 'intersect.txt'))

                x = input_images.get_x()
                y = input_images.get_y()

                y_hat = np.dot(w, x.transpose()) + b

                y_binary = (y_hat > 0) * 1.0

                evaluation = utils.evaluate_prediction(y, y_binary)

                auc = roc_auc_score(y, y_hat)
                evaluation['AUC'] = auc

                print(evaluation)

                del evaluation['confusion_matrix']

                res_df = pd.DataFrame(evaluation, index=['i'])
                res_df.to_csv(path.join(classification_dir, 'results_auc.tsv'),
                              sep='\t')
Example #15
def svm_binary_classification(input_image_atlas,
                              subjects_visits_tsv,
                              image_list,
                              diagnosis_list,
                              output_directory,
                              kernel_function=None,
                              existing_gram_matrix=None,
                              mask_zeros=True,
                              scale_data=False,
                              balanced=False,
                              outer_folds=10,
                              inner_folds=10,
                              n_threads=10,
                              c_range=np.logspace(-10, 2, 1000),
                              save_gram_matrix=False,
                              save_subject_classification=False,
                              save_dual_coefficients=False,
                              scaler=None,
                              data_mask=None,
                              save_original_weights=False,
                              save_features_image=True):

    if (kernel_function is None and existing_gram_matrix is None) or (
            kernel_function is not None and existing_gram_matrix is not None):
        raise ValueError(
            'Kernel_function and existing_gram_matrix are mutually exclusive parameters.'
        )

    results = dict()
    dx_filter = np.unique(diagnosis_list)

    print('Loading ' + str(len(image_list)) + ' subjects')
    x0 = load_data(image_list, subjects_visits_tsv)
    print('Subjects loaded')
    if scale_data:
        x_all = scale(x0)
    else:
        x_all = x0

    if existing_gram_matrix is None:
        if kernel_function is not None:
            print('Calculating Gram matrix')
            gram_matrix = kernel_function(x_all)
            print('Gram matrix calculated')
        else:
            raise ValueError(
                'If a Gram matrix is not provided a function to calculate it (kernel_function) is a required input.'
            )
    else:
        gram_matrix = existing_gram_matrix
        if (gram_matrix.shape[0] != gram_matrix.shape[1]) or (
                gram_matrix.shape[0] != len(image_list)):
            raise ValueError(
                'The existing Gram matrix must be a square matrix with number of rows and columns equal to the number of images.'
            )

    if save_gram_matrix:
        np.savetxt(join(output_directory, 'gram_matrix.txt'), gram_matrix)

    shared_x = sharedmem.copy(x_all)
    x_all = None
    gc.collect()

    for i in range(len(dx_filter)):
        for j in range(i + 1, len(dx_filter)):
            print(j)
            dx1 = dx_filter[i]
            dx2 = dx_filter[j]

            ind1 = []
            ind2 = []
            for k in range(len(diagnosis_list)):
                if diagnosis_list[k] == dx1:
                    ind1.append(k)
                if diagnosis_list[k] == dx2:
                    ind2.append(k)

            indices = ind1 + ind2

            current_subjects = [image_list[k] for k in indices]
            current_diagnosis = [diagnosis_list[k] for k in indices]

            y = np.array([0] * len(ind1) + [1] * len(ind2))
            gm = gram_matrix[indices, :][:, indices]

            classification_str = dx1 + '_vs_' + dx2 + ('_balanced' if balanced
                                                       else '_not_balanced')
            print('Running ' + dx1 + ' vs ' + dx2 + ' classification')

            y_hat, dual_coefficients, sv_indices, intersect, c, auc = cv_svm(
                gm,
                shared_x,
                np.array(indices),
                y,
                c_range,
                balanced=balanced,
                outer_folds=outer_folds,
                inner_folds=inner_folds,
                n_threads=n_threads)

            evaluation = evaluate_prediction(y, y_hat)
            evaluation['auc'] = auc

            print('\nTrue positive %d' % len(evaluation['predictions'][0]))
            print('True negative %d' % len(evaluation['predictions'][1]))
            print('False positive %d' % len(evaluation['predictions'][2]))
            print('False negative %d' % len(evaluation['predictions'][3]))

            print('AUC %0.2f' % auc)
            print('Accuracy %0.2f' % evaluation['accuracy'])
            print('Balanced accuracy %0.2f' % evaluation['balanced_accuracy'])
            print('Sensitivity %0.2f' % evaluation['sensitivity'])
            print('Specificity %0.2f' % evaluation['specificity'])
            print('Positive predictive value %0.2f' % evaluation['ppv'])
            print('Negative predictive value %0.2f \n' % evaluation['npv'])

            if save_dual_coefficients:
                np.save(
                    join(output_directory,
                         classification_str + '__dual_coefficients'),
                    dual_coefficients[0])
                np.save(
                    join(output_directory,
                         classification_str + '__sv_indices'), sv_indices)
                np.save(
                    join(output_directory, classification_str + '__intersect'),
                    intersect)

            if save_original_weights or save_features_image:
                weights_orig = features_weights(current_subjects,
                                                dual_coefficients[0],
                                                sv_indices, scaler, data_mask)

            if save_original_weights:
                np.save(
                    join(output_directory, classification_str + '__weights'),
                    weights_orig)

            if save_features_image:
                output_image = weights_to_nifti(input_image_atlas,
                                                weights_orig)
                output_image.to_filename(
                    join(output_directory,
                         classification_str + '__weights.nii'))

            if save_subject_classification:
                save_subjects_prediction(
                    current_subjects, current_diagnosis, y, y_hat,
                    join(output_directory,
                         classification_str + '__subjects.tsv'))

            results[(dx1, dx2)] = evaluation  # evaluate_prediction(y, y_hat)

    results_to_tsv(
        results, dx_filter,
        join(
            output_directory, 'resume' +
            ('_balanced' if balanced else '_not_balanced') + '.tsv'))
    shared_x = None
    gc.collect()
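
A hedged usage sketch for svm_binary_classification; every path, filename, and list below is hypothetical, and the lambda stands in for any kernel_function that maps an (n_subjects, n_features) array to an (n_subjects, n_subjects) Gram matrix:

import numpy as np

# Hypothetical inputs for illustration only.
svm_binary_classification(
    input_image_atlas='atlas.nii',
    subjects_visits_tsv='subjects.tsv',
    image_list=['sub-01.nii', 'sub-02.nii', 'sub-03.nii', 'sub-04.nii'],
    diagnosis_list=['CN', 'AD', 'CN', 'AD'],
    output_directory='./results',
    kernel_function=lambda x: np.dot(x, x.T),  # linear kernel, assumed
    balanced=True,
    outer_folds=2,
    inner_folds=2,
    n_threads=2,
    save_features_image=False)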
Example #16
    def evaluate(self, train_index, test_index, top_k):

        inner_pool = ThreadPool(self._n_threads)
        async_result = {}
        for i in range(self._grid_search_folds):
            async_result[i] = {}
	
        if self._feature_selection_method == 'ANOVA':
            selector = SelectPercentile(f_classif, percentile=top_k)
            selector.fit(self._x[train_index], self._y[train_index])
        elif self._feature_selection_method == 'RF':
            clf = RandomForestClassifier(n_estimators=250, random_state=0,
                                         n_jobs=-1)
            clf.fit(self._x[train_index], self._y[train_index])
            selector = SelectFromModel(clf, threshold=top_k)
            selector.fit(self._x[train_index], self._y[train_index])
        elif self._feature_selection_method == 'PCA':
            selector = PCA(n_components=top_k)
            selector.fit(self._x[train_index])
        elif self._feature_selection_method == 'RFE':
            svc = SVR(kernel="linear")
            selector = RFE(estimator=svc,
                           n_features_to_select=int(
                               0.01 * top_k * self._x[train_index].shape[1]),
                           step=0.5)
            selector.fit(self._x[train_index], self._y[train_index])
        else:
            # Raising here keeps selector from being used while undefined.
            raise ValueError('Method has not been implemented')

        x_after_fs = selector.transform(self._x)
        #indices_fs_train = selector.get_support()
        #x_after_fs = self._x[:, indices_fs_train]
        print('In total, there are %d voxels in this task'
              % self._x[train_index].shape[1])
        print('The threshold is %s' % str(top_k))
        print('We select the %d most discriminative voxels'
              % x_after_fs.shape[1])

        self._kernel = utils.gram_matrix_linear(x_after_fs)

        outer_kernel = self._kernel[train_index, :][:, train_index]
        y_train = self._y[train_index]

        skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True)
        inner_cv = list(skf.split(np.zeros(len(y_train)), y_train))

        for i in range(len(inner_cv)):
            inner_train_index, inner_test_index = inner_cv[i]

            inner_kernel = outer_kernel[inner_train_index, :][:, inner_train_index]
            x_test_inner = outer_kernel[inner_test_index, :][:, inner_train_index]
            y_train_inner, y_test_inner = y_train[inner_train_index], y_train[inner_test_index]

            for c in self._c_range:
                async_result[i][c] = inner_pool.apply_async(self._grid_search,
                                                            (inner_kernel, x_test_inner,
                                                             y_train_inner, y_test_inner, c))
                #print i, c, async_result[i][c]
        inner_pool.close()
        inner_pool.join()

        best_parameter = self._select_best_parameter(async_result)
        x_test = self._kernel[test_index, :][:, train_index]
        y_train, y_test = self._y[train_index], self._y[test_index]

        _, y_hat, auc, y_hat_train = self._launch_svc(outer_kernel, x_test, y_train, y_test, best_parameter['c'])

        result = dict()
        result['best_parameter'] = best_parameter
        result['evaluation'] = utils.evaluate_prediction(y_test, y_hat)
        result['evaluation_train'] = utils.evaluate_prediction(y_train, y_hat_train)
        result['y_hat'] = y_hat
        result['y_hat_train'] = y_hat_train
        result['y'] = y_test
        result['y_train'] = y_train
        result['y_index'] = test_index
        result['x_index'] = train_index
        result['auc'] = auc

        return result