Example #1
0
    def tpot_model_analysis(random_seed, save_path):
        """Evaluate the saved TPOT pipeline for one random seed.

        Loads the dumped TPOT result and the pickled dataset split from the
        ``random_seed_XXX`` sub-folder of *save_path*, prints the MAE on the
        test and training splits, plots predicted vs. true values for the test
        split and prints Spearman, R2 and Pearson statistics for it.

        The dataset name, number of generations and debug flag are read from
        the ``args`` object of the enclosing scope.

        Returns
        -------
        tuple
            ``(mae_test, r_test)``: the test-set mean absolute error and the
            Pearson correlation between true and predicted test targets.
        """
        seed_dir = os.path.join(save_path, 'random_seed_%03d' %random_seed)
        print('Random seed: %03d' %random_seed)

        # Fetch the fitted pipeline stored inside the TPOT dump
        dump_name = ('tpot_%s_vanilla_combi_%03dgen.dump'
                     %(args.dataset, args.generations))
        fitted_pipeline = joblib.load(os.path.join(seed_dir, dump_name))['fitted_pipeline']

        # The returned paths are not used here; the call is kept as in the
        # original flow.
        _, _, _ = get_paths(args.debug, args.dataset)

        # Fetch the train/test split saved next to the model
        split_file = os.path.join(seed_dir,
                                  'splitted_dataset_%s.pickle' %args.dataset)
        with open(split_file, 'rb') as split_handle:
            split = pickle.load(split_handle)

        # Mean absolute error on both splits
        print('Print MAE - test')
        test_pred = fitted_pipeline.predict(split['Xtest_scaled'])
        mae_test = mean_absolute_error(split['Ytest'], test_pred)
        print(mae_test)
        print('Print MAE - training')
        train_pred = fitted_pipeline.predict(split['Xtrain_scaled'])
        print(mean_absolute_error(split['Ytrain'], train_pred))

        # Scatter plot of predicted against true values (test split)
        test_true = split['Ytest']
        plot_predicted_vs_true(
            test_true,
            test_pred,
            os.path.join(seed_dir, 'test_predicted_true_age.eps'),
            'Age')

        # Agreement statistics on the test split
        from scipy.stats import spearmanr, pearsonr
        from sklearn.metrics import r2_score

        spearman_stats = spearmanr(test_true, test_pred)
        rho_test, rho_p_value_test = spearman_stats
        print('Statistics for the test dataset')
        print('shape of the dataset: %s' %(test_true.shape,))
        print('Rho and p-value: %.4f %.4f' %(rho_test, rho_p_value_test))

        print('R2 is: %.4f' %r2_score(test_true, test_pred))

        r_test, _ = pearsonr(test_true, test_pred)
        print('R is: %.4f' %r_test)
        return mae_test, r_test
def rvc_analysis(random_seed, save_path):
    """Train a linear-kernel RVC on the saved split and evaluate it.

    The pickled dataset split is read from the ``random_seed_XXX`` sub-folder
    of *save_path*. The model is fitted on the training split and used to
    predict the test and validation splits. A normalised confusion matrix is
    plotted and saved for each of them and the accuracy is computed.

    The module-level names ``debug`` and ``dataset`` are read to locate the
    data.

    Parameters
    ----------
    random_seed : int
        Seed identifying the ``random_seed_XXX`` folder to read from / write to.
    save_path : str
        Parent folder containing the per-seed sub-folders.

    Returns
    -------
    tuple
        ``(cm_test, cm_validation, accuracy_test, accuracy_val)``.
    """
    # Load the data
    # TODO: change the path
    save_path = os.path.join(save_path, 'random_seed_%03d' % random_seed)
    print('Random seed: %03d' % random_seed)
    # Load the saved dataset split. The paths returned by get_paths are not
    # used in this function; the call is kept from the original flow.
    project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, dataset)
    with open(os.path.join(save_path, 'splitted_dataset_%s.pickle' % dataset),
              'rb') as handle:
        splitted_dataset = pickle.load(handle)

    # Train the model
    model = RVC(kernel='linear')
    model.fit(splitted_dataset['Xtrain_scaled'], splitted_dataset['Ytrain'])

    # Predict the held-out test and validation splits (no cross-validation
    # is performed here).
    print('Perform prediction in test data')
    y_prediction_test = model.predict(splitted_dataset['Xtest_scaled'])
    y_prediction_validation = model.predict(splitted_dataset['Xvalidate_scaled'])

    # -------------------------------------------------------------------------
    # Do some statistics. Calculate the confusion matrix
    class_name = np.array(['young', 'old', 'adult'], dtype='U10')

    # Test dataset: confusion matrix and accuracy. The figure is saved right
    # after it is drawn, while it is still the current pyplot figure.
    _, cm_test = plot_confusion_matrix(splitted_dataset['Ytest'],
                                       y_prediction_test,
                                       classes=class_name,
                                       normalize=True)
    plt.savefig(os.path.join(save_path, 'confusion_matrix_test_rvc.eps'))
    accuracy_test = accuracy_score(splitted_dataset['Ytest'], y_prediction_test)

    # Validation dataset: confusion matrix and accuracy.
    _, cm_validation = plot_confusion_matrix(splitted_dataset['Yvalidate'],
                                             y_prediction_validation,
                                             classes=class_name,
                                             normalize=True)
    plt.savefig(os.path.join(save_path, 'confusion_matrix_validation_rvc.eps'))
    accuracy_val = accuracy_score(splitted_dataset['Yvalidate'],
                                  y_prediction_validation)
    # NOTE: the original code saved the current figure once more under the
    # *test* file name at this point, which replaced the test confusion matrix
    # with the validation plot. That extra save has been removed.
    return cm_test, cm_validation, accuracy_test, accuracy_val
Example #3
0
        'MaskVol.to.eTIV',
        'EstimatedTotalIntraMaCranialVol'
    ]
    # Check that you have all the features from the BANC dataset
    assert (len(biobank_columns) == len(freesurfer_banc_columns))

    renameCols = dict(zip(biobank_columns, freesurfer_banc_columns))

    return lh_thickness, rh_thickness, renameCols


# Load both datasets
# Module-level settings shared by the get_paths / get_data calls below.
debug = False
resamplefactor = 1
# Folder built from the project root and the 'freesurfer_preprocess' name;
# it is only assigned here, not used in this span.
save_path = os.path.join('/code/BayOptPy', 'freesurfer_preprocess')
# Resolve the working and data directories for each dataset; the third value
# returned by get_paths is discarded.
project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, 'UKBIO_freesurf')
project_banc_wd, project_data_banc, _ = get_paths(debug, 'BANC_freesurf')
# Load the raw (raw=True) BANC data; only the third returned value is kept.
_, _, freesurfer_df_banc = get_data(project_data_banc,
                                    'BANC_freesurf',
                                    debug,
                                    project_banc_wd,
                                    resamplefactor,
                                    raw=True,
                                    analysis=None)
# Load the raw (raw=True) UKBIO data with the same settings.
_, _, freesurfer_df_ukbio = get_data(project_data_ukbio,
                                     'UKBIO_freesurf',
                                     debug,
                                     project_ukbio_wd,
                                     resamplefactor,
                                     raw=True,
                                     analysis=None)
def rvr_analysis(random_seed, save_path, n_folds, analysis):
    """Cross-validate a linear-kernel RVR and plot a final predicted-vs-true fit.

    The pickled dataset split is read from the ``random_seed_XXX`` sub-folder
    of *save_path*. Its ``Xtest_scaled`` / ``Ytest`` entries are used as the
    feature matrix and target for a K-fold cross-validation, during which the
    MAE, the Pearson correlation and the train / predict CPU times are
    collected per fold. Afterwards a new model is fitted on a single
    train/test split of the same data and its predictions on the held-out
    part are plotted against the true values.

    The module-level names ``debug`` and ``dataset`` are read to locate the
    data.

    Parameters
    ----------
    random_seed : int
        Seed for the K-fold shuffling and the final train/test split; also
        selects the ``random_seed_XXX`` folder.
    save_path : path-like supporting the ``/`` operator
        Parent folder containing the per-seed sub-folders.
    n_folds : int
        Number of cross-validation folds.
    analysis : str
        ``'vanilla_combi'`` (85% held out in the final split) or
        ``'uniform_dist'`` (20% held out in the final split).

    Returns
    -------
    tuple
        ``(mae_cv, r_test, t_time_train, t_time_test)`` where ``mae_cv`` is an
        ``(n_folds, 1)`` array of per-fold MAE, ``r_test`` is the Pearson
        correlation of the LAST fold only (kept for backward compatibility
        with existing callers), and the two lists hold the per-fold training
        and prediction times in seconds.

    Raises
    ------
    ValueError
        If *analysis* is not one of the supported values. Previously this
        surfaced as a ``NameError`` only after the whole cross-validation
        had run.
    """
    if analysis not in ('vanilla_combi', 'uniform_dist'):
        raise ValueError(
            "analysis must be 'vanilla_combi' or 'uniform_dist', got %r"
            % (analysis,))

    save_path = save_path / ('random_seed_%03d' % random_seed)
    print('Random seed: %03d' % random_seed)
    # Load the saved dataset split. The paths returned by get_paths are not
    # used in this function; the call is kept from the original flow.
    project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, dataset)
    with open(save_path / ('splitted_dataset_%s.pickle' % dataset),
              'rb') as handle:
        splitted_dataset = pickle.load(handle)

    # random_state only has an effect when shuffle=True; recent scikit-learn
    # versions raise a ValueError if it is set without shuffling.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    mae_cv = np.zeros((n_folds, 1))
    pearsons_corr = np.zeros((n_folds, 1))

    # Set target and features
    # (assumes both are numpy arrays, since they are indexed positionally)
    x = splitted_dataset['Xtest_scaled']
    y = splitted_dataset['Ytest']

    t_time_train = []
    t_time_test = []

    for i_fold, (train_idx, test_idx) in enumerate(kf.split(x, y)):
        x_train, x_test = x[train_idx, :], x[test_idx, :]
        y_train, y_test = y[train_idx], y[test_idx]

        print('CV iteration: %d' % (i_fold + 1))
        print('Shape of the trainig and test dataset')
        print(y_train.shape, y_test.shape)

        # train the model
        model = RVR(kernel='linear')
        cv_time_train = time.process_time()
        model.fit(x_train, y_train)
        elapsed_time = time.process_time() - cv_time_train
        print('CV - Elapased time in seconds to train:')
        t_time_train.append(elapsed_time)
        print('%.03f' % elapsed_time)

        # test the model
        cv_time_test = time.process_time()
        y_predicted = model.predict(x_test)
        elapsed_time = time.process_time() - cv_time_test
        t_time_test.append(elapsed_time)
        print('CV - Elapased time in seconds to test:')
        print('%.03f' % elapsed_time)

        mae_cv[i_fold, :] = mean_absolute_error(y_test, y_predicted)
        # now look at the pearson's correlation (the p-value is not used)
        r_test, _ = pearsonr(y_test, y_predicted)
        pearsons_corr[i_fold, :] = r_test

    print('CV results')
    print('MAE: Mean(SD) = %.3f(%.3f)' % (mae_cv.mean(), mae_cv.std()))
    # Summarise the correlation over ALL folds, not just the last one.
    print('Pearson\'s Correlation: Mean(SD) = %.3f(%.3f)' %
          (pearsons_corr.mean(), pearsons_corr.std()))
    # First pair of lines: training time; second pair: prediction time.
    print('Mean CV time: %.3f s ' % np.mean(t_time_train))
    print('SD CV time: %.3f s' % np.std(t_time_train))
    print('Mean CV time: %.3f s ' % np.mean(t_time_test))
    print('SD CV time: %.3f s' % np.std(t_time_test))
    print('')

    # Final single split; the held-out fraction depends on the analysis.
    if analysis == 'vanilla_combi':
        x_train_all, x_test_all, y_train_all, y_test_all = \
                train_test_split(x, y, test_size=.85, random_state=random_seed)
        print('All: Shape of the trainig and test dataset')
    else:  # 'uniform_dist' (validated at the top of the function)
        x_train_all, x_test_all, y_train_all, y_test_all = \
                train_test_split(x, y, test_size=.20,  random_state=random_seed)
        print('ALL: Shape of the trainig and test dataset')
    print(y_train_all.shape, y_test_all.shape)

    print('Training RVR model:')
    model_all = RVR(kernel='linear')
    model_all.fit(x_train_all, y_train_all)
    # plot predicted vs true for the held-out part of the final split, using
    # the model that was just fitted (not the model of the last CV fold)
    print('Plotting Predicted Vs True Age for all the sample')
    y_predicted_test = model_all.predict(x_test_all)
    output_path_test = save_path / (
        'rvr_test_predicted_true_age_rnd_seed%d.eps' % random_seed)
    plot_predicted_vs_true(y_test_all, y_predicted_test, output_path_test,
                           'Age')

    return mae_cv, r_test, t_time_train, t_time_test
            from BayOptPy.tpot.dicts.gpr_tpot_config_feat_selec import tpot_config_gpr
            tpot_config = tpot_config_gpr
        elif args.config_dict == 'feat_combi':
            # Load models for feature combination
            from BayOptPy.tpot.dicts.gpr_tpot_config_feat_combi import tpot_config_gpr
            tpot_config = tpot_config_gpr
        elif args.config_dict == 'vanilla_combi':
            # Load models for the vanilla combination
            from BayOptPy.tpot.dicts.gpr_tpot_config_vanilla_combi import tpot_config_gpr
            tpot_config = tpot_config_gpr

    print('-----------------------------------------------------------------')
    print('Get datapaths:')
    print('-----------------------------------------------------------------')
    # Get data paths, the actual data and check if the output paths exists
    project_wd, project_data, project_sink = get_paths(args.debug,
                                                       args.dataset)
    output_path = get_output_path(args.model, args.analysis, args.generations,
                                  args.random_seed, args.population_size,
                                  args.debug, args.mutation_rate,
                                  args.crossover_rate,
                                  args.predicted_attribute)
    # Load the already cleaned dataset
    demographics, imgs, dataframe = get_data(project_data,
                                             args.dataset,
                                             args.debug,
                                             project_wd,
                                             args.resamplefactor,
                                             raw=str_to_bool(args.raw),
                                             analysis=args.analysis)
    print('Using %d features' % len(dataframe.columns))
Example #6
0
# Settings for this evaluation run. Note that the seed and the number of
# generations are also hard-coded inside save_path and the dump file name
# below, so they have to be changed together.
dataset = 'freesurf_combined'
debug = False
resamplefactor = 1
random_seed = 20
save_path = '/code/BayOptPy/tpot/Output/vanilla_combi/100_generations/random_seed_020/'
# Load the clean data for both the UKBIO and the BANC analysis
# This version of the UKBIOBANK dataset contains the same columns as the BANC
# dataset

# Load the saved trained model
tpot = joblib.load(
    os.path.join(save_path, 'tpot_%s_vanilla_combi_100gen.dump' % dataset))
# The dump is indexed like a dict; the fitted pipeline is stored under the
# 'fitted_pipeline' key.
exported_pipeline = tpot['fitted_pipeline']

# Load the saved validation dataset
# (the paths returned by get_paths are not used in the visible part of the
# script)
project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, dataset)
with open(os.path.join(save_path, 'splitted_dataset.pickle'), 'rb') as handle:
    splitted_dataset = pickle.load(handle)

# Print some results
# Mean absolute error of the fitted pipeline on the test split
print('Print MAE - test')
y_predicted_test = exported_pipeline.predict(splitted_dataset['Xtest_scaled'])
mae = mean_absolute_error(splitted_dataset['Ytest'], y_predicted_test)
print(mae)
# Mean absolute error of the fitted pipeline on the training split
print('Print MAE - training')
y_predicted_train = exported_pipeline.predict(
    splitted_dataset['Xtrain_scaled'])
mae_train = mean_absolute_error(splitted_dataset['Ytrain'], y_predicted_train)
print(mae_train)
print('Print MAE - validation')
y_predicted_validation = exported_pipeline.predict(
Example #7
0
    if s == 'True':
        return True
    elif s == 'False':
        return False


#-----------------------------------------------------------------------------
# Settings
#-----------------------------------------------------------------------------
# Module-level settings passed to get_paths / get_data below.
debug = False
dataset = 'freesurf_combined'
resamplefactor = 1
# Only assigned here; not used in the visible part of the script.
save_path = os.path.join('/code/BayOptPy', 'freesurfer_preprocess')
# Kept as a string and converted with str_to_bool when calling get_data.
raw = 'False'
analysis = 'uniform'
project_wd, project_data, project_sink = get_paths(debug, dataset)

# Load the demographics, the images and the feature dataframe for the dataset
demographics, imgs, dataframe = get_data(project_data,
                                         dataset,
                                         debug,
                                         project_wd,
                                         resamplefactor,
                                         raw=str_to_bool(raw),
                                         analysis=analysis)

# transform age into ints
# (adds a new 'age_int' column; the original 'age' column is left untouched)
demographics['age_int'] = demographics['age'].astype('int32', copy=False)

# Select 14 subjects for all ages that have 14 representatives.
# np.arange excludes its stop value, so the maximum age is not part of
# age_range. The range is also built from the 'age' column rather than from
# 'age_int'.
# NOTE(review): confirm that both of these are intended.
age_range = np.arange(demographics['age'].min(), demographics['age'].max())
# remove entry where you don't have 14 subjects
# Folder (relative to the current working directory) that is handed to TPOT
# as its periodic checkpoint folder.
cwd = os.getcwd()
best_pipe_paths = os.path.join(cwd, 'BayOptPy/tpot')
# NOTE(review): no directory is created here; best_pipe_paths is only passed
# on to the regressor below.
# tpot_config, scoring and the X_/y_ train/test arrays are expected to be
# defined earlier in the script (not visible in this span).
tpot = ExtendedTPOTRegressor(generations=5,
                     population_size=50,
                     verbosity=2,
                     random_state=42,
                     config_dict=tpot_config,
                     periodic_checkpoint_folder=best_pipe_paths,
                     scoring=scoring
                     )
# Unlike the standard fit(X, y) signature, the test data is passed to fit too.
tpot.fit(X_train, y_train, X_test, y_test)
print('Test score using optimal model: %f ' %tpot.score(X_test, y_test))

# get paths of where to save the files
# NOTE(review): this reads args.debug, so an `args` object must exist at this
# point — confirm against the part of the script not shown here.
project_wd, _, _ = get_paths(args.debug, dataset)
tpot.export(os.path.join(project_wd, 'BayOptPy', 'tpot', 'debug',
                        'tpot_boston_pipeline_super.py'))

# Do some preprocessing to find models where all predictions have the same value and eliminate them, as those will correspond
# to NaN entries or very small numbers on the correlation matrix.
# A model is flagged when its prediction vector equals its own first element
# repeated len(predictions) times; np.argwhere then returns the indices of
# the flagged models.
repeated_idx = np.argwhere([np.array_equal(np.repeat(tpot.predictions[i][0], len(tpot.predictions[i])), tpot.predictions[i]) for i in range(len(tpot.predictions))])
print('Index of the models with the same prediction for all subjects: ' + str(np.squeeze(repeated_idx)))
print('Number of models analysed: %d' %len(tpot.predictions))
# Drop the flagged models (rows) before computing the correlations
tpot_predictions = np.delete(np.array(tpot.predictions), np.squeeze(repeated_idx), axis=0)
print('Number of models that will be used for cross-correlation: %s' %(tpot_predictions.shape,))

# Cross correlate the predictions
# (np.corrcoef treats each row, i.e. each remaining model, as one variable)
corr_matrix = np.corrcoef(tpot_predictions)

# Sanity check: count the NaN entries left in the correlation matrix
print('Check the number of NaNs after deleting models with constant predictions: %d' %len(np.argwhere(np.isnan(corr_matrix))))
Example #9
0
"""
This script tests the best model recommended by the combined dataset (UKBIO +
BANC) for 100 generations, random seed 20, initial population 1000, mutation
rate and cross-validation rate 0.9 and cross-over 0.1
"""

# General Settings
#-------------------------------------------------------------------------------
debug = False
resamplefactor = 1
random_seed = 20
# Output folder of the run being tested; the seed is formatted into the path.
save_path = '/code/BayOptPy/tpot/Output/random_seed/100_generations/random_seed_%03d/' %(random_seed)
# Load the combined dataset
# (cleaned data: raw=False, no specific analysis; the second value returned
# by get_data is discarded)
project_wd, project_data, _ = get_paths(debug, 'freesurf_combined')
demographics, _, df_data =  \
             get_data(project_data, 'freesurf_combined', debug,
                      project_wd, resamplefactor, raw=False, analysis=None)
# Drop the 'dataset' column, which corresponds to the name of the dataset
df_data = df_data.drop('dataset', axis=1)
#-------------------------------------------------------------------------------
# Train the model with BANC
#-------------------------------------------------------------------------------
# The target is taken BEFORE demographics is re-indexed by 'id', so
# targetAttribute keeps the original index while demographics does not.
# NOTE(review): confirm the row order still lines up with df_data where these
# are used together.
targetAttribute = demographics[['age']]
demographics = demographics.set_index('id')
# Add a few of the BIOBANK Dataset into the training set
Xtrain, Xtemp, Ytrain, Ytemp = train_test_split(df_data, targetAttribute,
                                                test_size=.90,
                                                stratify=demographics['stratify'],