Ejemplo n.º 1
0
    renameCols = dict(zip(biobank_columns, freesurfer_banc_columns))

    return lh_thickness, rh_thickness, renameCols


# Load both datasets
# Settings shared by both loads below.
debug = False
resamplefactor = 1
save_path = os.path.join('/code/BayOptPy', 'freesurfer_preprocess')
# Resolve the paths for each dataset; the third value returned by get_paths is
# discarded here.
project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, 'UKBIO_freesurf')
project_banc_wd, project_data_banc, _ = get_paths(debug, 'BANC_freesurf')
# Load each dataset with raw=True and keep only the third value returned by
# get_data (the first two are discarded).
# NOTE(review): raw=True presumably requests the uncleaned data -- confirm
# against get_data.
_, _, freesurfer_df_banc = get_data(project_data_banc,
                                    'BANC_freesurf',
                                    debug,
                                    project_banc_wd,
                                    resamplefactor,
                                    raw=True,
                                    analysis=None)
_, _, freesurfer_df_ukbio = get_data(project_data_ukbio,
                                     'UKBIO_freesurf',
                                     debug,
                                     project_ukbio_wd,
                                     resamplefactor,
                                     raw=True,
                                     analysis=None)

# Check the columns between both datasets
# First print the size of each dataset
print('shape of the banc dataset; shape of the ukbio dataset')
print(freesurfer_df_banc.shape, freesurfer_df_ukbio.shape)
seed 20, initial population 1000, mutation rate and cross-validation ratexxx
"""

# NOTE(review): presumably configures the plotting style used for figures --
# confirm against the definition of set_publication_style.
set_publication_style()
# General Settings
#-------------------------------------------------------------------------------
debug = False
resamplefactor = 1
random_seed = 20
save_path = '/code/BayOptPy/'
# Load the clean data for both the UKBIO and the BANC analysis
# This version of the UKBIOBANK dataset contains the same columns as the BANC
# dataset
project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, 'UKBIO_freesurf')
# Only the third value returned by get_data is kept for UKBIO.
_, _, df_ukbio =  \
             get_data(project_data_ukbio, 'UKBIO_freesurf', debug,
                      project_ukbio_wd, resamplefactor, raw=False, analysis=None)
# Use the 'id' column as the index of the UKBIO dataframe.
df_ukbio = df_ukbio.set_index('id')
# Drop the last column, which corresponds to the name of the dataset
df_ukbio = df_ukbio.drop('dataset', axis=1)

project_banc_wd, project_banc_data, _ = get_paths(debug, 'BANC_freesurf')
# For BANC the first value (demographics) is kept as well as the dataframe.
# NOTE(review): unlike df_ukbio, df_banc is not re-indexed by 'id' here --
# confirm whether get_data already returns it indexed as required.
demographics_banc, __, df_banc = get_data(project_banc_data,
                                          'BANC_freesurf',
                                          debug,
                                          project_banc_wd,
                                          resamplefactor,
                                          raw=False,
                                          analysis=None)
# Drop the last column, which corresponds to the name of the dataset
df_banc = df_banc.drop('dataset', axis=1)
    print('-----------------------------------------------------------------')
    print('Get datapaths:')
    print('-----------------------------------------------------------------')
    # Get data paths, the actual data and check if the output paths exists
    project_wd, project_data, project_sink = get_paths(args.debug,
                                                       args.dataset)
    output_path = get_output_path(args.model, args.analysis, args.generations,
                                  args.random_seed, args.population_size,
                                  args.debug, args.mutation_rate,
                                  args.crossover_rate,
                                  args.predicted_attribute)
    # Load the already cleaned dataset
    demographics, imgs, dataframe = get_data(project_data,
                                             args.dataset,
                                             args.debug,
                                             project_wd,
                                             args.resamplefactor,
                                             raw=str_to_bool(args.raw),
                                             analysis=args.analysis)
    print('Using %d features' % len(dataframe.columns))

    # If we are looking at the uniform distribution, get the corresponding
    # dataset
    if args.analysis == 'uniform_dist':
        demographics, dataframe = get_uniform_dist_data(
            args.debug, args.dataset, args.resamplefactor,
            str_to_bool(args.raw), args.analysis)

    if args.dataset == 'freesurf_combined' or args.dataset == 'UKBIO_freesurf':
        # Drop the last column, which corresponds to the dataset name
        dataframe = dataframe.drop(['dataset'], axis=1)
Ejemplo n.º 4
0
#-----------------------------------------------------------------------------
# Settings
#-----------------------------------------------------------------------------
debug = False
dataset = 'freesurf_combined'
resamplefactor = 1
save_path = os.path.join('/code/BayOptPy', 'freesurfer_preprocess')
# raw is kept as a string and converted with str_to_bool when get_data is
# called below.
raw = 'False'
analysis = 'uniform'
project_wd, project_data, project_sink = get_paths(debug, dataset)

# Load the demographics, images and dataframe for the selected dataset.
demographics, imgs, dataframe = get_data(project_data,
                                         dataset,
                                         debug,
                                         project_wd,
                                         resamplefactor,
                                         raw=str_to_bool(raw),
                                         analysis=analysis)

# Store an int32 version of the age in a new 'age_int' column
demographics['age_int'] = demographics['age'].astype('int32', copy=False)

# Select 14 subjects for all ages that have 14 representatives.
# np.arange excludes its stop value, so the maximum age itself is not part of
# age_range.
# NOTE(review): the range is built from the 'age' column rather than
# 'age_int'; presumably the ages are whole numbers here -- confirm.
age_range = np.arange(demographics['age'].min(), demographics['age'].max())
# Remove the ages for which there are fewer than 14 subjects
max_n = 14
# NOTE(review): hard-coded list of ages; presumably determined from this
# dataset beforehand -- verify if the data change.
age_to_remove = [35, 36, 39, 42, 78, 79, 80, 81, 82, 83, 85, 89]
age_range = np.setdiff1d(age_range, age_to_remove)
# Iterate over the dataframe and select 14 subjects for each age
ids_to_use = []
Ejemplo n.º 5
0
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct

from BayOptPy.helperfunctions import get_data, get_paths

# Settings
debug = False
dataset = 'BANC'
resamplefactor = 1

# NOTE(review): random_seed is defined but not passed to any call in the
# visible part of this script -- confirm whether it is used further down.
random_seed = 42

# Resolve the project paths and load the data. get_data is called here without
# the raw/analysis keyword arguments, so its defaults apply.
project_wd, project_data, project_sink = get_paths(debug, dataset)
demographics, imgs, data = get_data(project_data, dataset, debug, project_wd,
                                    resamplefactor)

# Regression target: the 'Age' column of the demographics as a NumPy array.
# NOTE(review): the original comment here mentioned concatenating GM and WM fsl
# data, which none of the visible code does -- confirm whether that happens
# inside get_data.
targetAttribute = np.array(demographics['Age'])

# Gaussian process regressor with a dot-product kernel, fitted and scored with
# 10-fold cross-validation on 4 parallel jobs.
# NOTE(review): sigma_0=0 lies outside DotProduct's default sigma_0_bounds
# (1e-5, 1e5) -- confirm that hyperparameter optimisation copes with this.
kernel = DotProduct(sigma_0=0)
gp2 = GaussianProcessRegressor(kernel=kernel, normalize_y=False)
# The scoring is the *negated* mean absolute error, so the reported scores are
# expected to be <= 0.
cv_results2 = cross_validate(gp2,
                             data,
                             targetAttribute,
                             scoring='neg_mean_absolute_error',
                             cv=10,
                             n_jobs=4)
# Report the cross-validation results
print('The MAE are:')
Ejemplo n.º 6
0
This script tests the best model recommended by the combined dataset (UKBIO +
BANC) for 100 generations, random
seed 20, initial population 1000, mutation rate and cross-validation rate 0.9
and cross-over 0.1
"""

# General Settings
#-------------------------------------------------------------------------------
debug = False
resamplefactor = 1
random_seed = 20
# The random seed is embedded, zero-padded to three digits, in the output path.
save_path = '/code/BayOptPy/tpot/Output/random_seed/100_generations/random_seed_%03d/' %(random_seed)
# Load the combined dataset
project_wd, project_data, _ = get_paths(debug, 'freesurf_combined')
demographics, _, df_data =  \
             get_data(project_data, 'freesurf_combined', debug,
                      project_wd, resamplefactor, raw=False, analysis=None)
# Drop the last column, which corresponds to the name of the dataset
df_data = df_data.drop('dataset', axis=1)
#-------------------------------------------------------------------------------
# Train the model with BANC
# NOTE(review): the data loaded above is 'freesurf_combined', not BANC alone --
# confirm that this heading is still accurate.
#-------------------------------------------------------------------------------
# The target is taken before demographics is re-indexed, so it keeps the
# index demographics had when it was returned by get_data.
targetAttribute = demographics[['age']]
demographics = demographics.set_index('id')

# Add a few of the BIOBANK Dataset into the training set
# test_size=.90 keeps 10% of the samples in Xtrain/Ytrain and puts the
# remaining 90% in Xtemp/Ytemp; the split is stratified on the 'stratify'
# column of the demographics.
Xtrain, Xtemp, Ytrain, Ytemp = train_test_split(df_data, targetAttribute,
                                                test_size=.90,
                                                stratify=demographics['stratify'],
                                                random_state=random_seed)
# Despite its name, this selects the demographics rows of the held-out Xtemp
# portion, not of Xtrain.
# NOTE(review): assumes the index of df_data holds the same ids as the 'id'
# index of demographics -- confirm.
train_demographics = demographics.loc[Xtemp.index]
Xvalidate, Xtest, Yvalidate, Ytest = train_test_split(Xtemp, Ytemp,