renameCols = dict(zip(biobank_columns, freesurfer_banc_columns)) return lh_thickness, rh_thickness, renameCols # Load both datasets debug = False resamplefactor = 1 save_path = os.path.join('/code/BayOptPy', 'freesurfer_preprocess') project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, 'UKBIO_freesurf') project_banc_wd, project_data_banc, _ = get_paths(debug, 'BANC_freesurf') _, _, freesurfer_df_banc = get_data(project_data_banc, 'BANC_freesurf', debug, project_banc_wd, resamplefactor, raw=True, analysis=None) _, _, freesurfer_df_ukbio = get_data(project_data_ukbio, 'UKBIO_freesurf', debug, project_ukbio_wd, resamplefactor, raw=True, analysis=None) # check the columns between both datasets # First print the size of dataset print('shape of the banc dataset; shape of the ukbio dataset') print(freesurfer_df_banc.shape, freesurfer_df_ukbio.shape)
seed 20, initial population 1000, mutation rate and cross-validation ratexxx """ set_publication_style() # General Settings #------------------------------------------------------------------------------- debug = False resamplefactor = 1 random_seed = 20 save_path = '/code/BayOptPy/' # Load the clean data for both the UKBIO and the BANC analysis # This version of the UKBIOBANK dataset contains the same columns as the BANC # dataset project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, 'UKBIO_freesurf') _, _, df_ukbio = \ get_data(project_data_ukbio, 'UKBIO_freesurf', debug, project_ukbio_wd, resamplefactor, raw=False, analysis=None) df_ukbio = df_ukbio.set_index('id') # Drop the last column that corresponds the name of the dataset df_ukbio = df_ukbio.drop('dataset', axis=1) project_banc_wd, project_banc_data, _ = get_paths(debug, 'BANC_freesurf') demographics_banc, __, df_banc = get_data(project_banc_data, 'BANC_freesurf', debug, project_banc_wd, resamplefactor, raw=False, analysis=None) # Drop the last column that corresponds the name of the dataset df_banc = df_banc.drop('dataset', axis=1)
print('-----------------------------------------------------------------')
print('Get datapaths:')
print('-----------------------------------------------------------------')
# Resolve the project paths for the requested dataset and build the output
# path from the run configuration held in `args`.
project_wd, project_data, project_sink = get_paths(args.debug, args.dataset)
output_path = get_output_path(
    args.model,
    args.analysis,
    args.generations,
    args.random_seed,
    args.population_size,
    args.debug,
    args.mutation_rate,
    args.crossover_rate,
    args.predicted_attribute,
)

# Load the already cleaned dataset. `args.raw` arrives as a string and is
# converted with str_to_bool before being passed on.
demographics, imgs, dataframe = get_data(
    project_data,
    args.dataset,
    args.debug,
    project_wd,
    args.resamplefactor,
    raw=str_to_bool(args.raw),
    analysis=args.analysis,
)
print('Using %d features' % len(dataframe.columns))

# For the uniform-distribution analysis, replace the demographics and the
# dataframe loaded above with the corresponding uniform-distribution data.
if args.analysis == 'uniform_dist':
    demographics, dataframe = get_uniform_dist_data(
        args.debug,
        args.dataset,
        args.resamplefactor,
        str_to_bool(args.raw),
        args.analysis,
    )

# NOTE(review): this check is laid out as independent of the 'uniform_dist'
# branch above (not nested inside it) -- confirm against the intended flow.
if args.dataset in ('freesurf_combined', 'UKBIO_freesurf'):
    # Drop the last column, which corresponds to the dataset name.
    dataframe = dataframe.drop(['dataset'], axis=1)
# -----------------------------------------------------------------------------
# Settings
# -----------------------------------------------------------------------------
debug = False
dataset = 'freesurf_combined'
resamplefactor = 1
save_path = os.path.join('/code/BayOptPy', 'freesurfer_preprocess')
# `raw` is kept as a string and converted with str_to_bool at the call site.
raw = 'False'
analysis = 'uniform'

project_wd, project_data, project_sink = get_paths(debug, dataset)
demographics, imgs, dataframe = get_data(
    project_data,
    dataset,
    debug,
    project_wd,
    resamplefactor,
    raw=str_to_bool(raw),
    analysis=analysis,
)

# Store an int32 copy of the age next to the original 'age' column.
demographics['age_int'] = demographics['age'].astype('int32', copy=False)

# Candidate ages: one entry per unit step from the minimum to the maximum age.
# NOTE(review): the range is built from the un-cast 'age' column rather than
# 'age_int', and np.arange excludes its stop value, so the maximum age itself
# is not part of age_range -- confirm both are intended.
age_range = np.arange(demographics['age'].min(), demographics['age'].max())

# Number of subjects to select for every age.
max_n = 14
# Ages for which there are not 14 subjects; these are removed from the range.
age_to_remove = [35, 36, 39, 42, 78, 79, 80, 81, 82, 83, 85, 89]
age_range = np.setdiff1d(age_range, age_to_remove)

# Accumulator for the ids of the subjects selected for each age.
ids_to_use = []
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.model_selection import train_test_split, cross_validate

from BayOptPy.helperfunctions import get_data, get_paths

# ------------------------------------------------------------------------------
# Settings
# ------------------------------------------------------------------------------
debug = False
dataset = 'BANC'
resamplefactor = 1
random_seed = 42

# ------------------------------------------------------------------------------
# Load the data
# ------------------------------------------------------------------------------
project_wd, project_data, project_sink = get_paths(debug, dataset)
demographics, imgs, data = get_data(
    project_data,
    dataset,
    debug,
    project_wd,
    resamplefactor,
)

# Get the fsl data, concatenate GM and WM. For a start use only the WM
# The regression target is the 'Age' column of the demographics.
targetAttribute = np.array(demographics['Age'])

# ------------------------------------------------------------------------------
# Gaussian process regression with a dot-product kernel (sigma_0=0),
# evaluated with 10-fold cross-validation on 4 parallel jobs and scored with
# the negative mean absolute error.
# ------------------------------------------------------------------------------
kernel = DotProduct(sigma_0=0)
gp2 = GaussianProcessRegressor(kernel=kernel, normalize_y=False)
cv_results2 = cross_validate(
    gp2,
    data,
    targetAttribute,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=4,
)

# Report the cross-validation results.
print('The MAE are:')
This script tests the best model recommened by the combined dataset (UKBIO + BANC) for 100 generations, random seed 20, initial population 1000, mutation rate and cross-validation rate 0.9 and cross-over 0.1 """ # General Settings #------------------------------------------------------------------------------- debug = False resamplefactor = 1 random_seed = 20 save_path = '/code/BayOptPy/tpot/Output/random_seed/100_generations/random_seed_%03d/' %(random_seed) # Load the combined dataset project_wd, project_data, _ = get_paths(debug, 'freesurf_combined') demographics, _, df_data = \ get_data(project_data, 'freesurf_combined', debug, project_wd, resamplefactor, raw=False, analysis=None) # Drop the last column that corresponds the name of the dataset df_data = df_data.drop('dataset', axis=1) #------------------------------------------------------------------------------- # Train the model with BANC #------------------------------------------------------------------------------- targetAttribute = demographics[['age']] demographics = demographics.set_index('id') # Add a few of the BIOBANK Dataset into the training set Xtrain, Xtemp, Ytrain, Ytemp = train_test_split(df_data, targetAttribute, test_size=.90, stratify=demographics['stratify'], random_state=random_seed) train_demographics = demographics.loc[Xtemp.index] Xvalidate, Xtest, Yvalidate, Ytest = train_test_split(Xtemp, Ytemp,