def tpot_model_analysis(random_seed, save_path):
    """Evaluate a previously fitted TPOT pipeline for one random seed.

    The fitted pipeline and the pickled dataset split are read from the
    ``random_seed_XXX`` sub-folder of ``save_path``. The function prints the
    MAE on the test and training splits, saves a predicted-vs-true plot for
    the test split and prints Spearman's rho, R2 and Pearson's r for the
    test split.

    The module-level ``args`` namespace supplies ``dataset``, ``generations``
    and ``debug``.

    Parameters
    ----------
    random_seed : int
        Seed used to name the ``random_seed_XXX`` sub-folder.
    save_path : str
        Parent folder that contains the per-seed sub-folders.

    Returns
    -------
    tuple
        ``(mae_test, r_test)``: MAE and Pearson's r on the test split.
    """
    seed_dir = os.path.join(save_path, 'random_seed_%03d' %random_seed)
    print('Random seed: %03d' %random_seed)

    # Fitted pipeline stored by the TPOT run for this dataset/generation count
    dump_file = os.path.join(
        seed_dir,
        'tpot_%s_vanilla_combi_%03dgen.dump' %(args.dataset, args.generations))
    tpot_dump = joblib.load(dump_file)
    fitted_pipeline = tpot_dump['fitted_pipeline']

    # The returned paths are not used below; the call is kept unchanged
    _, _, _ = get_paths(args.debug, args.dataset)
    split_file = os.path.join(seed_dir,
                              'splitted_dataset_%s.pickle' %args.dataset)
    with open(split_file, 'rb') as handle:
        split = pickle.load(handle)

    # MAE on the test split
    print('Print MAE - test')
    test_pred = fitted_pipeline.predict(split['Xtest_scaled'])
    y_test = split['Ytest']
    mae_test = mean_absolute_error(y_test, test_pred)
    print(mae_test)

    # MAE on the training split
    print('Print MAE - training')
    train_pred = fitted_pipeline.predict(split['Xtrain_scaled'])
    mae_train = mean_absolute_error(split['Ytrain'], train_pred)
    print(mae_train)

    # Predicted vs true scatter for the test split
    figure_path = os.path.join(seed_dir, 'test_predicted_true_age.eps')
    plot_predicted_vs_true(y_test, test_pred, figure_path, 'Age')

    # Summary statistics on the test split: Spearman, R2 and Pearson
    from scipy.stats import spearmanr, pearsonr
    from sklearn.metrics import r2_score

    rho, rho_p_value = spearmanr(y_test, test_pred)
    print('Statistics for the test dataset')
    print('shape of the dataset: %s' %(y_test.shape,))
    print('Rho and p-value: %.4f %.4f' %(rho, rho_p_value))

    r2_test = r2_score(y_test, test_pred)
    print('R2 is: %.4f' %r2_test)

    r_test, _ = pearsonr(y_test, test_pred)
    print('R is: %.4f' %r_test)

    return mae_test, r_test
def rvc_analysis(random_seed, save_path):
    """Fit a linear RVC on the saved training split and score it.

    The pickled dataset split is read from the ``random_seed_XXX`` sub-folder
    of ``save_path``. A linear-kernel ``RVC`` is fitted on the training split
    and used to predict the test and validation splits. For each of the two
    splits a normalised confusion matrix is plotted and saved as an ``.eps``
    file in the same sub-folder, and the accuracy is computed.

    The module-level ``debug`` and ``dataset`` settings are used.

    Parameters
    ----------
    random_seed : int
        Seed used to name the ``random_seed_XXX`` sub-folder.
    save_path : str
        Parent folder that contains the per-seed sub-folders.

    Returns
    -------
    tuple
        ``(cm_test, cm_validation, accuracy_test, accuracy_val)`` where the
        confusion matrices are the second value returned by
        ``plot_confusion_matrix``.
    """
    # TODO: change the path
    save_path = os.path.join(save_path, 'random_seed_%03d' % random_seed)
    print('Random seed: %03d' % random_seed)

    # The returned paths are not needed here; the call itself is kept in case
    # get_paths has side effects this function relies on.
    get_paths(debug, dataset)
    split_file = os.path.join(save_path,
                              'splitted_dataset_%s.pickle' % dataset)
    with open(split_file, 'rb') as handle:
        splitted_dataset = pickle.load(handle)

    # Train the model
    model = RVC(kernel='linear')
    model.fit(splitted_dataset['Xtrain_scaled'], splitted_dataset['Ytrain'])

    # Predict the held-out test and validation splits (no cross-validation
    # is performed in this function).
    print('Perform prediction in test data')
    y_prediction_test = model.predict(splitted_dataset['Xtest_scaled'])
    y_prediction_validation = model.predict(
        splitted_dataset['Xvalidate_scaled'])

    class_name = np.array(['young', 'old', 'adult'], dtype='U10')

    # Test split: confusion matrix, accuracy, figure
    _, cm_test = plot_confusion_matrix(splitted_dataset['Ytest'],
                                       y_prediction_test,
                                       classes=class_name,
                                       normalize=True)
    accuracy_test = accuracy_score(splitted_dataset['Ytest'],
                                   y_prediction_test)
    plt.savefig(os.path.join(save_path, 'confusion_matrix_test_rvc.eps'))

    # Validation split: confusion matrix, figure, accuracy
    _, cm_validation = plot_confusion_matrix(splitted_dataset['Yvalidate'],
                                             y_prediction_validation,
                                             classes=class_name,
                                             normalize=True)
    plt.savefig(os.path.join(save_path,
                             'confusion_matrix_validation_rvc.eps'))
    accuracy_val = accuracy_score(splitted_dataset['Yvalidate'],
                                  y_prediction_validation)
    # The test figure is deliberately NOT saved a second time here:
    # plt.savefig writes the current figure, which at this point is
    # presumably the validation plot, so a second save under the test
    # file name would overwrite the test confusion matrix.

    return cm_test, cm_validation, accuracy_test, accuracy_val
'MaskVol.to.eTIV', 'EstimatedTotalIntraMaCranialVol' ] # Check if you have all the features form the banc dataset assert (len(biobank_columns) == len(freesurfer_banc_columns)) renameCols = dict(zip(biobank_columns, freesurfer_banc_columns)) return lh_thickness, rh_thickness, renameCols # Load both datasets debug = False resamplefactor = 1 save_path = os.path.join('/code/BayOptPy', 'freesurfer_preprocess') project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, 'UKBIO_freesurf') project_banc_wd, project_data_banc, _ = get_paths(debug, 'BANC_freesurf') _, _, freesurfer_df_banc = get_data(project_data_banc, 'BANC_freesurf', debug, project_banc_wd, resamplefactor, raw=True, analysis=None) _, _, freesurfer_df_ukbio = get_data(project_data_ukbio, 'UKBIO_freesurf', debug, project_ukbio_wd, resamplefactor, raw=True, analysis=None)
def rvr_analysis(random_seed, save_path, n_folds, analysis):
    """Cross-validate a linear RVR and plot predictions on a hold-out split.

    The pickled dataset split is read from the ``random_seed_XXX`` sub-folder
    of ``save_path``. A ``n_folds``-fold cross-validation of a linear-kernel
    ``RVR`` is run on ``Xtest_scaled`` / ``Ytest`` (the saved *test* split is
    what the code uses as CV data), recording MAE, Pearson's r, its p-value
    and the CPU time for fit and predict per fold. Afterwards the same data
    is split once with ``train_test_split`` (test fraction depends on
    ``analysis``), a fresh model is trained on the training part and a
    predicted-vs-true plot of the held-out part is saved.

    The module-level ``debug`` and ``dataset`` settings are used.

    Parameters
    ----------
    random_seed : int
        Names the ``random_seed_XXX`` sub-folder and seeds
        ``train_test_split``.
    save_path : pathlib.Path
        Parent folder of the per-seed sub-folders (joined with ``/``).
    n_folds : int
        Number of cross-validation folds.
    analysis : str
        ``'vanilla_combi'`` (hold out 85 %) or ``'uniform_dist'``
        (hold out 20 %).

    Returns
    -------
    tuple
        ``(mae_cv, r_test, t_time_train, t_time_test)``: per-fold MAE array
        of shape ``(n_folds, 1)``, Pearson's r of the LAST fold, and the
        per-fold train/test CPU times as lists.

    Raises
    ------
    ValueError
        If ``analysis`` is not one of the supported values.
    """
    # Fail before the (expensive) cross-validation instead of hitting a
    # NameError on the undefined hold-out split afterwards.
    if analysis not in ('vanilla_combi', 'uniform_dist'):
        raise ValueError(
            "analysis must be 'vanilla_combi' or 'uniform_dist', got %r"
            % (analysis,))

    save_path = save_path / ('random_seed_%03d' % random_seed)
    print('Random seed: %03d' % random_seed)

    # The returned paths are not needed here; the call itself is kept in case
    # get_paths has side effects this function relies on.
    get_paths(debug, dataset)
    with open(save_path / ('splitted_dataset_%s.pickle' % dataset),
              'rb') as handle:
        splitted_dataset = pickle.load(handle)

    # random_state is intentionally not passed: without shuffle=True it never
    # influenced the folds, and current scikit-learn raises a ValueError for
    # that combination. The folds stay the same contiguous, unshuffled ones.
    kf = KFold(n_splits=n_folds)
    mae_cv = np.zeros((n_folds, 1))
    pearsons_corr = np.zeros((n_folds, 1))
    pearsons_pval = np.zeros((n_folds, 1))

    # Set target and features
    x = splitted_dataset['Xtest_scaled']
    y = splitted_dataset['Ytest']

    t_time_train = []
    t_time_test = []

    for i_fold, (train_idx, test_idx) in enumerate(kf.split(x, y)):
        x_train, x_test = x[train_idx, :], x[test_idx, :]
        y_train, y_test = y[train_idx], y[test_idx]

        print('CV iteration: %d' % (i_fold + 1))
        print('Shape of the trainig and test dataset')
        print(y_train.shape, y_test.shape)

        # Train the model and time the fit (CPU time)
        model = RVR(kernel='linear')
        cv_time_train = time.process_time()
        model.fit(x_train, y_train)
        elapsed_time = time.process_time() - cv_time_train
        print('CV - Elapased time in seconds to train:')
        t_time_train.append(elapsed_time)
        print('%.03f' % elapsed_time)

        # Test the model and time the prediction (CPU time)
        cv_time_test = time.process_time()
        y_predicted = model.predict(x_test)
        elapsed_time = time.process_time() - cv_time_test
        t_time_test.append(elapsed_time)
        print('CV - Elapased time in seconds to test:')
        print('%.03f' % elapsed_time)

        mae_cv[i_fold, :] = mean_absolute_error(y_test, y_predicted)

        # Pearson's correlation for this fold
        r_test, r_p_value_test = pearsonr(y_test, y_predicted)
        pearsons_corr[i_fold, :] = r_test
        pearsons_pval[i_fold, :] = r_p_value_test

    print('CV results')
    print('MAE: Mean(SD) = %.3f(%.3f)' % (mae_cv.mean(), mae_cv.std()))
    # Summarise over all folds; the last fold's scalar alone would always
    # report an SD of 0.
    print('Pearson\'s Correlation: Mean(SD) = %.3f(%.3f)' %
          (pearsons_corr.mean(), pearsons_corr.std()))
    print('Mean CV time: %.3f s ' % np.mean(t_time_train))
    print('SD CV time: %.3f s' % np.std(t_time_train))
    print('Mean CV time: %.3f s ' % np.mean(t_time_test))
    print('SD CV time: %.3f s' % np.std(t_time_test))
    print('')

    # Single train / hold-out split of the same data
    if analysis == 'vanilla_combi':
        x_train_all, x_test_all, y_train_all, y_test_all = \
            train_test_split(x, y, test_size=.85, random_state=random_seed)
        print('All: Shape of the trainig and test dataset')
        print(y_train_all.shape, y_test_all.shape)
    else:  # 'uniform_dist' (validated above)
        x_train_all, x_test_all, y_train_all, y_test_all = \
            train_test_split(x, y, test_size=.20, random_state=random_seed)
        print('ALL: Shape of the trainig and test dataset')
        print(y_train_all.shape, y_test_all.shape)

    print('Training RVR model:')
    model_all = RVR(kernel='linear')
    model_all.fit(x_train_all, y_train_all)

    # Plot predicted vs true for the hold-out part using the model that was
    # just trained (not the leftover model of the last CV fold).
    print('Plotting Predicted Vs True Age for all the sample')
    y_predicted_test = model_all.predict(x_test_all)
    output_path_test = save_path / (
        'rvr_test_predicted_true_age_rnd_seed%d.eps' % random_seed)
    plot_predicted_vs_true(y_test_all, y_predicted_test,
                           output_path_test, 'Age')

    # NOTE(review): r_test is the coefficient of the last fold only; the
    # per-fold values live in pearsons_corr. The return shape is kept as-is
    # so existing callers continue to work.
    return mae_cv, r_test, t_time_train, t_time_test
from BayOptPy.tpot.dicts.gpr_tpot_config_feat_selec import tpot_config_gpr tpot_config = tpot_config_gpr elif args.config_dict == 'feat_combi': # Load models for feature combination from BayOptPy.tpot.dicts.gpr_tpot_config_feat_combi import tpot_config_gpr tpot_config = tpot_config_gpr elif args.config_dict == 'vanilla_combi': # Load models for feature combination from BayOptPy.tpot.dicts.gpr_tpot_config_vanilla_combi import tpot_config_gpr tpot_config = tpot_config_gpr print('-----------------------------------------------------------------') print('Get datapaths:') print('-----------------------------------------------------------------') # Get data paths, the actual data and check if the output paths exists project_wd, project_data, project_sink = get_paths(args.debug, args.dataset) output_path = get_output_path(args.model, args.analysis, args.generations, args.random_seed, args.population_size, args.debug, args.mutation_rate, args.crossover_rate, args.predicted_attribute) # Load the already cleaned dataset demographics, imgs, dataframe = get_data(project_data, args.dataset, args.debug, project_wd, args.resamplefactor, raw=str_to_bool(args.raw), analysis=args.analysis) print('Using %d features' % len(dataframe.columns))
dataset = 'freesurf_combined' debug = False resamplefactor = 1 random_seed = 20 save_path = '/code/BayOptPy/tpot/Output/vanilla_combi/100_generations/random_seed_020/' # Load the clean data for both the UKBIO and the BANC analysis # This version of the UKBIOBANK dataset contains the same columns as the BANC # dataset # Load the saved trained model tpot = joblib.load( os.path.join(save_path, 'tpot_%s_vanilla_combi_100gen.dump' % dataset)) exported_pipeline = tpot['fitted_pipeline'] # Load the saved validation dataset project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, dataset) with open(os.path.join(save_path, 'splitted_dataset.pickle'), 'rb') as handle: splitted_dataset = pickle.load(handle) # Print some results print('Print MAE - test') y_predicted_test = exported_pipeline.predict(splitted_dataset['Xtest_scaled']) mae = mean_absolute_error(splitted_dataset['Ytest'], y_predicted_test) print(mae) print('Print MAE - training') y_predicted_train = exported_pipeline.predict( splitted_dataset['Xtrain_scaled']) mae_train = mean_absolute_error(splitted_dataset['Ytrain'], y_predicted_train) print(mae_train) print('Print MAE - validation') y_predicted_validation = exported_pipeline.predict(
if s == 'True': return True elif s == 'False': return False #----------------------------------------------------------------------------- # Settings #----------------------------------------------------------------------------- debug = False dataset = 'freesurf_combined' resamplefactor = 1 save_path = os.path.join('/code/BayOptPy', 'freesurfer_preprocess') raw = 'False' analysis = 'uniform' project_wd, project_data, project_sink = get_paths(debug, dataset) demographics, imgs, dataframe = get_data(project_data, dataset, debug, project_wd, resamplefactor, raw=str_to_bool(raw), analysis=analysis) # transform age into ints demographics['age_int'] = demographics['age'].astype('int32', copy=False) # Select 14 subjects for all ages that have 14 representatives. age_range = np.arange(demographics['age'].min(), demographics['age'].max()) # remove entry where you don't have 14 subjects
# Folder that TPOT receives as its periodic checkpoint folder
cwd = os.getcwd()
best_pipe_paths = os.path.join(cwd, 'BayOptPy/tpot')

tpot = ExtendedTPOTRegressor(
    generations=5,
    population_size=50,
    verbosity=2,
    random_state=42,
    config_dict=tpot_config,
    periodic_checkpoint_folder=best_pipe_paths,
    scoring=scoring,
)
tpot.fit(X_train, y_train, X_test, y_test)
print('Test score using optimal model: %f ' %tpot.score(X_test, y_test))

# Export the pipeline under the project working directory
project_wd, _, _ = get_paths(args.debug, dataset)
export_file = os.path.join(project_wd, 'BayOptPy', 'tpot', 'debug',
                           'tpot_boston_pipeline_super.py')
tpot.export(export_file)

# Models whose predictions are identical for every subject are removed before
# the correlation step: they produce NaN entries (or very small numbers) in
# the correlation matrix.
is_constant = [
    np.array_equal(np.repeat(prediction[0], len(prediction)), prediction)
    for prediction in tpot.predictions
]
repeated_idx = np.argwhere(is_constant)
constant_models = np.squeeze(repeated_idx)
print('Index of the models with the same prediction for all subjects: ' + str(constant_models))
print('Number of models analysed: %d' %len(tpot.predictions))

tpot_predictions = np.delete(np.array(tpot.predictions), constant_models,
                             axis=0)
print('Number of models that will be used for cross-correlation: %s' %(tpot_predictions.shape,))

# Cross correlate the predictions of the remaining models
corr_matrix = np.corrcoef(tpot_predictions)
nan_entries = np.argwhere(np.isnan(corr_matrix))
print('Check the number of NaNs after deleting models with constant predictions: %d' %len(nan_entries))
""" This script tests the best model recommened by the combined dataset (UKBIO + BANC) for 100 generations, random seed 20, initial population 1000, mutation rate and cross-validation rate 0.9 and cross-over 0.1 """ # General Settings #------------------------------------------------------------------------------- debug = False resamplefactor = 1 random_seed = 20 save_path = '/code/BayOptPy/tpot/Output/random_seed/100_generations/random_seed_%03d/' %(random_seed) # Load the combined dataset project_wd, project_data, _ = get_paths(debug, 'freesurf_combined') demographics, _, df_data = \ get_data(project_data, 'freesurf_combined', debug, project_wd, resamplefactor, raw=False, analysis=None) # Drop the last column that corresponds the name of the dataset df_data = df_data.drop('dataset', axis=1) #------------------------------------------------------------------------------- # Train the model with BANC #------------------------------------------------------------------------------- targetAttribute = demographics[['age']] demographics = demographics.set_index('id') # Add a few of the BIOBANK Dataset into the training set Xtrain, Xtemp, Ytrain, Ytemp = train_test_split(df_data, targetAttribute, test_size=.90, stratify=demographics['stratify'],