"""Boiler plate function that has to be put in every multiple experiment script, as exp does not pickle.""" exp.run_command( 'print_config', config_updates=config_updates, ) run = exp._create_run(config_updates=config_updates, ) run._id = _id observer = OurFileStorageObserver.create(basedir=output_dir) run.observers.append(observer) run() if __name__ == '__main__': source_dir = join(get_data_dir(), 'reduced_512') data, target = load_data_from_dir(data_dir=source_dir) studies = list(data.keys()) l2_penalties = np.logspace(-4, -1, 20) config_updates = ParameterGrid({ 'logistic.l2_penalty': l2_penalties, 'data.studies': studies }) output_dir = join(get_output_dir(), 'baseline_logistic_icbm_gm') _id = get_id(output_dir) Parallel(n_jobs=40, verbose=100)( delayed(run_exp)(output_dir, config_update, _id=_id + i) for i, config_update in enumerate(config_updates))
# NOTE(review): this chunk starts inside a plotting loop -- the enclosing
# function and the `for i, estimator ...` / `if ...` headers lie above this
# chunk (cf. the complete loop in the sibling comparison script).
        else:
            fontweight = 'normal'
        # Place the estimator label just outside the right edge of the axes.
        ax.annotate(y_labels[estimator], xy=(1, i), xytext=(10, 0),
                    textcoords="offset points", fontweight=fontweight,
                    xycoords=('axes fraction', 'data'),
                    va='center', ha='left')
    sns.despine(fig)
    # The legend is redundant with the per-row labels drawn above.
    plt.setp(ax.legend(), visible=False)
    plt.savefig(join(save_dir, 'accuracies.pdf'))
    plt.close(fig)


# Module-level driver: compare plots for the split-by-task runs.
output_dir = join(get_output_dir(output_dir=None), 'normalized',
                  'split_by_task')
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
save_dir = join(output_dir, 'compare')
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# gather_metrics(output_dir=output_dir, save_dir=save_dir)
plot_mean_accuracies(save_dir=save_dir, split_by_task=True)
# plot_accuracies(save_dir)
# NOTE(review): this chunk starts inside a sacred-captured training function
# -- its `def` header (providing `_run`, `estimator`, the callbacks, the
# data splits, ...) lies above this chunk.
    # Expose the training trajectory in the sacred run info.
    # NOTE(review): these attributes are read *before* fit(); this only
    # records live values if the callbacks expose mutable containers that
    # fit() appends to in place -- TODO confirm against ScoreCallback.
    _run.info['n_iter'] = train_callback.n_iter_
    _run.info['train_scores'] = train_callback.scores_
    _run.info['test_scores'] = test_callback.scores_
    estimator.fit(train_data, train_targets, study_weights=study_weights,
                  callback=callback)
    test_preds = estimator.predict(test_data)
    # Per-study accuracy on the encoded 'contrast' labels.
    test_scores = {}
    for study in train_targets:
        test_scores[study] = accuracy_score(test_preds[study]['contrast'],
                                            test_targets[study]['contrast'])
    # Back to human-readable labels, stored side by side ('pred' vs 'true')
    # for later inspection.
    test_preds = target_encoder.inverse_transform(test_preds)
    test_targets = target_encoder.inverse_transform(test_targets)
    for study in test_preds:
        test_preds[study] = pd.concat([test_preds[study], test_targets[study]],
                                      axis=1, keys=['pred', 'true'],
                                      names=['target'])
    save_output(target_encoder, standard_scaler, estimator, test_preds)
    return test_scores


if __name__ == '__main__':
    output_dir = join(get_output_dir(), 'multi_studies')
    exp.observers.append(OurFileStorageObserver.create(basedir=output_dir))
    exp.run()
def run_exp(output_dir, config_updates, _id):
    """Boiler plate function that has to be put in every multiple experiment script, as exp does not pickle."""
    # Echo the resolved configuration before launching the run.
    exp.run_command(
        'print_config',
        config_updates=config_updates,
    )
    # NOTE(review): the five lines below were commented out, which left this
    # helper printing the config without ever executing the experiment --
    # the Parallel loop in __main__ then launched runs that did nothing.
    # Restored to match the identical (active) boiler plate used by the
    # sibling scripts in this project.
    run = exp._create_run(config_updates=config_updates, )
    run._id = _id
    observer = OurFileStorageObserver.create(basedir=output_dir)
    run.observers.append(observer)
    run()


if __name__ == '__main__':
    output_dir = join(get_output_dir(), 'factored')
    exp.config(factored)
    # One experiment per target study, alone and paired with 'hcp' as an
    # auxiliary study.
    config_updates = []
    config_updates += list(
        ParameterGrid({
            'data.studies': [['archi'], ['archi', 'hcp'],
                             ['brainomics'], ['brainomics', 'hcp'],
                             ['camcan'], ['camcan', 'hcp'],
                             ['la5c'], ['la5c', 'hcp']],
        }))
    # Reserve a contiguous range of run ids for the sequential jobs.
    _id = get_id(output_dir)
    Parallel(n_jobs=1, verbose=100)(
        delayed(run_exp)(output_dir, config_update, _id=_id + i)
        for i, config_update in enumerate(config_updates))
# NOTE(review): this chunk starts inside the per-component plotting loop of
# a `plot_components`-style function whose header (providing `this_img`,
# `i`, `fig`, `full_name`, `output_dir`) lies above this chunk; the loop
# nesting depth is assumed -- TODO confirm against the full file.
            plot_stat_map(index_img(this_img, i), figure=fig,
                          title=full_name)
            plt.savefig(join(output_dir, '%s.png' % full_name))
            plt.close(fig)


def compute_components(output_dir, lstsq):
    """Recover brain maps from a fitted estimator saved in `output_dir`.

    Loads the pickled estimator/encoders, projects the model weights back
    through the MODL dictionary, then saves and plots the per-study
    component images under `output_dir`/plot.
    """
    estimator = load(join(output_dir, 'estimator.pkl'))
    target_encoder = load(join(output_dir, 'target_encoder.pkl'))
    standard_scaler = load(join(output_dir, 'standard_scaler.pkl'))

    # Dictionary of the reduced representation the models were trained on.
    modl_atlas = fetch_atlas_modl()
    dictionary = modl_atlas['components512']
    components, names = maps_from_model(estimator, dictionary,
                                        target_encoder, standard_scaler,
                                        lstsq=lstsq)
    plot_dir = join(output_dir, 'plot')
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)
    dump(names, join(plot_dir, 'names.pkl'))
    for study, this_components in components.items():
        this_components.to_filename(
            join(plot_dir, 'components_%s.nii.gz' % study))
    plot_components(components, names, plot_dir)


if __name__ == '__main__':
    # Analyse run '1' of the multi-study experiment.
    compute_components(join(get_output_dir(), 'multi_studies', '1'),
                       lstsq=True)
def run(estimator='multi_study', seed=0, plot=False, n_jobs=1, use_gpu=False,
        split_by_task=False, verbose=0):
    """Train and evaluate one decoding model over the selected studies.

    Parameters
    ----------
    estimator : str
        One of 'multi_study', 'ensemble' or 'logistic'.
    seed : int
        Seed of the train/test split (also used in the output path).
    plot : bool
        Whether to generate plots after training.
    n_jobs : int
        Number of parallel jobs for the underlying estimators.
    use_gpu : bool
        Run the torch models on 'cuda:0' instead of 'cpu'.
    split_by_task : bool
        Split each study into per-task sub-problems before encoding.
    verbose : int
        Verbosity level passed to the estimators.
    """
    # ----- Configuration -------------------------------------------------
    system = dict(verbose=verbose, n_jobs=n_jobs, plot=plot, seed=seed,
                  output_dir=None)
    data = dict(
        studies='all',
        dataset='loadings',  # Useful to override source directory
        test_size=0.5,
        train_size=0.5,
        reduced=True,
        data_dir=None,
    )
    model = dict(
        split_by_task=split_by_task,
        estimator=estimator,
        normalize=True,
        seed=100,
        target_study=None,
    )
    config = {'system': system, 'data': data, 'model': model}
    if model['estimator'] in ['multi_study', 'ensemble']:
        multi_study = dict(
            latent_size=128,
            weight_power=0.6,
            batch_size=128,
            init='resting-state',
            latent_dropout=0.75,
            input_dropout=0.25,
            device='cuda:0' if use_gpu else 'cpu',
            seed=100,
            lr={
                'pretrain': 1e-3,
                'train': 1e-3,
                'finetune': 1e-3
            },
            max_iter={
                'pretrain': 200,
                'train': 300,
                'finetune': 200
            },
        )
        config['multi_study'] = multi_study
        if model['estimator'] == 'ensemble':
            ensemble = dict(
                seed=100,
                n_runs=120,
                alpha=1e-5,
            )
            config['ensemble'] = ensemble
    else:
        logistic = dict(
            l2_penalty=np.logspace(-7, 0, 4).tolist(),
            max_iter=1000,
        )
        config['logistic'] = logistic

    output_dir = join(get_output_dir(config['system']['output_dir']),
                      'normalized',
                      'split_by_task' if split_by_task else 'split_by_study',
                      config['model']['estimator'],
                      str(config['system']['seed']))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    info = {}
    with open(join(output_dir, 'config.json'), 'w+') as f:
        json.dump(config, f)

    print("Loading data")
    if data['studies'] == 'all':
        studies = STUDY_LIST
    elif isinstance(data['studies'], str):
        studies = [data['studies']]
    elif isinstance(data['studies'], list):
        studies = data['studies']
    else:
        raise ValueError("Studies should be a list or 'all'")
    if data['dataset'] is not None:
        input_data, targets = load_from_directory(dataset=data['dataset'],
                                                  data_dir=data['data_dir'])
    elif data['reduced']:
        input_data, targets = load_reduced_loadings(data_dir=data['data_dir'])
    else:
        input_data, targets = load_masked_contrasts(data_dir=data['data_dir'])
    input_data = {study: input_data[study] for study in studies}
    targets = {study: targets[study] for study in studies}

    # FIX: `target_encoder` was previously bound only in the split-by-task
    # branch but read unconditionally at evaluation/saving time below,
    # raising NameError for split-by-study runs.
    target_encoder = None
    if model['split_by_task']:
        _, split_targets = split_studies(input_data, targets)
        target_encoder = MultiTargetEncoder().fit(split_targets)
    train_data, test_data, train_targets, test_targets = \
        train_test_split(input_data, targets, random_state=system['seed'],
                         test_size=data['test_size'],
                         train_size=data['train_size'])
    if model['split_by_task']:
        train_data, train_targets = split_studies(train_data, train_targets)
        test_data, test_targets = split_studies(test_data, test_targets)
        train_targets = target_encoder.transform(train_targets)
        test_targets = target_encoder.transform(test_targets)

    print("Setting up model")
    if model['normalize']:
        standard_scaler = MultiStandardScaler().fit(train_data)
        train_data = standard_scaler.transform(train_data)
        test_data = standard_scaler.transform(test_data)
    else:
        standard_scaler = None

    train_callback = test_callback = None
    if model['estimator'] in ['multi_study', 'ensemble']:
        estimator = MultiStudyClassifier(verbose=system['verbose'],
                                         n_jobs=system['n_jobs'],
                                         **multi_study)
        if model['estimator'] == 'ensemble':
            memory = Memory(cachedir=None)
            estimator = EnsembleClassifier(estimator,
                                           n_jobs=system['n_jobs'],
                                           memory=memory,
                                           **ensemble)
            callback = None
        else:
            # Set some callback to obtain useful verbosity
            test_callback = ScoreCallback(Xs=test_data, ys=test_targets,
                                          score_function=accuracy_score)
            train_callback = ScoreCallback(Xs=train_data, ys=train_targets,
                                           score_function=accuracy_score)
            callback = MultiCallback({
                'train': train_callback,
                'test': test_callback
            })
    elif model['estimator'] == 'logistic':
        estimator = MultiLogisticClassifier(verbose=system['verbose'],
                                            n_jobs=n_jobs,
                                            **logistic)
        callback = None

    print("Training model")
    estimator.fit(train_data, train_targets, callback=callback)
    if train_callback is not None:
        # FIX: the callback statistics were previously copied into `info`
        # *before* fitting, i.e. before the callbacks had accumulated
        # anything; record them once training has populated them (as the
        # sibling training script does).
        info['n_iter'] = train_callback.n_iter_
        info['train_scores'] = train_callback.scores_
        info['test_scores'] = test_callback.scores_

    print("Evaluating model")
    test_preds = estimator.predict(test_data)
    metrics = compute_metrics(test_preds, test_targets, target_encoder)
    print(metrics['accuracy'])

    print("Saving model")
    # Save model for further analysis
    dump(target_encoder, join(output_dir, 'target_encoder.pkl'))
    if model['normalize']:
        dump(standard_scaler, join(output_dir, 'standard_scaler.pkl'))
    dump(estimator, join(output_dir, 'estimator.pkl'))
    with open(join(output_dir, 'metrics.json'), 'w+') as f:
        json.dump(metrics, f)
    with open(join(output_dir, 'info.json'), 'w+') as f:
        json.dump(info, f)

    if config['system']['plot']:
        from utils.plotting import make_plots, prepare_plots
        print('Preparing plots')
        prepare_plots(output_dir)
        print("Plotting model")
        plot_components = config['model']['estimator'] in [
            'multi_study', 'ensemble'
        ]
        make_plots(output_dir, plot_classifs=True,
                   plot_components=plot_components,
                   plot_surface=False,
                   plot_wordclouds=True,
                   n_jobs=config['system']['n_jobs'])
# NOTE(review): this chunk starts inside a plotting function whose `def`
# header (providing `estimators`, `ax`, `y_labels`, `fig`, `save_dir`) lies
# above this chunk.
    for i, estimator in enumerate(estimators):
        # Bold the last entry -- presumably the highlighted/reference
        # estimator; confirm against the order `estimators` is built in.
        if i == len(estimators) - 1:
            fontweight = 'bold'
        else:
            fontweight = 'normal'
        # Place the estimator label just outside the right edge of the axes.
        ax.annotate(y_labels[estimator], xy=(1, i), xytext=(10, 0),
                    textcoords="offset points", fontweight=fontweight,
                    xycoords=('axes fraction', 'data'),
                    va='center', ha='left')
    sns.despine(fig)
    # The legend is redundant with the per-row labels drawn above.
    plt.setp(ax.legend(), visible=False)
    plt.savefig(join(save_dir, 'accuracies.pdf'))
    plt.close(fig)


# Module-level driver: gather metrics from all runs, then draw the
# comparison figures.
output_dir = get_output_dir(output_dir=None)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
save_dir = join(output_dir, 'compare')
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

gather_metrics(output_dir=output_dir, save_dir=save_dir)
plot_mean_accuracies(save_dir=save_dir)
plot_accuracies(save_dir)