# Output directories for this run; every path is derived from results_dir.
log_dir = make_and_get_dir(results_dir, 'log')
fitting_dir = make_and_get_dir(results_dir, 'model_fitting')
mv_fitting_dir = make_and_get_dir(fitting_dir, 'multi_view')
model_sel_dir = make_and_get_dir(results_dir, 'model_selection')

# The per-model subdirectories are disabled: both selection directories are
# obtained from model_sel_dir with no extra path component.
# bd_sel_dir = make_and_get_dir(model_sel_dir, 'bd_mvmm')
# log_sel_dir = make_and_get_dir(model_sel_dir, 'log_pen_mvmm')
bd_sel_dir = make_and_get_dir(model_sel_dir)
log_sel_dir = make_and_get_dir(model_sel_dir)
opt_diag_dir = make_and_get_dir(results_dir, 'opt_diagnostics')

# Log file for the model selection step; record the user supplied choices.
res_writer = ResultsWriter(
    os.path.join(log_dir, 'mvmm_model_selection.txt'),
    delete_if_exists=True,
)
res_writer.write(
    'user_bd_mvmm_best_idx: {}'.format(args.user_bd_mvmm_best_idx)
)
res_writer.write(
    'user_log_pen_mvmm_best_idx: {}'.format(args.user_log_pen_mvmm_best_idx)
)
res_writer.write('select_metric: {}'.format(args.select_metric))

# data = load(os.path.join(save_dir, 'multi_view_fit_data'))

#############
# load data #
#############
mvmm_results = load_fitted_mvmms(fitting_dir)
dataset_names = mvmm_results['dataset_names']

# possibly load existing selected models
sel_models_fpath = os.path.join(fitting_dir, 'selected_models')
parser = add_parsers(parser, to_add=[general_opt_parser, base_gmm_parser]) parser = bayes_parser(parser) args = parser.parse_args() bayes_submit(args) args = format_mini_experiment(args) results_dir = args.results_dir log_dir = make_and_get_dir(results_dir, 'log') fitting_dir = make_and_get_dir(results_dir, 'model_fitting', 'single_view') res_writer = ResultsWriter(join(log_dir, 'single_view_fitting.txt'), delete_if_exists=True) res_writer.write(args) run_start_time = time() n_views = len(args.fpaths) ############# # load data # ############# view_data, dataset_names, sample_names, feat_names = \ load_data(*args.fpaths) for v in range(n_views): res_writer.write('{} (view {}) shape : {}'.format(dataset_names[v], v,
# else: # rank_sel_kws = {'n_components': 'rmt_threshold', # 'rank_sel_kws': {'thresh_method': 'dg'}} # rank_sel_kws = {'n_components': 'bai_ng_bic', # 'rank_sel_kws': {'who': 1}, # 'max_rank': 400 # } for k in data.keys(): res_writer = ResultsWriter(fpath=join(diagnostics_dir, '{}_log.txt'.format(k)), delete_if_exists=True) res_writer.write(k) res_writer.write('shape {}'.format(data[k].shape)) rank_sel_kws = {'n_components': 'rmt_threshold', 'rank_sel_kws': {'thresh_method': 'dg'}} pca = PCA(**rank_sel_kws) start_time = time() pca.fit(data[k].values) runtime = time() - start_time res_writer.write("computed pca took {:1.2f} seconds".format(runtime)) res_writer.write('Estimated n components {}'.format(pca.n_components_)) ################### # save fitted PCA #
# Parse the command line and hand the job to the submission helper.
parser = bayes_parser(parser)
args = parser.parse_args()
args = format_mini_experiment(args)
# job_name is taken from sim_name before the default below is applied
args.job_name = args.sim_name
bayes_submit(args)

# fall back to a placeholder simulation name
if args.sim_name is None:
    args.sim_name = 'meow'

# Results for this simulation live in <results_dir>/single/<sim_name>.
save_dir = make_and_get_dir(Paths().results_dir, 'single', args.sim_name)
res_writer = ResultsWriter(
    os.path.join(save_dir, 'results.txt'),
    delete_if_exists=True,
)
res_writer.write('\n\n\n Input args')
res_writer.write(args)

rng = check_random_state(args.metaseed)

# Models to skip when running the simulation.
to_exclude = []
# NOTE(review): the guard below is commented out, so 'sp_mvmm' is always
# excluded -- confirm this is intended.
# if args.exclude_sp_mvmm:
to_exclude.append('sp_mvmm')
if args.exclude_bd_mvmm:
    to_exclude.append('bd_mvmm')
if args.exclude_log_pen_mvmm:
    to_exclude.append('log_pen_mvmm')

inches = 8

##############
def run_sim(models, data_dist, Pi, view_params, n_samples_tr, data_seed,
            n_samples_tst=2000, zero_thresh=0, reg_covar_mult=1e-2,
            mc_index=None, to_exclude=None, log_fpath=None):
    """
    Runs one simulation replicate: samples train/test data, sets the
    covariance regularization of the models, fits the classifiers and the
    mixture models, and collects their results.

    Parameters
    ----------
    models: dict
        Estimators keyed by name. The keys read here are 'clf', 'cat_gmm',
        'view_gmms' (indexed by view), 'full_mvmm', 'log_pen_mvmm',
        'bd_mvmm', 'sp_mvmm' and 'oracle'. The estimators are modified in
        place (set_params / fit). The penalized MVMMs get their covariance
        regularization set even if they are listed in to_exclude, so their
        keys must always be present.

    data_dist: callable(n_samples, random_state)
        Function to generate data. Must return (X, Y) where X can be indexed
        by view and Y can be indexed as Y[:, v].

    Pi:
        True Pi matrix. Its shape is used to compute the empirical Pi and it
        is forwarded to the result computation as 'Pi_true'.

    view_params:
        Forwarded to the result computation as 'view_params_true'.

    n_samples_tr: int
        Number of training samples.

    data_seed: int
        Seed for sampling train/test observations.

    n_samples_tst: int
        Number of samples to get for test data.

    zero_thresh:
        Forwarded to add_gs_results and get_bd_summary_for_gs.

    reg_covar_mult: float
        Multiplier passed to default_cov_regularization.

    mc_index:
        Monte-Carlo index stored with the results.
        NOTE(review): the 'mc_index' results column is cast to int below, so
        presumably an int has to be supplied -- confirm against callers.

    to_exclude: None, list of str
        Penalized MVMMs to skip fitting. Entries must be one of
        'log_pen_mvmm', 'bd_mvmm', 'sp_mvmm'.

    log_fpath: None, str
        Passed to ResultsWriter as the log file path.

    Returns
    -------
    results_df, clf_results, models, bd_summary, Pi_empirical, tr_data,
    runtimes
    """
    res_writer = ResultsWriter(log_fpath, delete_if_exists=True)
    # call the helper; formatting the bare function would log its repr
    res_writer.write("Beginning simulation at {}".format(get_current_time()))
    overall_start_time = time()

    # (model key, name used in the log), in the order the models are fit
    penalized_mvmms = [('log_pen_mvmm', 'log pen mvmm'),
                       ('bd_mvmm', 'block diag mvmm'),
                       ('sp_mvmm', 'spect pen mvmm')]

    if to_exclude is None:
        to_exclude = []
    valid_names = [name for name, _ in penalized_mvmms]
    for m in to_exclude:
        assert m in valid_names

    seeds = get_seeds(random_state=data_seed, n_seeds=2)

    # sample data
    X_tr, Y_tr = data_dist(n_samples=n_samples_tr, random_state=seeds[0])
    X_tst, Y_tst = data_dist(n_samples=n_samples_tst, random_state=seeds[1])
    n_views = len(X_tr)

    Pi_empirical = get_empirical_pi(Y_tr, Pi.shape, scale='counts')

    runtimes = {}

    #############################
    # covariance regularization #
    #############################

    # set cov reg for each view
    for v in range(n_views):
        reg = default_cov_regularization(X=X_tr[v], mult=reg_covar_mult)

        models['view_gmms'][v].base_estimator.set_params(reg_covar=reg)
        models['full_mvmm'].base_view_models[v].set_params(reg_covar=reg)

        # both stages of the two stage estimators need the regularization
        for name in ('bd_mvmm', 'log_pen_mvmm'):
            two_stage = models[name].base_estimator
            two_stage.base_start.base_view_models[v].\
                set_params(reg_covar=reg)
            two_stage.base_final.base_view_models[v].\
                set_params(reg_covar=reg)

        models['sp_mvmm'].base_mvmm_0.base_view_models[v].\
            set_params(reg_covar=reg)
        models['sp_mvmm'].base_wbd_mvmm.base_view_models[v].\
            set_params(reg_covar=reg)

        # print and save
        res_writer.write(
            "\nCovarinace regularization for view {} is {}".format(v, reg))
        stds = X_tr[v].std(axis=0)
        res_writer.write("Smallest variance: {}".format(stds.min()**2))
        res_writer.write("Largest variance: {}".format(stds.max()**2))

    # for cat GMM
    reg = default_cov_regularization(X=np.hstack(X_tr), mult=reg_covar_mult)
    models['cat_gmm'].base_estimator.set_params(reg_covar=reg)

    ##############
    # fit models #
    ##############

    # get classification results
    clf_results = {}

    start_time = time()
    clf_results['cat'] = clf_fit_and_score(clone(models['clf']),
                                           X_tr=np.hstack(X_tr),
                                           y_tr=view_labs_to_overall(Y_tr),
                                           X_tst=np.hstack(X_tst),
                                           y_tst=view_labs_to_overall(Y_tst))
    runtimes['cat'] = time() - start_time

    for v in range(n_views):
        start_time = time()
        clf_results['view_{}'.format(v)] =\
            clf_fit_and_score(clone(models['clf']),
                              X_tr=X_tr[v], y_tr=Y_tr[:, v],
                              X_tst=X_tst[v], y_tst=Y_tst[:, v])
        runtimes['clf_view_{}'.format(v)] = time() - start_time

    # fit clustering
    simplefilter('ignore', ConvergenceWarning)

    results_df = pd.DataFrame()

    sim_stub = {'mc_index': mc_index, 'n_samples': n_samples_tr}

    dists_cat = pairwise_distances(X=np.hstack(X_tr))
    dists_views = [pairwise_distances(X=X_tr[v]) for v in range(n_views)]

    # arguments shared by every add_gs_results call
    kws = {
        'sim_stub': sim_stub,
        'X_tr': X_tr,
        'Y_tr': Y_tr,
        'X_tst': X_tst,
        'Y_tst': Y_tst,
        'Pi_true': Pi,
        'view_params_true': view_params,
        'zero_thresh': zero_thresh,
    }

    ###########
    # cat-GMM #
    ###########
    start_time = time()
    models['cat_gmm'].fit(np.hstack(X_tr))
    runtimes['cat_gmm'] = time() - start_time
    res_writer.write('fitting grid search cat-GMM took {:1.2f} seconds'.format(
        runtimes['cat_gmm']))

    results_df = add_gs_results(results_df=results_df,
                                model=models['cat_gmm'],
                                model_name='gmm_cat',
                                dataset='full',
                                view='both',
                                X_tr_precomp_dists=dists_cat,
                                **kws)

    #############
    # View GMMs #
    #############
    for v in range(n_views):
        start_time = time()
        models['view_gmms'][v].fit(X_tr[v])
        runtimes['gmm_view_{}'.format(v)] = time() - start_time
        res_writer.write(
            'fitting marginal view {} GMM took {:1.2f} seconds'.format(
                v, runtimes['gmm_view_{}'.format(v)]))

        # gmm fit on this view
        results_df = add_gs_results(results_df=results_df,
                                    model=models['view_gmms'][v],
                                    model_name='marginal_view_{}'.format(v),
                                    dataset='view',
                                    view=v,
                                    X_tr_precomp_dists=dists_views[v],
                                    **kws)

    #############
    # Full MVMM #
    #############
    start_time = time()
    models['full_mvmm'].fit(X_tr)
    runtimes['full_mvmm'] = time() - start_time
    res_writer.write('fitting full mvmm took {:1.2f} seconds'.format(
        runtimes['full_mvmm']))

    results_df = add_gs_results(results_df=results_df,
                                model=models['full_mvmm'],
                                model_name='full_mvmm',
                                run_biptsp_on_full=True,
                                dataset='full',
                                view='both',
                                X_tr_precomp_dists=dists_cat,
                                **kws)

    for v in range(n_views):
        # add MVMM results for this view
        results_df = add_gs_results(
            results_df=results_df,
            model=models['full_mvmm'],
            model_name='full_mvmm',
            dataset='view',
            view=v,
            X_tr_precomp_dists=dists_views[v],  # TODO is this what we want
            **kws)

    #######################################################
    # log pen MVMM, block diagonal MVMM, spectral penalty #
    #######################################################
    for name, label in penalized_mvmms:
        if name in to_exclude:
            continue

        start_time = time()
        models[name].fit(X_tr)
        runtimes[name] = time() - start_time
        res_writer.write(
            'fitting grid search for {} took {:1.2f} seconds'.format(
                label, runtimes[name]))

        results_df = add_gs_results(results_df=results_df,
                                    model=models[name],
                                    model_name=name,
                                    dataset='full',
                                    view='both',
                                    X_tr_precomp_dists=dists_cat,
                                    **kws)

        for v in range(n_views):
            # add this model's results for this view
            results_df = add_gs_results(
                results_df=results_df,
                model=models[name],
                model_name=name,
                dataset='view',
                view=v,
                X_tr_precomp_dists=dists_views[v],  # TODO: is this what we want
                **kws)

    ##########
    # oracle #
    ##########
    results_df = add_gs_results(
        results_df,
        model=models['oracle'],
        model_name='oracle',
        run_biptsp_on_full=False,  # or true?
        dataset='full',
        view='both',
        **kws)

    # Some formatting
    # ensure these columns are saved as integers
    int_cols = [
        'mc_index', 'best_tuning_idx', 'n_comp_est', 'n_comp_resid',
        'n_comp_tot_est', 'n_samples', 'tune_idx'
    ]
    results_df[int_cols] = results_df[int_cols].astype(int)

    # block diagonal summary of Pi estimates
    bd_summary = {}
    for name, _ in penalized_mvmms:
        if name in to_exclude:
            continue

        _sim_stub = deepcopy(sim_stub)
        _sim_stub.update({'model_name': name})
        bd_summary[name] = \
            get_bd_summary_for_gs(_sim_stub, models[name],
                                  zero_thresh=zero_thresh)

    res_writer.write(
        "Entire simulation took {:1.2f} seconds".format(
            time() - overall_start_time))

    tr_data = {'X_tr': X_tr, 'Y_tr': Y_tr}

    return (results_df, clf_results, models, bd_summary, Pi_empirical,
            tr_data, runtimes)
# Sample filtering options.
filter_kws = {
    'tumor_sample_only': True,
    'primary_tumor_only': True,
    'keep_first_of_participant_multiples': True,
    'ensure_participant_idx': True,
    'verbose': True,
}

# Input directory and the per cancer type / feature list output directories.
raw_data_dir = TCGAPaths().raw_data_dir
pro_data_dir = make_and_get_dir(TCGAPaths().pro_data_dir, cancer_type)
feat_save_dir = make_and_get_dir(pro_data_dir, feat_list)

# Log file stored next to the saved features; starts with the input args.
res_writer = ResultsWriter(
    join(feat_save_dir, 'log.txt'),
    delete_if_exists=True,
)
res_writer.write(args)

# File name of each data type.
fnames = {
    'mi_rna':
        'pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.csv',
    'rna':
        'EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv',
    'dna_meth':
        'jhu-usc.edu_PANCAN_merged_HumanMethylation27_HumanMethylation450.betaValue_whitelisted.tsv',
    'cp':
        'all_data_by_genes_whitelisted.tsv',
}

start_time = time()

##################
# load metadata  #
##################
# Log file stub: 'mvmm_fitting' followed by one '_<n>' suffix per entry of
# args.n_view_comps.
# stub = 'mvmm_fitting_{}_{}'.format(args.n_comp_v0, args.n_comp_v1)
stub = ''.join(
    ['mvmm_fitting'] + ['_{}'.format(nc) for nc in args.n_view_comps]
)

results_dir = args.results_dir
# results_dir = make_and_get_dir(args.top_dir, args.sim_name)

# Output directories for this run, all derived from results_dir.
log_dir = make_and_get_dir(results_dir, 'log')
fitting_dir = make_and_get_dir(results_dir, 'model_fitting')
model_sel_dir = make_and_get_dir(results_dir, 'model_selection')
opt_diag_dir = make_and_get_dir(results_dir, 'opt_diagnostics')

# Log file named after the stub; starts with the input args.
res_writer = ResultsWriter(
    os.path.join(log_dir, '{}.txt'.format(stub)),
    delete_if_exists=True,
)
res_writer.write(args)

run_start_time = time()

# Models to skip, as requested on the command line.
to_exclude = []
# if args.exclude_sp_mvmm:
#     to_exclude.append('sp_mvmm')
if args.exclude_bd_mvmm:
    to_exclude.append('bd_mvmm')
if args.exclude_log_pen_mvmm:
    to_exclude.append('log_pen_mvmm')

#############
# load data #
#############
n_views = len(args.fpaths)
sel_models_fpath = os.path.join(fitting_dir, 'selected_models') sel_models = {} ########### # cat gmm # ########### # if 'cat_gmm' in results['models'].keys(): if cat_model is not None: if cat_model.check_fit(): estimator = cat_model.best_estimator_ sel_models['cat_gmm'] = estimator # model selection est_n_comp = estimator.n_components res_writer.write("Cat data GMM estimated number of components: {}". format(est_n_comp)) # _model_sel_dir = make_and_get_dir(model_sel_dir, 'cat_gmm') # plt.figure(figsize=(inches, inches)) plot_model_selection(cat_model, save_dir=model_sel_dir, name_stub='cat_gmm', title='GMM on concatenated data', inches=inches) # optimization history loss_history = estimator.opt_data_['history']['obs_nll'] plot_loss_history(loss_history, loss_name='Observed data negative log-likelihood') save_fig(os.path.join(opt_diag_dir, 'cat_best_model_opt_history.png')) else: