Example #1
0
# Output locations for this run, all rooted at results_dir.
log_dir = make_and_get_dir(results_dir, 'log')
fitting_dir = make_and_get_dir(results_dir, 'model_fitting')
mv_fitting_dir = make_and_get_dir(fitting_dir, 'multi_view')
model_sel_dir = make_and_get_dir(results_dir, 'model_selection')

# Both model selection outputs currently share model_sel_dir; the dedicated
# 'bd_mvmm' / 'log_pen_mvmm' subdirectories are disabled.
bd_sel_dir = make_and_get_dir(model_sel_dir)
log_sel_dir = make_and_get_dir(model_sel_dir)
opt_diag_dir = make_and_get_dir(results_dir, 'opt_diagnostics')

# Log file for the model selection step.
res_writer = ResultsWriter(
    os.path.join(log_dir, 'mvmm_model_selection.txt'),
    delete_if_exists=True,
)

# Record the user supplied selection settings.
res_writer.write(f'user_bd_mvmm_best_idx: {args.user_bd_mvmm_best_idx}')
res_writer.write(
    f'user_log_pen_mvmm_best_idx: {args.user_log_pen_mvmm_best_idx}')
res_writer.write(f'select_metric: {args.select_metric}')

#############
# load data #
#############

mvmm_results = load_fitted_mvmms(fitting_dir)
dataset_names = mvmm_results['dataset_names']

# possibly load existing selected models
sel_models_fpath = os.path.join(fitting_dir, 'selected_models')
Example #2
0
# Assemble the command line parser: optimization and base GMM options
# first, then the options added by bayes_parser.
parser = bayes_parser(
    add_parsers(parser, to_add=[general_opt_parser, base_gmm_parser]))

args = parser.parse_args()
bayes_submit(args)  # called with the raw args, before formatting
args = format_mini_experiment(args)

# Output locations for this run.
results_dir = args.results_dir
log_dir = make_and_get_dir(results_dir, 'log')
fitting_dir = make_and_get_dir(results_dir, 'model_fitting', 'single_view')

# Log file for the single view fits; the parsed arguments go in first.
res_writer = ResultsWriter(
    join(log_dir, 'single_view_fitting.txt'),
    delete_if_exists=True,
)
res_writer.write(args)

run_start_time = time()

# one file path per view
n_views = len(args.fpaths)

#############
# load data #
#############

(view_data,
 dataset_names,
 sample_names,
 feat_names) = load_data(*args.fpaths)

for v in range(n_views):

    res_writer.write('{} (view {}) shape : {}'.format(dataset_names[v], v,
Example #3
0
# else:
#     rank_sel_kws = {'n_components': 'rmt_threshold',
#                     'rank_sel_kws': {'thresh_method': 'dg'}}

# rank_sel_kws = {'n_components': 'bai_ng_bic',
#                 'rank_sel_kws': {'who': 1},
#                 'max_rank': 400
#                 }


# Fit a PCA to every entry of `data`, logging the entry's key, shape, fit
# runtime and estimated number of components to a per-entry log file.
for k in data.keys():
    # one log file per entry, named '<key>_log.txt' inside diagnostics_dir
    res_writer = ResultsWriter(fpath=join(diagnostics_dir,
                                          '{}_log.txt'.format(k)),
                               delete_if_exists=True)

    res_writer.write(k)
    res_writer.write('shape {}'.format(data[k].shape))

    # rank selection settings forwarded verbatim to the PCA constructor
    # NOTE(review): the meaning of 'rmt_threshold' / 'dg' is defined by the
    # PCA class, which is not visible here -- confirm against that class.
    rank_sel_kws = {'n_components': 'rmt_threshold',
                    'rank_sel_kws': {'thresh_method': 'dg'}}

    pca = PCA(**rank_sel_kws)

    # time only the fit itself; data[k].values is what gets decomposed
    start_time = time()
    pca.fit(data[k].values)
    runtime = time() - start_time
    res_writer.write("computed pca took {:1.2f} seconds".format(runtime))
    res_writer.write('Estimated n components {}'.format(pca.n_components_))

    ###################
    # save fitted PCA #
Example #4
0
parser = bayes_parser(parser)
args = format_mini_experiment(parser.parse_args())

# The job name mirrors the simulation name as given on the command line,
# i.e. before the fallback name below is applied.
args.job_name = args.sim_name

bayes_submit(args)

# fall back to a placeholder name when none was provided
if args.sim_name is None:
    args.sim_name = 'meow'

save_dir = make_and_get_dir(Paths().results_dir, 'single', args.sim_name)

res_writer = ResultsWriter(
    os.path.join(save_dir, 'results.txt'),
    delete_if_exists=True,
)
res_writer.write('\n\n\n Input args')
res_writer.write(args)

rng = check_random_state(args.metaseed)

# Models to skip: the spectral penalty MVMM is always excluded, the other
# two only when the matching flag is set.
to_exclude = ['sp_mvmm']
to_exclude += [
    model_name
    for model_name, is_excluded in (
        ('bd_mvmm', args.exclude_bd_mvmm),
        ('log_pen_mvmm', args.exclude_log_pen_mvmm),
    )
    if is_excluded
]

inches = 8

##############
Example #5
0
def _add_full_and_view_gs_results(results_df, model, model_name, dists_cat,
                                  dists_views, kws, **full_kws):
    """
    Append a model's grid search results for the full data set and then for
    each individual view.

    Parameters
    ----------
    results_df: pd.DataFrame
        Results accumulated so far.

    model
        Fitted model handed to add_gs_results.

    model_name: str
        Name recorded for the model.

    dists_cat
        Precomputed pairwise distances of the concatenated training data.

    dists_views: list
        Precomputed pairwise distances of each view's training data, one
        entry per view.

    kws: dict
        Keyword arguments shared by every add_gs_results call.

    **full_kws
        Extra keyword arguments for the full data call only.

    Returns
    -------
    results_df
        Whatever the last add_gs_results call returned.
    """
    results_df = add_gs_results(results_df=results_df,
                                model=model,
                                model_name=model_name,
                                dataset='full',
                                view='both',
                                X_tr_precomp_dists=dists_cat,
                                **full_kws,
                                **kws)

    for v, dists in enumerate(dists_views):
        results_df = add_gs_results(
            results_df=results_df,
            model=model,
            model_name=model_name,
            dataset='view',
            view=v,
            X_tr_precomp_dists=dists,  # TODO: is this what we want
            **kws)

    return results_df


def run_sim(models,
            data_dist,
            Pi,
            view_params,
            n_samples_tr,
            data_seed,
            n_samples_tst=2000,
            zero_thresh=0,
            reg_covar_mult=1e-2,
            mc_index=None,
            to_exclude=None,
            log_fpath=None):
    """
    Runs one simulation: samples train/test data, sets the covariance
    regularization of every model, fits the classifiers and the clustering
    models and collects their results.

    Parameters
    ----------
    models: dict
        The models to fit. The keys read here are 'clf', 'cat_gmm',
        'view_gmms', 'full_mvmm', 'log_pen_mvmm', 'bd_mvmm', 'sp_mvmm' and
        'oracle'. The models are modified in place (regularization, fitting).

    data_dist: callable(n_samples, random_state)
        Function to generate data. Must return (X, Y) where X holds one data
        matrix per view and Y has one column of labels per view.

    Pi
        True Pi matrix. Its shape is used for the empirical Pi estimate and
        it is forwarded to add_gs_results as Pi_true.

    view_params
        True view parameters, forwarded to add_gs_results.

    n_samples_tr: int
        Number of training samples.

    data_seed: int
        Seed for sampling train/test observations.

    n_samples_tst: int
        Number of samples to get for test data.

    zero_thresh: float
        Forwarded to add_gs_results and get_bd_summary_for_gs.

    reg_covar_mult: float
        Multiplier passed to default_cov_regularization.

    mc_index: None, int
        Index of this simulation run; stored in the results.

    to_exclude: None, list of str
        Models to skip when fitting. Entries must be among
        'bd_mvmm', 'sp_mvmm', 'log_pen_mvmm'.

    log_fpath: None, str
        Path handed to ResultsWriter for the simulation log.

    Returns
    -------
    results_df, clf_results, models, bd_summary, Pi_empirical, tr_data,
    runtimes

    results_df: pd.DataFrame
        Results of every fitted clustering model.

    clf_results: dict
        Classification results for the concatenated data ('cat') and for
        each view ('view_<v>').

    models: dict
        The input models after fitting.

    bd_summary: dict
        Output of get_bd_summary_for_gs for each penalized MVMM that was
        not excluded.

    Pi_empirical
        Output of get_empirical_pi on the training labels.

    tr_data: dict
        The training data, keys 'X_tr' and 'Y_tr'.

    runtimes: dict
        Fitting time in seconds of each model.
    """

    res_writer = ResultsWriter(log_fpath, delete_if_exists=True)
    # the helper has to be called -- formatting the function object itself
    # would log its repr instead of the current time
    res_writer.write("Beginning simulation at {}".format(get_current_time()))
    overall_start_time = time()

    seeds = get_seeds(random_state=data_seed, n_seeds=2)

    # sample data
    X_tr, Y_tr = data_dist(n_samples=n_samples_tr, random_state=seeds[0])
    X_tst, Y_tst = data_dist(n_samples=n_samples_tst, random_state=seeds[1])
    n_views = len(X_tr)

    # concatenated training views; computed once and shared by every step
    # that works on the concatenated data
    X_tr_cat = np.hstack(X_tr)

    Pi_empirical = get_empirical_pi(Y_tr, Pi.shape, scale='counts')

    runtimes = {}

    if to_exclude is None:
        to_exclude = []
    for m in to_exclude:
        assert m in ['bd_mvmm', 'sp_mvmm', 'log_pen_mvmm']

    #############################
    # covariance regularization #
    #############################

    # set cov reg for each view
    for v in range(n_views):
        reg = default_cov_regularization(X=X_tr[v], mult=reg_covar_mult)

        models['view_gmms'][v].base_estimator.set_params(reg_covar=reg)

        models['full_mvmm'].base_view_models[v].set_params(reg_covar=reg)

        # the two stage models need both their start and final stages set
        for name in ('bd_mvmm', 'log_pen_mvmm'):
            two_stage = models[name].base_estimator
            two_stage.base_start.base_view_models[v].\
                set_params(reg_covar=reg)
            two_stage.base_final.base_view_models[v].\
                set_params(reg_covar=reg)

        models['sp_mvmm'].base_mvmm_0.base_view_models[v].\
            set_params(reg_covar=reg)
        models['sp_mvmm'].base_wbd_mvmm.base_view_models[v].\
            set_params(reg_covar=reg)

        # log the regularization and the range of the feature variances
        res_writer.write(
            "\nCovarinace regularization for view {} is {}".format(v, reg))
        stds = X_tr[v].std(axis=0)
        res_writer.write("Smallest variance: {}".format(stds.min()**2))
        res_writer.write("Largest variance: {}".format(stds.max()**2))

    # for cat GMM
    reg = default_cov_regularization(X=X_tr_cat, mult=reg_covar_mult)
    models['cat_gmm'].base_estimator.set_params(reg_covar=reg)

    ##############
    # fit models #
    ##############

    # get classification results
    clf_results = {}
    start_time = time()
    clf_results['cat'] = clf_fit_and_score(clone(models['clf']),
                                           X_tr=X_tr_cat,
                                           y_tr=view_labs_to_overall(Y_tr),
                                           X_tst=np.hstack(X_tst),
                                           y_tst=view_labs_to_overall(Y_tst))

    runtimes['cat'] = time() - start_time

    for v in range(n_views):
        start_time = time()
        clf_results['view_{}'.format(v)] =\
            clf_fit_and_score(clone(models['clf']),
                              X_tr=X_tr[v],
                              y_tr=Y_tr[:, v],
                              X_tst=X_tst[v],
                              y_tst=Y_tst[:, v])

        runtimes['clf_view_{}'.format(v)] = time() - start_time

    # fit clustering
    simplefilter('ignore', ConvergenceWarning)

    results_df = pd.DataFrame()

    sim_stub = {'mc_index': mc_index, 'n_samples': n_samples_tr}

    dists_cat = pairwise_distances(X=X_tr_cat)

    dists_views = [pairwise_distances(X=X_tr[v]) for v in range(n_views)]

    # arguments shared by every add_gs_results call
    kws = {
        'sim_stub': sim_stub,
        'X_tr': X_tr,
        'Y_tr': Y_tr,
        'X_tst': X_tst,
        'Y_tst': Y_tst,
        'Pi_true': Pi,
        'view_params_true': view_params,
        'zero_thresh': zero_thresh,
    }

    ###########
    # cat-GMM #
    ###########

    start_time = time()
    models['cat_gmm'].fit(X_tr_cat)

    runtimes['cat_gmm'] = time() - start_time
    res_writer.write('fitting grid search cat-GMM took {:1.2f} seconds'.format(
        runtimes['cat_gmm']))

    results_df = add_gs_results(results_df=results_df,
                                model=models['cat_gmm'],
                                model_name='gmm_cat',
                                dataset='full',
                                view='both',
                                X_tr_precomp_dists=dists_cat,
                                **kws)

    #############
    # View GMMs #
    #############

    for v in range(n_views):
        start_time = time()
        models['view_gmms'][v].fit(X_tr[v])

        runtimes['gmm_view_{}'.format(v)] = time() - start_time
        res_writer.write(
            'fitting marginal view {} GMM took {:1.2f} seconds'.format(
                v, runtimes['gmm_view_{}'.format(v)]))

        # gmm fit on this view
        results_df = add_gs_results(results_df=results_df,
                                    model=models['view_gmms'][v],
                                    model_name='marginal_view_{}'.format(v),
                                    dataset='view',
                                    view=v,
                                    X_tr_precomp_dists=dists_views[v],
                                    **kws)

    #############
    # Full MVMM #
    #############

    start_time = time()
    models['full_mvmm'].fit(X_tr)

    runtimes['full_mvmm'] = time() - start_time
    res_writer.write('fitting full mvmm took {:1.2f} seconds'.format(
        runtimes['full_mvmm']))

    results_df = _add_full_and_view_gs_results(results_df,
                                               models['full_mvmm'],
                                               'full_mvmm',
                                               dists_cat,
                                               dists_views,
                                               kws,
                                               run_biptsp_on_full=True)

    ###################
    # penalized MVMMs #
    ###################

    # (model key, name used in the log message); the order fixes the order
    # of the rows in results_df and of the keys in bd_summary
    penalized_mvmms = [('log_pen_mvmm', 'log pen mvmm'),
                       ('bd_mvmm', 'block diag mvmm'),
                       ('sp_mvmm', 'spect pen mvmm')]

    for name, label in penalized_mvmms:
        if name in to_exclude:
            continue

        start_time = time()
        models[name].fit(X_tr)

        runtimes[name] = time() - start_time
        res_writer.write(
            'fitting grid search for {} took {:1.2f} seconds'.format(
                label, runtimes[name]))

        results_df = _add_full_and_view_gs_results(results_df,
                                                   models[name],
                                                   name,
                                                   dists_cat,
                                                   dists_views,
                                                   kws)

    ##########
    # oracle #
    ##########

    results_df = add_gs_results(
        results_df,
        model=models['oracle'],
        model_name='oracle',
        run_biptsp_on_full=False,  # or true?
        dataset='full',
        view='both',
        **kws)

    # Some formatting
    # ensure these columns are saved as integers
    int_cols = [
        'mc_index', 'best_tuning_idx', 'n_comp_est', 'n_comp_resid',
        'n_comp_tot_est', 'n_samples', 'tune_idx'
    ]
    results_df[int_cols] = results_df[int_cols].astype(int)

    # block diagonal summary of Pi estimates
    bd_summary = {}
    for name, _ in penalized_mvmms:
        if name in to_exclude:
            continue

        _sim_stub = deepcopy(sim_stub)
        _sim_stub.update({'model_name': name})
        bd_summary[name] = \
            get_bd_summary_for_gs(_sim_stub, models[name],
                                  zero_thresh=zero_thresh)

    res_writer.write(
        "Entire simulation took {:1.2f} seconds".format(time() -
                                                        overall_start_time))

    tr_data = {'X_tr': X_tr, 'Y_tr': Y_tr}

    return results_df, clf_results, models, bd_summary, Pi_empirical, tr_data,\
        runtimes
Example #6
0
# Sample filtering options.
filter_kws = dict(
    tumor_sample_only=True,
    primary_tumor_only=True,
    keep_first_of_participant_multiples=True,
    ensure_participant_idx=True,
    verbose=True,
)

# Raw input directory and the processed output directories for this cancer
# type / feature list.
raw_data_dir = TCGAPaths().raw_data_dir
pro_data_dir = make_and_get_dir(TCGAPaths().pro_data_dir, cancer_type)
feat_save_dir = make_and_get_dir(pro_data_dir, feat_list)

res_writer = ResultsWriter(
    join(feat_save_dir, 'log.txt'),
    delete_if_exists=True,
)
res_writer.write(args)

# File name of each data type.
fnames = dict(
    mi_rna='pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.csv',
    rna='EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv',
    dna_meth='jhu-usc.edu_PANCAN_merged_HumanMethylation27_HumanMethylation450.betaValue_whitelisted.tsv',
    cp='all_data_by_genes_whitelisted.tsv',
)

start_time = time()

##################
# load  metadata #
##################
Example #7
0
# Log file stub: 'mvmm_fitting' followed by the number of components of
# each view, joined by underscores.
stub = 'mvmm_fitting' + ''.join(
    '_{}'.format(nc) for nc in args.n_view_comps)

# Output locations for this run.
results_dir = args.results_dir
log_dir = make_and_get_dir(results_dir, 'log')
fitting_dir = make_and_get_dir(results_dir, 'model_fitting')
model_sel_dir = make_and_get_dir(results_dir, 'model_selection')
opt_diag_dir = make_and_get_dir(results_dir, 'opt_diagnostics')

res_writer = ResultsWriter(
    os.path.join(log_dir, f'{stub}.txt'),
    delete_if_exists=True,
)
res_writer.write(args)

run_start_time = time()

# Models to skip, one per exclusion flag that is set. The spectral penalty
# MVMM flag is not consulted here.
to_exclude = [
    model_name
    for model_name, is_excluded in (
        ('bd_mvmm', args.exclude_bd_mvmm),
        ('log_pen_mvmm', args.exclude_log_pen_mvmm),
    )
    if is_excluded
]

#############
# load data #
#############
n_views = len(args.fpaths)
Example #8
0
# Selected estimators, keyed by model name, and the path associated with
# them inside fitting_dir.
sel_models = dict()
sel_models_fpath = os.path.join(fitting_dir, 'selected_models')

###########
# cat gmm #
###########
# if 'cat_gmm' in results['models'].keys():
if cat_model is not None:
    if cat_model.check_fit():

        estimator = cat_model.best_estimator_
        sel_models['cat_gmm'] = estimator

        # model selection
        est_n_comp = estimator.n_components
        res_writer.write("Cat data GMM estimated number of components: {}".
                         format(est_n_comp))

        # _model_sel_dir = make_and_get_dir(model_sel_dir, 'cat_gmm')
        # plt.figure(figsize=(inches, inches))
        plot_model_selection(cat_model, save_dir=model_sel_dir,
                             name_stub='cat_gmm',
                             title='GMM on concatenated data',
                             inches=inches)

        # optimization history
        loss_history = estimator.opt_data_['history']['obs_nll']
        plot_loss_history(loss_history,
                          loss_name='Observed data negative log-likelihood')
        save_fig(os.path.join(opt_diag_dir, 'cat_best_model_opt_history.png'))

    else: