Example #1
def est_age(input_pkl, age_column, analyses, output_csv):
    """Estimate age using cgAgeR"""
    import os
    import pandas as pd
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri, numpy2ri
    pandas2ri.activate()
    os.makedirs(output_csv[:output_csv.rfind('/')], exist_ok=True)
    methyl_array = MethylationArray.from_pickle(input_pkl)
    if age_column:
        age_column = pandas2ri.py2ri(methyl_array.pheno[age_column])
    else:
        age_column = robjects.r('NULL')
    run_analyses = dict(epitoc=False, horvath=False, hannum=False)
    for analysis in analyses:
        run_analyses[analysis] = True
    importr('cgageR')
    returned_ages = robjects.r(
        """function (beta, hannum, horvath, epitoc, age) {
                return(getAgeR(beta, epitoc=epitoc, horvath=horvath, hannum=hannum, chrage=age))
                }""")(methyl_array.beta.T, run_analyses['hannum'],
                      run_analyses['horvath'], run_analyses['epitoc'],
                      age_column)
    result_dfs = []
    return_data = lambda data_str: robjects.r(
        """function (results) results{}""".format(data_str))(returned_ages)
    if 'hannum' in analyses:
        result_dfs.append(
            pandas2ri.ri2py(
                return_data('$HannumClock.output$Hannum.Clock.Est')))
    if 'epitoc' in analyses:
        result_dfs.append(
            pandas2ri.ri2py(return_data('$EpiTOC.output$EpiTOC.Est')))
    if 'horvath' in analyses:
        result_dfs.append(
            pandas2ri.ri2py(return_data('$HorvathClock.output$Horvath.Est')))
    df = pd.concat(result_dfs, axis=1)
    df.index = methyl_array.pheno.index
    df.to_csv(output_csv)
    print(df)
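A minimal invocation sketch for est_age() above; the input pickle, the 'Age' pheno column, and the output path are hypothetical placeholders rather than files shipped with the project.

# Hypothetical call: estimate Horvath and Hannum ages from a saved MethylationArray pickle.
est_age(input_pkl='train_val_test_sets/train_methyl_array.pkl',
        age_column='Age',
        analyses=['horvath', 'hannum'],
        output_csv='age_estimates/ages.csv')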
Example #2
def generate_embed(input_pkl, output_generate_pkl, output_embed_pkl, cuda,
                   input_vae_pkl, stratify_column, n_workers, batch_size):
    import copy
    import os
    import pandas as pd
    from methylnet.models import AutoEncoder, TybaltTitusVAE
    from methylnet.datasets import get_methylation_dataset
    import torch
    from torch.utils.data import DataLoader
    os.makedirs(os.path.dirname(output_generate_pkl), exist_ok=True)
    os.makedirs(os.path.dirname(output_embed_pkl), exist_ok=True)

    methyl_array = MethylationArray.from_pickle(
        input_pkl
    )  # generate results pickle to run through classification/regression report
    if cuda:
        model = torch.load(input_vae_pkl)
    else:
        model = torch.load(input_vae_pkl, map_location='cpu')
    test_methyl_dataset = get_methylation_dataset(copy.deepcopy(methyl_array),
                                                  stratify_column)
    test_methyl_dataloader = DataLoader(dataset=test_methyl_dataset,
                                        num_workers=n_workers,
                                        batch_size=min(
                                            batch_size,
                                            len(test_methyl_dataset)),
                                        shuffle=False)

    auto_encoder = AutoEncoder(autoencoder_model=model,
                               n_epochs=0,
                               loss_fn=None,
                               optimizer=None,
                               cuda=cuda,
                               kl_warm_up=None,
                               beta=None,
                               scheduler_opts={})
    Z, _, _ = auto_encoder.transform(test_methyl_dataloader)
    X_hat = auto_encoder.generate(test_methyl_dataloader)
    methyl_array.beta.iloc[:, :] = X_hat
    methyl_array.write_pickle(output_generate_pkl)
    methyl_array.beta = pd.DataFrame(Z, index=methyl_array.beta.index)
    methyl_array.write_pickle(output_embed_pkl)
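A minimal usage sketch for generate_embed() above, assuming a VAE model saved with torch.save() and a test MethylationArray pickle; all paths here are hypothetical.

# Hypothetical call: write reconstructed betas and latent embeddings to two pickles.
generate_embed(input_pkl='train_val_test_sets/test_methyl_array.pkl',
               output_generate_pkl='embeddings/generated_methyl_array.pkl',
               output_embed_pkl='embeddings/vae_methyl_arr.pkl',
               cuda=False,
               input_vae_pkl='embeddings/output_model.p',
               stratify_column='disease',
               n_workers=4,
               batch_size=32)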
Example #3
def model_capsnet_(
        train_methyl_array='train_val_test_sets/train_methyl_array.pkl',
        val_methyl_array='train_val_test_sets/val_methyl_array.pkl',
        interest_col='disease',
        n_epochs=10,
        n_bins=0,
        bin_len=1000000,
        min_capsule_len=300,
        primary_caps_out_len=45,
        caps_out_len=45,
        hidden_topology='30,80,50',
        gamma=1e-2,
        decoder_topology='100,300',
        learning_rate=1e-2,
        routing_iterations=3,
        overlap=0.,
        custom_loss='none',
        gamma2=1e-2,
        job=0,
        capsule_choice=['genomic_binned'],
        custom_capsule_file='',
        test_methyl_array='',
        predict=False,
        batch_size=16,
        limited_capsule_names_file='',
        gsea_superset='',
        tissue='',
        number_sets=25,
        use_set=False,
        gene_context=False,
        select_subtypes=[],
        fit_spw=False,
        l1_l2='',
        custom_capsule_file2='',
        min_capsules=5):

    capsule_choice = list(capsule_choice)
    #custom_capsule_file=list(custom_capsule_file)
    # In Python 3 filter() returns an iterator, which is always truthy, so the
    # emptiness checks below only work if the result is materialized as a list.
    hlt_list = list(filter(None, hidden_topology.split(',')))
    hidden_topology = list(map(int, hlt_list)) if hlt_list else []
    hlt_list = list(filter(None, decoder_topology.split(',')))
    decoder_topology = list(map(int, hlt_list)) if hlt_list else []

    hidden_caps_layers = []
    include_last = False

    ma = MethylationArray.from_pickle(train_methyl_array)
    ma_v = MethylationArray.from_pickle(val_methyl_array)
    if test_methyl_array and predict:
        ma_t = MethylationArray.from_pickle(test_methyl_array)

    try:
        ma.remove_na_samples(interest_col)
        ma_v.remove_na_samples(interest_col)
        if test_methyl_array and predict:
            ma_t.remove_na_samples(interest_col)
    except Exception:
        # Ignore failures when dropping NA samples.
        pass

    if select_subtypes:
        print(ma.pheno[interest_col].unique())
        ma.pheno = ma.pheno.loc[ma.pheno[interest_col].isin(select_subtypes)]
        ma.beta = ma.beta.loc[ma.pheno.index]
        ma_v.pheno = ma_v.pheno.loc[ma_v.pheno[interest_col].isin(
            select_subtypes)]
        ma_v.beta = ma_v.beta.loc[ma_v.pheno.index]
        print(ma.pheno[interest_col].unique())

        if test_methyl_array and predict:
            ma_t.pheno = ma_t.pheno.loc[ma_t.pheno[interest_col].isin(
                select_subtypes)]
            ma_t.beta = ma_t.beta.loc[ma_t.pheno.index]

    if custom_capsule_file2 and os.path.exists(custom_capsule_file2):
        capsules_dict = torch.load(custom_capsule_file2)
        final_modules, modulecpgs, module_names = capsules_dict[
            'final_modules'], capsules_dict['modulecpgs'], capsules_dict[
                'module_names']
        if min_capsule_len > 1:
            include_capsules = [
                len(x) > min_capsule_len for x in final_modules
            ]
            final_modules = [
                final_modules[i] for i in range(len(final_modules))
                if include_capsules[i]
            ]
            module_names = [
                module_names[i] for i in range(len(module_names))
                if include_capsules[i]
            ]
            modulecpgs = (reduce(np.union1d, final_modules)).tolist()

    else:
        final_modules, modulecpgs, module_names = build_capsules(
            capsule_choice, overlap, bin_len, ma, include_last,
            min_capsule_len, custom_capsule_file, gsea_superset, tissue,
            gene_context, use_set, number_sets, limited_capsule_names_file)
        if custom_capsule_file2:
            torch.save(
                dict(final_modules=final_modules,
                     modulecpgs=modulecpgs,
                     module_names=module_names), custom_capsule_file2)

    assert len(
        final_modules) >= min_capsules, "Below the number of allowed capsules."

    if fit_spw:
        modulecpgs = list(reduce(lambda x, y: np.hstack((x, y)),
                                 final_modules))

    if not include_last:  # ERROR HAPPENS HERE!
        ma.beta = ma.beta.loc[:, modulecpgs]
        ma_v.beta = ma_v.beta.loc[:, modulecpgs]
        if test_methyl_array and predict:
            ma_t.beta = ma_t.beta.loc[:, modulecpgs]
    # https://github.com/higgsfield/Capsule-Network-Tutorial/blob/master/Capsule%20Network.ipynb
    original_interest_col = interest_col
    if n_bins:
        new_interest_col = interest_col + '_binned'
        ma.pheno.loc[:,
                     new_interest_col], bins = pd.cut(ma.pheno[interest_col],
                                                      bins=n_bins,
                                                      retbins=True)
        ma_v.pheno.loc[:,
                       new_interest_col], _ = pd.cut(ma_v.pheno[interest_col],
                                                     bins=bins,
                                                     retbins=True)
        if test_methyl_array and predict:
            ma_t.pheno.loc[:, new_interest_col], _ = pd.cut(
                ma_t.pheno[interest_col], bins=bins, retbins=True)
        interest_col = new_interest_col

    datasets = dict()

    datasets['train'] = MethylationDataset(
        ma,
        interest_col,
        modules=final_modules,
        module_names=module_names,
        original_interest_col=original_interest_col,
        run_spw=fit_spw)
    print(datasets['train'].X.isnull().sum().sum())
    datasets['val'] = MethylationDataset(
        ma_v,
        interest_col,
        modules=final_modules,
        module_names=module_names,
        original_interest_col=original_interest_col,
        run_spw=fit_spw)
    if test_methyl_array and predict:
        datasets['test'] = MethylationDataset(
            ma_t,
            interest_col,
            modules=final_modules,
            module_names=module_names,
            original_interest_col=original_interest_col,
            run_spw=fit_spw)

    dataloaders = dict()

    dataloaders['train'] = DataLoader(datasets['train'],
                                      batch_size=batch_size,
                                      shuffle=True,
                                      num_workers=8,
                                      pin_memory=True,
                                      drop_last=True)
    dataloaders['val'] = DataLoader(datasets['val'],
                                    batch_size=batch_size,
                                    shuffle=False,
                                    num_workers=8,
                                    pin_memory=True,
                                    drop_last=False)
    n_primary = len(final_modules)
    if test_methyl_array and predict:
        dataloaders['test'] = DataLoader(datasets['test'],
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=8,
                                         pin_memory=True,
                                         drop_last=False)

    n_inputs = list(map(len, final_modules))

    n_out_caps = len(datasets['train'].y_unique)

    if not fit_spw:
        print("Not fitting MethylSPWNet")
        primary_caps = PrimaryCaps(modules=final_modules,
                                   hidden_topology=hidden_topology,
                                   n_output=primary_caps_out_len)
        hidden_caps = []
        output_caps = CapsLayer(n_out_caps,
                                n_primary,
                                primary_caps_out_len,
                                caps_out_len,
                                routing_iterations=routing_iterations)
        decoder = Decoder(n_out_caps * caps_out_len, len(list(ma.beta)),
                          decoder_topology)
        model = CapsNet(primary_caps,
                        hidden_caps,
                        output_caps,
                        decoder,
                        gamma=gamma)

        if test_methyl_array and predict:
            model.load_state_dict(torch.load('capsnet_model.pkl'))

    else:
        print("Fitting MethylSPWNet")
        module_lens = [len(x) for x in final_modules]
        model = MethylSPWNet(module_lens,
                             hidden_topology,
                             dropout_p=0.2,
                             n_output=n_out_caps)
        if test_methyl_array and predict:
            model.load_state_dict(torch.load('spwnet_model.pkl'))

    if torch.cuda.is_available():
        model = model.cuda()

    # extract all c_ij for all layers across all batches, or just last batch

    if l1_l2 and fit_spw:
        l1, l2 = list(map(float, l1_l2.split(',')))
    elif fit_spw:
        l1, l2 = 0., 0.

    trainer = Trainer(model=model,
                      validation_dataloader=dataloaders['val'],
                      n_epochs=n_epochs,
                      lr=learning_rate,
                      n_primary=n_primary,
                      custom_loss=custom_loss,
                      gamma2=gamma2,
                      spw_mode=fit_spw,
                      l1=l1 if fit_spw else 0.,
                      l2=l2 if fit_spw else 0.)

    if not predict:
        try:
            #assert 1==2
            trainer.fit(dataloader=dataloaders['train'])
            val_loss = min(trainer.val_losses)
            torch.save(
                trainer.model.state_dict(),
                'capsnet_model.pkl' if not fit_spw else 'spwnet_model.pkl')
            if fit_spw:
                torch.save(
                    dict(final_modules=final_modules,
                         modulecpgs=modulecpgs,
                         module_names=module_names), 'spwnet_capsules.pkl')
                torch.save(
                    dict(module_names=module_names,
                         module_lens=module_lens,
                         dropout_p=0.2,
                         hidden_topology=hidden_topology,
                         n_output=n_out_caps), 'spwnet_config.pkl')
        except Exception as e:
            print(e)
            val_loss = -2

        with sqlite3.connect('jobs.db', check_same_thread=False) as conn:
            pd.DataFrame([job, val_loss],
                         index=['job', 'val_loss'],
                         columns=[0]).T.to_sql('val_loss',
                                               conn,
                                               if_exists='append')
    else:
        # Default so a value is still returned when no test array is supplied.
        val_loss = -1
        if test_methyl_array:
            trainer.weights = 1.
            Y = trainer.predict(dataloaders['test'])
            pickle.dump(Y, open('predictions.pkl', 'wb'))
    #print(val_loss)
    # print([min(trainer.val_losses),n_epochs,
    # 		n_bins,
    # 		bin_len,
    # 		min_capsule_len,
    # 		primary_caps_out_len,
    # 		caps_out_len,
    # 		hidden_topology,
    # 		gamma,
    # 		decoder_topology,
    # 		learning_rate,
    # 		routing_iterations])

    return val_loss
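A hedged invocation sketch for model_capsnet_() above; it leans on the default train/val pickle paths in the signature and, as written, persists capsnet_model.pkl and a row in jobs.db, so treat every path as a placeholder.

# Hypothetical run: train a capsule network on genomic bins for the 'disease' column.
val_loss = model_capsnet_(interest_col='disease',
                          n_epochs=10,
                          capsule_choice=['genomic_binned'],
                          min_capsule_len=300,
                          batch_size=16)
print(val_loss)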
Example #4
def bin_column(test_pkl, col, n_bins, output_test_pkl):
    """Convert continuous phenotype column into categorical by binning."""
    os.makedirs(output_test_pkl[:output_test_pkl.rfind('/')], exist_ok=True)
    test_methyl_array = MethylationArray.from_pickle(test_pkl)
    new_col_name = test_methyl_array.bin_column(col, n_bins)
    test_methyl_array.write_pickle(output_test_pkl)
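The actual binning is delegated to MethylationArray.bin_column(); the underlying idea is the same pandas.cut pattern used later in this file. A standalone sketch with a made-up pheno DataFrame:

import pandas as pd

# Hypothetical continuous phenotype binned into 3 categorical intervals.
pheno = pd.DataFrame({'Age': [21, 34, 47, 58, 63, 72]})
pheno['Age_binned'], bins = pd.cut(pheno['Age'], bins=3, retbins=True)
print(pheno['Age_binned'].unique())
print(bins)  # bin edges; reuse them on val/test sets via pd.cut(..., bins=bins)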
Example #5
def print_shape(input_pkl):
    """Print dimensions of beta matrix."""
    print(MethylationArray.from_pickle(input_pkl).beta.shape)
Example #6
def make_new_predictions(test_pkl, model_pickle, batch_size, n_workers,
                         interest_cols, categorical, cuda, categorical_encoder,
                         output_dir):
    """Run prediction model again to further assess outcome. Only evaluate prediction model."""
    os.makedirs(output_dir, exist_ok=True)
    test_methyl_array = MethylationArray.from_pickle(
        test_pkl
    )  # generate results pickle to run through classification/regression report
    if cuda:
        model = torch.load(model_pickle)
        model.vae.cuda_on = True
    else:
        model = torch.load(model_pickle, map_location='cpu')
        model.vae.cuda_on = False
    if not categorical:
        test_methyl_array.remove_na_samples(
            interest_cols if len(interest_cols) > 1 else interest_cols[0])
    if os.path.exists(categorical_encoder):
        categorical_encoder = pickle.load(open(categorical_encoder, 'rb'))
    else:
        categorical_encoder = None
    test_methyl_dataset = get_methylation_dataset(
        test_methyl_array,
        interest_cols,
        categorical=categorical,
        predict=True,
        categorical_encoder=categorical_encoder)
    test_methyl_dataloader = DataLoader(dataset=test_methyl_dataset,
                                        num_workers=n_workers,
                                        batch_size=min(
                                            batch_size,
                                            len(test_methyl_dataset)),
                                        shuffle=False)
    vae_mlp = MLPFinetuneVAE(mlp_model=model,
                             categorical=categorical,
                             cuda=cuda)

    Y_pred, Y_true, latent_projection, _ = vae_mlp.predict(
        test_methyl_dataloader)

    results = dict(test={})
    results['test']['y_pred'], results['test']['y_true'] = copy.deepcopy(
        Y_pred), copy.deepcopy(Y_true)

    if categorical:
        Y_true = Y_true.argmax(axis=1)[:, np.newaxis]
        Y_pred = Y_pred.argmax(axis=1)[:, np.newaxis]
    test_methyl_array = test_methyl_dataset.to_methyl_array()

    Y_pred = pd.DataFrame(
        Y_pred.flatten() if (np.array(Y_pred.shape) == 1).any() else Y_pred,
        index=test_methyl_array.beta.index,
        columns=(['y_pred'] if categorical else interest_cols))
    Y_true = pd.DataFrame(
        Y_true.flatten() if (np.array(Y_true.shape) == 1).any() else Y_true,
        index=test_methyl_array.beta.index,
        columns=(['y_true'] if categorical else interest_cols))
    results_df = pd.concat([
        Y_pred, Y_true
    ], axis=1) if categorical else pd.concat([
        Y_pred.rename(columns={name: name + '_pred'
                               for name in list(Y_pred)}),
        Y_true.rename(columns={name: name + '_true'
                               for name in list(Y_true)})
    ], axis=1)  # FIXME
    latent_projection = pd.DataFrame(latent_projection,
                                     index=test_methyl_array.beta.index)
    test_methyl_array.beta = latent_projection

    output_file = join(output_dir, 'results.csv')
    results_file = join(output_dir, 'results.p')
    output_file_latent = join(output_dir, 'latent.csv')
    output_pkl = join(output_dir, 'vae_mlp_methyl_arr.pkl')

    test_methyl_array.write_pickle(output_pkl)
    pickle.dump(results, open(results_file, 'wb'))
    latent_projection.to_csv(output_file_latent)
    results_df.to_csv(output_file)
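Because the function writes paired y_pred/y_true columns to results.csv in the categorical case, the predictions can be scored afterwards; a hedged follow-up sketch (the output directory is a placeholder):

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# Hypothetical follow-up: score the categorical predictions written by make_new_predictions().
results_df = pd.read_csv('predictions/results.csv', index_col=0)
print(accuracy_score(results_df['y_true'], results_df['y_pred']))
print(classification_report(results_df['y_true'], results_df['y_pred']))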
Example #7
def train_predict(train_pkl,
                  test_pkl,
                  input_vae_pkl,
                  output_dir,
                  cuda,
                  interest_cols,
                  categorical,
                  disease_only,
                  hidden_layer_topology,
                  learning_rate_vae,
                  learning_rate_mlp,
                  weight_decay,
                  dropout_p,
                  n_epochs,
                  scheduler='null',
                  decay=0.5,
                  t_max=10,
                  eta_min=1e-6,
                  t_mult=2,
                  batch_size=50,
                  val_pkl='val_methyl_array.pkl',
                  n_workers=8,
                  add_validation_set=False,
                  loss_reduction='sum',
                  add_softmax=False):
    os.makedirs(output_dir, exist_ok=True)

    output_file = join(output_dir, 'results.csv')
    training_curve_file = join(output_dir, 'training_val_curve.p')
    results_file = join(output_dir, 'results.p')
    output_file_latent = join(output_dir, 'latent.csv')
    output_model = join(output_dir, 'output_model.p')
    output_pkl = join(output_dir, 'vae_mlp_methyl_arr.pkl')
    output_onehot_encoder = join(output_dir, 'one_hot_encoder.p')

    #input_dict = pickle.load(open(input_pkl,'rb'))
    if cuda:
        vae_model = torch.load(input_vae_pkl)
        vae_model.cuda_on = True
    else:
        vae_model = torch.load(input_vae_pkl, map_location='cpu')
        vae_model.cuda_on = False

    # methyl_array.split_train_test(train_p=train_percent, stratified=(True if categorical else False), disease_only=disease_only, key=interest_cols[0], subtype_delimiter=',')
    train_methyl_array = MethylationArray.from_pickle(train_pkl)
    val_methyl_array = MethylationArray.from_pickle(val_pkl)
    test_methyl_array = MethylationArray.from_pickle(test_pkl)

    if not categorical:
        train_methyl_array.remove_na_samples(interest_cols)
        val_methyl_array.remove_na_samples(interest_cols)
        test_methyl_array.remove_na_samples(interest_cols)

    print(train_methyl_array.beta.shape)
    print(val_methyl_array.beta.shape)
    print(test_methyl_array.beta.shape)

    if len(interest_cols) == 1 and disease_only and not interest_cols[0].endswith('_only'):
        print(interest_cols)
        interest_cols[0] += '_only'
        print(train_methyl_array.pheno[interest_cols[0]].unique())
        print(test_methyl_array.pheno[interest_cols[0]].unique())

    train_methyl_dataset = get_methylation_dataset(
        train_methyl_array,
        interest_cols,
        categorical=categorical,
        predict=True)  # train, test split? Add val set?
    #print(list(train_methyl_dataset.encoder.get_feature_names()))
    val_methyl_dataset = get_methylation_dataset(
        val_methyl_array,
        interest_cols,
        categorical=categorical,
        predict=True,
        categorical_encoder=train_methyl_dataset.encoder)
    test_methyl_dataset = get_methylation_dataset(
        test_methyl_array,
        interest_cols,
        categorical=categorical,
        predict=True,
        categorical_encoder=train_methyl_dataset.encoder)

    if not batch_size:
        batch_size = len(train_methyl_dataset)
    train_batch_size = min(batch_size, len(train_methyl_dataset))
    val_batch_size = min(batch_size, len(val_methyl_dataset))

    train_methyl_dataloader = DataLoader(dataset=train_methyl_dataset,
                                         num_workers=n_workers,
                                         batch_size=train_batch_size,
                                         shuffle=True)

    val_methyl_dataloader = DataLoader(dataset=val_methyl_dataset,
                                       num_workers=n_workers,
                                       batch_size=val_batch_size,
                                       shuffle=True)  # False

    test_methyl_dataloader = DataLoader(dataset=test_methyl_dataset,
                                        num_workers=n_workers,
                                        batch_size=min(
                                            batch_size,
                                            len(test_methyl_dataset)),
                                        shuffle=False)

    scaling_factors = dict(
        val=float(len(val_methyl_dataset)) /
        ((len(val_methyl_dataset) // val_batch_size) * val_batch_size),
        train_batch_size=train_batch_size,
        val_batch_size=val_batch_size)

    model = VAE_MLP(vae_model=vae_model,
                    categorical=categorical,
                    hidden_layer_topology=hidden_layer_topology,
                    n_output=train_methyl_dataset.outcome_col.shape[1],
                    dropout_p=dropout_p,
                    add_softmax=add_softmax)

    class_weights = []
    if categorical:
        out_weight = Counter(
            np.argmax(train_methyl_dataset.outcome_col, axis=1))
        #total_samples=sum(out_weight.values())
        for k in sorted(list(out_weight.keys())):
            class_weights.append(1. / float(out_weight[k]))  # total_samples
        class_weights = np.array(class_weights)
        class_weights = (class_weights / class_weights.sum()).tolist()
        print(class_weights)

    if class_weights:
        class_weights = torch.FloatTensor(class_weights)
        if cuda:
            class_weights = class_weights.cuda()
    else:
        class_weights = None

    optimizer_vae = torch.optim.Adam(model.vae.parameters(),
                                     lr=learning_rate_vae,
                                     weight_decay=weight_decay)
    optimizer_mlp = torch.optim.Adam(model.mlp.parameters(),
                                     lr=learning_rate_mlp,
                                     weight_decay=weight_decay)
    loss_fn = CrossEntropyLoss(
        reduction=loss_reduction,
        weight=class_weights) if categorical else MSELoss(
            reduction=loss_reduction)  # 'sum'
    scheduler_opts = dict(scheduler=scheduler,
                          lr_scheduler_decay=decay,
                          T_max=t_max,
                          eta_min=eta_min,
                          T_mult=t_mult)
    vae_mlp = MLPFinetuneVAE(mlp_model=model,
                             n_epochs=n_epochs,
                             categorical=categorical,
                             loss_fn=loss_fn,
                             optimizer_vae=optimizer_vae,
                             optimizer_mlp=optimizer_mlp,
                             cuda=cuda,
                             scheduler_opts=scheduler_opts)
    if add_validation_set:
        vae_mlp.add_validation_set(val_methyl_dataloader)
    vae_mlp = vae_mlp.fit(train_methyl_dataloader)
    if 'encoder' in dir(train_methyl_dataset):
        pickle.dump(train_methyl_dataset.encoder,
                    open(output_onehot_encoder, 'wb'))
    results = dict(test={}, train={}, val={})
    results['train']['y_pred'], results['train'][
        'y_true'], _, _ = vae_mlp.predict(train_methyl_dataloader)
    results['val']['y_pred'], results['val']['y_true'], _, _ = vae_mlp.predict(
        val_methyl_dataloader)
    del train_methyl_dataloader, train_methyl_dataset
    """methyl_dataset=get_methylation_dataset(methyl_array,interest_cols,predict=True)
    methyl_dataset_loader = DataLoader(
        dataset=methyl_dataset,
        num_workers=9,
        batch_size=1,
        shuffle=False)"""
    Y_pred, Y_true, latent_projection, _ = vae_mlp.predict(
        test_methyl_dataloader
    )  # FIXME change to include predictions for all classes for AUC
    results['test']['y_pred'], results['test']['y_true'] = copy.deepcopy(
        Y_pred), copy.deepcopy(Y_true)
    if categorical:
        Y_true = Y_true.argmax(axis=1)[:, np.newaxis]
        Y_pred = Y_pred.argmax(axis=1)[:, np.newaxis]
    test_methyl_array = test_methyl_dataset.to_methyl_array()
    """if categorical:
        Y_true=test_methyl_dataset.encoder.inverse_transform(Y_true)[:,np.newaxis]
        Y_pred=test_methyl_dataset.encoder.inverse_transform(Y_pred)[:,np.newaxis]"""
    #sample_names = np.array(list(test_methyl_array.beta.index)) # FIXME
    #outcomes = np.array([outcome[0] for outcome in outcomes]) # FIXME
    Y_pred = pd.DataFrame(
        Y_pred.flatten() if (np.array(Y_pred.shape) == 1).any() else Y_pred,
        index=test_methyl_array.beta.index,
        columns=(['y_pred'] if categorical else
                 interest_cols))  #dict(zip(sample_names,outcomes))
    Y_true = pd.DataFrame(
        Y_true.flatten() if (np.array(Y_true.shape) == 1).any() else Y_true,
        index=test_methyl_array.beta.index,
        columns=(['y_true'] if categorical else interest_cols))
    results_df = pd.concat([
        Y_pred, Y_true
    ], axis=1) if categorical else pd.concat([
        Y_pred.rename(columns={name: name + '_pred'
                               for name in list(Y_pred)}),
        Y_true.rename(columns={name: name + '_true'
                               for name in list(Y_true)})
    ], axis=1)  # FIXME
    latent_projection = pd.DataFrame(latent_projection,
                                     index=test_methyl_array.beta.index)
    test_methyl_array.beta = latent_projection
    test_methyl_array.write_pickle(output_pkl)
    pickle.dump(results, open(results_file, 'wb'))
    pickle.dump(vae_mlp.training_plot_data, open(training_curve_file, 'wb'))
    latent_projection.to_csv(output_file_latent)
    torch.save(vae_mlp.model, output_model)
    results_df.to_csv(
        output_file)  #pickle.dump(outcome_dict, open(outcome_dict_file,'wb'))
    return latent_projection, Y_pred, Y_true, vae_mlp, scaling_factors
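The class-weight step above reduces to inverse-frequency weights normalized to sum to one; a self-contained numeric sketch with made-up labels:

import numpy as np
from collections import Counter

# Hypothetical label vector: class 0 appears three times, class 1 once.
labels = np.array([0, 0, 0, 1])
counts = Counter(labels.tolist())
weights = np.array([1. / counts[k] for k in sorted(counts)])
weights = weights / weights.sum()
print(weights)  # [0.25 0.75]: the rarer class receives the larger weight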
Example #8
def embed_vae(train_pkl,
              output_dir,
              cuda,
              n_latent,
              lr,
              weight_decay,
              n_epochs,
              hidden_layer_encoder_topology,
              kl_warm_up=0,
              beta=1.,
              scheduler='null',
              decay=0.5,
              t_max=10,
              eta_min=1e-6,
              t_mult=2,
              bce_loss=False,
              batch_size=50,
              val_pkl='val_methyl_array.pkl',
              n_workers=9,
              convolutional=False,
              height_kernel_sizes=[],
              width_kernel_sizes=[],
              add_validation_set=False,
              loss_reduction='sum',
              stratify_column='disease'):
    from methylnet.models import AutoEncoder, TybaltTitusVAE
    from methylnet.datasets import get_methylation_dataset
    import torch
    from torch.utils.data import DataLoader
    from torch.nn import MSELoss, BCELoss
    os.makedirs(output_dir, exist_ok=True)

    output_file = join(output_dir, 'output_latent.csv')
    output_model = join(output_dir, 'output_model.p')
    training_curve_file = join(output_dir, 'training_val_curve.p')
    outcome_dict_file = join(output_dir, 'output_outcomes.p')
    output_pkl = join(output_dir, 'vae_methyl_arr.pkl')

    #input_dict = pickle.load(open(input_pkl,'rb'))
    #methyl_array=MethylationArray(*extract_pheno_beta_df_from_pickle_dict(input_dict))
    #print(methyl_array.beta)
    # methyl_array.split_train_test(train_p=train_percent, stratified=True, disease_only=True, key='disease', subtype_delimiter=',')
    train_methyl_array = MethylationArray.from_pickle(train_pkl)
    val_methyl_array = MethylationArray.from_pickle(val_pkl)

    train_methyl_dataset = get_methylation_dataset(
        train_methyl_array, stratify_column)  # train, test split? Add val set?

    val_methyl_dataset = get_methylation_dataset(val_methyl_array,
                                                 stratify_column)

    if not batch_size:
        # Fall back to full-batch training when no batch size is given.
        batch_size = len(train_methyl_dataset)

    train_batch_size = min(batch_size, len(train_methyl_dataset))
    val_batch_size = min(batch_size, len(val_methyl_dataset))

    train_methyl_dataloader = DataLoader(
        dataset=train_methyl_dataset,
        num_workers=n_workers,  #n_workers
        batch_size=train_batch_size,
        shuffle=True,
        pin_memory=False)

    val_methyl_dataloader = DataLoader(dataset=val_methyl_dataset,
                                       num_workers=n_workers,
                                       batch_size=val_batch_size,
                                       shuffle=True,
                                       pin_memory=False)

    scaling_factors = dict(
        train=float(len(train_methyl_dataset)) /
        ((len(train_methyl_dataset) // train_batch_size) * train_batch_size),
        val=float(len(val_methyl_dataset)) /
        ((len(val_methyl_dataset) // val_batch_size) * val_batch_size),
        train_batch_size=train_batch_size,
        val_batch_size=val_batch_size)
    print('SCALE', len(train_methyl_dataset), len(val_methyl_dataset),
          train_batch_size, val_batch_size, scaling_factors)
    n_input = train_methyl_array.return_shape()[1]
    if not convolutional:
        model = TybaltTitusVAE(
            n_input=n_input,
            n_latent=n_latent,
            hidden_layer_encoder_topology=hidden_layer_encoder_topology,
            cuda=cuda)
    else:
        # Convolutional variant; assumes the training dataset exposes new_shape.
        model = CVAE(n_latent=n_latent,
                     in_shape=train_methyl_dataset.new_shape,
                     kernel_heights=height_kernel_sizes,
                     kernel_widths=width_kernel_sizes,
                     n_pre_latent=n_latent * 2)  # change soon

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)
    loss_fn = BCELoss(reduction=loss_reduction) if bce_loss else MSELoss(
        reduction=loss_reduction)  # 'sum'
    scheduler_opts = dict(scheduler=scheduler,
                          lr_scheduler_decay=decay,
                          T_max=t_max,
                          eta_min=eta_min,
                          T_mult=t_mult)
    auto_encoder = AutoEncoder(autoencoder_model=model,
                               n_epochs=n_epochs,
                               loss_fn=loss_fn,
                               optimizer=optimizer,
                               cuda=cuda,
                               kl_warm_up=kl_warm_up,
                               beta=beta,
                               scheduler_opts=scheduler_opts)
    if add_validation_set:
        auto_encoder.add_validation_set(val_methyl_dataloader)
    auto_encoder = auto_encoder.fit(train_methyl_dataloader)
    train_methyl_array = train_methyl_dataset.to_methyl_array()
    val_methyl_array = val_methyl_dataset.to_methyl_array()
    del val_methyl_dataloader, train_methyl_dataloader, val_methyl_dataset, train_methyl_dataset

    methyl_dataset = get_methylation_dataset(
        MethylationArrays([train_methyl_array, val_methyl_array]).combine(),
        stratify_column)
    methyl_dataset_loader = DataLoader(dataset=methyl_dataset,
                                       num_workers=n_workers,
                                       batch_size=1,
                                       shuffle=False)
    latent_projection, _, _ = auto_encoder.transform(methyl_dataset_loader)
    #print(latent_projection.shape)
    methyl_array = methyl_dataset.to_methyl_array()
    #sample_names = np.array([sample_name[0] for sample_name in sample_names]) # FIXME
    #outcomes = np.array([outcome[0] for outcome in outcomes]) # FIXME
    #outcome_dict=dict(zip(sample_names,outcomes))
    #print(methyl_array.beta)
    latent_projection = pd.DataFrame(latent_projection,
                                     index=methyl_array.beta.index)
    methyl_array.beta = latent_projection
    methyl_array.write_pickle(output_pkl)
    latent_projection.to_csv(output_file)
    pickle.dump(auto_encoder.training_plot_data, open(training_curve_file,
                                                      'wb'))
    torch.save(auto_encoder.model, output_model)
    #pickle.dump(outcome_dict, open(outcome_dict_file,'wb'))
    return latent_projection, None, scaling_factors, n_input, auto_encoder
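The scaling_factors dict above computes N / ((N // B) * B) per split; reading it as a correction for samples lost to incomplete batches is an assumption, but the arithmetic itself is simple:

# Hypothetical sizes: 103 samples with batch size 50 leave 100 samples in full batches.
n_samples, batch = 103, 50
scale = float(n_samples) / ((n_samples // batch) * batch)
print(scale)  # 1.03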
Example #9
def main():
    p = argparse.ArgumentParser()
    p.add_argument('--interest_col', type=str)
    p.add_argument('--n_bins', type=int)
    args = p.parse_args()
    bin_len = 1000000
    min_capsule_len = 350
    interest_col = args.interest_col
    n_bins = args.n_bins

    primary_caps_out_len = 40
    caps_out_len = 20
    n_epochs = 500
    hidden_topology = [30, 80, 50]
    gamma = 1e-2
    decoder_top = [100, 300]
    lr = 1e-3
    routing_iterations = 3

    if not os.path.exists('hg19.{}.bed'.format(bin_len)):
        BedTool('hg19.genome').makewindows(g='hg19.genome', w=bin_len).saveas(
            'hg19.{}.bed'.format(bin_len))  #.to_dataframe().shape

    ma = MethylationArray.from_pickle(
        'train_val_test_sets/train_methyl_array.pkl')
    ma_v = MethylationArray.from_pickle(
        'train_val_test_sets/val_methyl_array.pkl')

    include_last = False

    @pysnooper.snoop('get_mod.log')
    def get_final_modules(ma=ma,
                          a='450kannotations.bed',
                          b='lola_vignette_data/activeDHS_universe.bed',
                          include_last=False,
                          min_capsule_len=2000):
        allcpgs = ma.beta.columns.values
        df = BedTool(a).to_dataframe()
        df.iloc[:, 0] = df.iloc[:, 0].astype(str).map(
            lambda x: 'chr' + x.split('.')[0])
        df = df.set_index('name').loc[list(
            ma.beta)].reset_index().iloc[:, [1, 2, 3, 0]]
        df_bed = pd.read_table(b, header=None)
        df_bed['features'] = np.arange(df_bed.shape[0])
        df_bed = df_bed.iloc[:, [0, 1, 2, -1]]
        b = BedTool.from_dataframe(df)
        a = BedTool.from_dataframe(
            df_bed)  #('lola_vignette_data/activeDHS_universe.bed')
        c = a.intersect(b, wa=True, wb=True).sort()
        d = c.groupby(g=[1, 2, 3, 4], c=(8, 8), o=('count', 'distinct'))
        df2 = d.to_dataframe()
        df3 = df2.loc[df2.iloc[:, -2] > min_capsule_len]
        modules = [cpgs.split(',') for cpgs in df3.iloc[:, -1].values]
        modulecpgs = np.array(
            list(set(list(reduce(lambda x, y: x + y, modules)))))
        if include_last:
            missing_cpgs = np.setdiff1d(allcpgs, modulecpgs).tolist()
        final_modules = modules + ([missing_cpgs] if include_last else [])
        module_names = (df3.iloc[:, 0] + '_' + df3.iloc[:, 1].astype(str) +
                        '_' + df3.iloc[:, 2].astype(str)).tolist()
        return final_modules, modulecpgs, module_names

    final_modules, modulecpgs, module_names = get_final_modules(
        b='hg19.{}.bed'.format(bin_len),
        include_last=include_last,
        min_capsule_len=min_capsule_len)
    print('LEN_MODULES', len(final_modules))

    if not include_last:
        ma.beta = ma.beta.loc[:, modulecpgs]
        ma_v.beta = ma_v.beta.loc[:, modulecpgs]
    # https://github.com/higgsfield/Capsule-Network-Tutorial/blob/master/Capsule%20Network.ipynb

    def softmax(input_tensor, dim=1):
        # transpose input
        transposed_input = input_tensor.transpose(dim,
                                                  len(input_tensor.size()) - 1)
        # calculate softmax
        softmaxed_output = F.softmax(transposed_input.contiguous().view(
            -1, transposed_input.size(-1)),
                                     dim=-1)
        # un-transpose result
        return softmaxed_output.view(*transposed_input.size()).transpose(
            dim,
            len(input_tensor.size()) - 1)

    # Adds latent space extraction, and spits out csv line of SQL as text for UMAP.
    class MLP(nn.Module):
        def __init__(self,
                     n_input,
                     hidden_topology,
                     dropout_p,
                     n_outputs=1,
                     binary=False,
                     softmax=False):
            super(MLP, self).__init__()
            self.hidden_topology = hidden_topology
            self.topology = [n_input] + hidden_topology + [n_outputs]
            layers = [
                nn.Linear(self.topology[i], self.topology[i + 1])
                for i in range(len(self.topology) - 2)
            ]
            for layer in layers:
                torch.nn.init.xavier_uniform_(layer.weight)
            self.layers = [
                nn.Sequential(layer, nn.ReLU(), nn.Dropout(p=dropout_p))
                for layer in layers
            ]
            self.output_layer = nn.Linear(self.topology[-2], self.topology[-1])
            torch.nn.init.xavier_uniform_(self.output_layer.weight)
            if binary:
                output_transform = nn.Sigmoid()
            elif softmax:
                output_transform = nn.Softmax()
            else:
                output_transform = nn.Dropout(p=0.)
            self.layers.append(
                nn.Sequential(self.output_layer, output_transform))
            self.mlp = nn.Sequential(*self.layers)

        def forward(self, x):
            #print(x.shape)
            return self.mlp(x)

    class MethylationDataset(Dataset):
        def __init__(self,
                     methyl_arr,
                     outcome_col,
                     binarizer=None,
                     modules=[]):
            if binarizer is None:
                binarizer = LabelBinarizer()
                binarizer.fit(methyl_arr.pheno[outcome_col].astype(str).values)
            self.y = binarizer.transform(
                methyl_arr.pheno[outcome_col].astype(str).values)
            self.y_unique = np.unique(np.argmax(self.y, 1))
            self.binarizer = binarizer
            if not modules:
                modules = [list(methyl_arr.beta)]
            self.modules = modules
            self.X = methyl_arr.beta
            self.length = methyl_arr.beta.shape[0]

        def __len__(self):
            return self.length

        def __getitem__(self, i):
            return tuple([torch.FloatTensor(self.X.iloc[i].values)] + [
                torch.FloatTensor(self.X.iloc[i].loc[module].values)
                for module in self.modules
            ] + [torch.FloatTensor(self.y[i])])

    class PrimaryCaps(nn.Module):
        def __init__(self, modules, hidden_topology, n_output):
            super(PrimaryCaps, self).__init__()
            self.capsules = nn.ModuleList([
                MLP(len(module), hidden_topology, 0., n_outputs=n_output)
                for module in modules
            ])

        def forward(self, x):
            #print(self.capsules)
            u = [self.capsules[i](x[i]) for i in range(len(self.capsules))]
            u = torch.stack(u, dim=1)
            #print(u.size())
            return self.squash(u)

        def squash(self, x):
            # Capsule squashing nonlinearity: v = (|s|^2 / (1 + |s|^2)) * (s / |s|).
            squared_norm = (x**2).sum(-1, keepdim=True)
            #print('prim_norm',squared_norm.size())
            output_tensor = squared_norm * x / (
                (1. + squared_norm) * torch.sqrt(squared_norm))
            #print('z_init',output_tensor.size())
            return output_tensor

        def get_weights(self):
            return list(
                self.capsules[0].parameters()
            )[0].data  #self.state_dict()#[self.capsules[i].state_dict() for i in range(len(self.capsules))]

    class CapsLayer(nn.Module):
        def __init__(self,
                     n_capsules,
                     n_routes,
                     n_input,
                     n_output,
                     routing_iterations=3):
            super(CapsLayer, self).__init__()
            self.n_capsules = n_capsules
            self.num_routes = n_routes
            self.W = nn.Parameter(
                torch.randn(1, n_routes, n_capsules, n_output, n_input))
            self.routing_iterations = routing_iterations
            self.c_ij = None

        def forward(self, x):
            batch_size = x.size(0)
            x = torch.stack([x] * self.n_capsules, dim=2).unsqueeze(4)

            W = torch.cat([self.W] * batch_size, dim=0)
            #print('affine',W.size(),x.size())
            u_hat = torch.matmul(W, x)
            #print('affine_trans',u_hat.size())

            b_ij = Variable(torch.zeros(1, self.num_routes, self.n_capsules,
                                        1))

            if torch.cuda.is_available():
                b_ij = b_ij.cuda()

            # Dynamic routing-by-agreement: iteratively refine the coupling
            # coefficients c_ij from the agreement between predictions u_hat
            # and the current output capsules v_j.
            for iteration in range(self.routing_iterations):
                self.c_ij = softmax(b_ij)
                #print(c_ij)
                c_ij = torch.cat([self.c_ij] * batch_size, dim=0).unsqueeze(4)
                #print('coeff',c_ij.size())#[0,:,0,:])#.size())

                s_j = (c_ij * u_hat).sum(dim=1, keepdim=True)
                v_j = self.squash(s_j)
                #print('z',v_j.size())

                if iteration < self.routing_iterations - 1:
                    a_ij = torch.matmul(
                        u_hat.transpose(3, 4),
                        torch.cat([v_j] * self.num_routes, dim=1))
                    b_ij = b_ij + a_ij.squeeze(4).mean(dim=0, keepdim=True)

            return v_j.squeeze(1)

        def return_routing_coef(self):
            return self.c_ij

        def squash(self, x):
            #print(x.size())
            squared_norm = (x**2).sum(-1, keepdim=True)
            #print('norm',squared_norm.size())
            output_tensor = squared_norm * x / (
                (1. + squared_norm) * torch.sqrt(squared_norm))
            return output_tensor

    class Decoder(nn.Module):
        def __init__(self, n_input, n_output, hidden_topology):
            super(Decoder, self).__init__()
            self.decoder = MLP(n_input,
                               hidden_topology,
                               0.,
                               n_outputs=n_output,
                               binary=True)

        def forward(self, x):
            return self.decoder(x)

    class CapsNet(nn.Module):
        def __init__(self,
                     primary_caps,
                     caps_hidden_layers,
                     caps_output_layer,
                     decoder,
                     lr_balance=0.5,
                     gamma=0.005):
            super(CapsNet, self).__init__()
            self.primary_caps = primary_caps
            self.caps_hidden_layers = caps_hidden_layers
            self.caps_output_layer = caps_output_layer
            self.decoder = decoder
            self.recon_loss_fn = nn.BCELoss()
            self.lr_balance = lr_balance
            self.gamma = gamma

        def forward(self, x_orig, modules_input):
            x = self.primary_caps(modules_input)
            primary_caps_out = x  #.view(x.size(0),x.size(1)*x.size(2))
            #print(x.size())
            for layer in self.caps_hidden_layers:
                x = layer(x)

            y_pred = self.caps_output_layer(x)  #.squeeze(-1)
            #print(y_pred.shape)

            classes = torch.sqrt((y_pred**2).sum(2))
            classes = F.softmax(classes, dim=1)

            max_length_indices = classes.argmax(dim=1)
            masked = torch.sparse.torch.eye(self.caps_output_layer.n_capsules)
            if torch.cuda.is_available():
                masked = masked.cuda()
            masked = masked.index_select(
                dim=0, index=max_length_indices.squeeze(1).data)

            embedding = (y_pred * masked[:, :, None, None]).view(
                y_pred.size(0), -1)

            #print(y_pred.size())
            x_hat = self.decoder(embedding)  #.reshape(y_pred.size(0),-1))
            return x_orig, x_hat, y_pred, embedding, primary_caps_out

        def recon_loss(self, x_orig, x_hat):
            return self.recon_loss_fn(x_hat, x_orig)

        def margin_loss(self, x, labels):
            batch_size = x.size(0)

            # Capsule lengths act as class scores.
            v_c = torch.sqrt((x**2).sum(dim=2, keepdim=True))

            #print(v_c)

            # Margin loss (Sabour et al., 2017):
            # L_k = T_k * max(0, 0.9 - ||v_k||)^2 + lr_balance * (1 - T_k) * max(0, ||v_k|| - 0.1)^2
            left = (F.relu(0.9 - v_c)**2).view(batch_size, -1)
            right = (F.relu(v_c - 0.1)**2).view(batch_size, -1)
            #print(left)
            #print(right)
            #print(labels)

            loss = labels * left + self.lr_balance * (1.0 - labels) * right
            #print(loss.shape)
            loss = loss.sum(dim=1).mean()
            return loss

        def calculate_loss(self, x_orig, x_hat, y_pred, y_true):
            margin_loss = self.margin_loss(y_pred, y_true)
            recon_loss = self.gamma * self.recon_loss(x_orig, x_hat)
            loss = margin_loss + recon_loss
            return loss, margin_loss, recon_loss

    if n_bins:
        ma.pheno.loc[:, interest_col], bins = pd.cut(ma.pheno[interest_col],
                                                     bins=n_bins,
                                                     retbins=True)
        ma_v.pheno.loc[:, interest_col], bins = pd.cut(
            ma_v.pheno[interest_col],
            bins=bins,
            retbins=True,
        )

    dataset = MethylationDataset(ma, interest_col, modules=final_modules)
    dataset_v = MethylationDataset(ma_v, interest_col, modules=final_modules)

    dataloader = DataLoader(dataset,
                            batch_size=16,
                            shuffle=True,
                            num_workers=8,
                            drop_last=True)
    dataloader_v = DataLoader(dataset_v,
                              batch_size=16,
                              shuffle=False,
                              num_workers=8,
                              drop_last=False)

    n_inputs = list(map(len, final_modules))
    n_primary = len(final_modules)

    primary_caps = PrimaryCaps(modules=final_modules,
                               hidden_topology=hidden_topology,
                               n_output=primary_caps_out_len)
    hidden_caps = []
    n_out_caps = len(dataset.y_unique)
    output_caps = CapsLayer(n_out_caps,
                            n_primary,
                            primary_caps_out_len,
                            caps_out_len,
                            routing_iterations=routing_iterations)
    decoder = Decoder(n_out_caps * caps_out_len, len(list(ma.beta)),
                      decoder_top)
    capsnet = CapsNet(primary_caps,
                      hidden_caps,
                      output_caps,
                      decoder,
                      gamma=gamma)

    if torch.cuda.is_available():
        capsnet = capsnet.cuda()

    for d in ['figures/embeddings' + x for x in ['', '2', '3']]:
        os.makedirs(d, exist_ok=True)
    os.makedirs('results/routing_weights', exist_ok=True)
    # extract all c_ij for all layers across all batches, or just last batch
    optimizer = Adam(capsnet.parameters(), lr)
    scheduler = CosineAnnealingLR(optimizer,
                                  T_max=10,
                                  eta_min=0,
                                  last_epoch=-1)
    for epoch in range(n_epochs):
        print(epoch)
        capsnet.train(True)
        running_loss = 0.
        Y = {'true': [], 'pred': []}
        for i, batch in enumerate(dataloader):
            x_orig = batch[0]
            #print(x_orig)
            y_true = batch[-1]
            module_x = batch[1:-1]
            if torch.cuda.is_available():
                x_orig = x_orig.cuda()
                y_true = y_true.cuda()
                module_x = [mod.cuda() for mod in module_x]
            x_orig, x_hat, y_pred, embedding, primary_caps_out = capsnet(
                x_orig, module_x)
            loss, margin_loss, recon_loss = capsnet.calculate_loss(
                x_orig, x_hat, y_pred, y_true)
            Y['true'].extend(y_true.argmax(1).detach().cpu().numpy().tolist())
            Y['pred'].extend(
                F.softmax(torch.sqrt(
                    (y_pred**2
                     ).sum(2))).argmax(1).detach().cpu().numpy().tolist())
            train_loss = margin_loss.item()  #print(loss)
            running_loss += train_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        #print(capsnet.primary_caps.get_weights())
        running_loss /= (i + 1)
        print('Epoch {}: Train Loss {}, Train R2: {}, Train MAE: {}'.format(
            epoch, running_loss, r2_score(Y['true'], Y['pred']),
            mean_absolute_error(Y['true'], Y['pred'])))
        print(classification_report(Y['true'], Y['pred']))
        scheduler.step()
        capsnet.train(False)
        running_loss = np.zeros((3, )).astype(float)
        Y = {
            'true': [],
            'pred': [],
            'embeddings': [],
            'embeddings2': [],
            'embeddings3': [],
            'routing_weights': []
        }
        with torch.no_grad():
            for i, batch in enumerate(dataloader_v):
                x_orig = batch[0]
                y_true = batch[-1]
                module_x = batch[1:-1]
                if torch.cuda.is_available():
                    x_orig = x_orig.cuda()
                    y_true = y_true.cuda()
                    module_x = [mod.cuda() for mod in module_x]
                x_orig, x_hat, y_pred, embedding, primary_caps_out = capsnet(
                    x_orig, module_x)
                #print(primary_caps_out.size())
                routing_coefs = capsnet.caps_output_layer.return_routing_coef(
                ).detach().cpu().numpy()
                if not i:
                    Y['routing_weights'] = pd.DataFrame(
                        routing_coefs[0, ..., 0].T,
                        index=dataset.binarizer.classes_,
                        columns=module_names)
                else:
                    Y['routing_weights'] += pd.DataFrame(
                        routing_coefs[0, ..., 0].T,
                        index=dataset.binarizer.classes_,
                        columns=module_names)
                Y['embeddings3'].append(
                    torch.cat(
                        [primary_caps_out[i] for i in range(x_orig.size(0))],
                        dim=0).detach().cpu().numpy())
                primary_caps_out = primary_caps_out.view(
                    primary_caps_out.size(0),
                    primary_caps_out.size(1) * primary_caps_out.size(2))
                Y['embeddings'].append(embedding.detach().cpu().numpy())
                Y['embeddings2'].append(
                    primary_caps_out.detach().cpu().numpy())
                loss, margin_loss, recon_loss = capsnet.calculate_loss(
                    x_orig, x_hat, y_pred, y_true)
                val_loss = margin_loss.item()  #print(loss)
                running_loss = running_loss + np.array(
                    [loss.item(), margin_loss.item(),
                     recon_loss.item()])
                Y['true'].extend(
                    y_true.argmax(1).detach().cpu().numpy().tolist())
                Y['pred'].extend(
                    (y_pred**2
                     ).sum(2).argmax(1).detach().cpu().numpy().tolist())
            running_loss /= (i + 1)
            Y['routing_weights'].iloc[:, :] = Y['routing_weights'].values / (
                i + 1)

        Y['pred'] = np.array(Y['pred']).astype(str)
        Y['true'] = np.array(Y['true']).astype(str)
        #np.save('results/routing_weights/routing_weights.{}.npy'.format(epoch),Y['routing_weights'])
        pickle.dump(
            Y['routing_weights'],
            open('results/routing_weights/routing_weights.{}.p'.format(epoch),
                 'wb'))
        Y['embeddings'] = pd.DataFrame(PCA(n_components=2).fit_transform(
            np.vstack(Y['embeddings'])),
                                       columns=['x', 'y'])
        Y['embeddings2'] = pd.DataFrame(PCA(n_components=2).fit_transform(
            np.vstack(Y['embeddings2'])),
                                        columns=['x', 'y'])
        #print(list(map(lambda x: x.shape,Y['embeddings3'])))
        Y['embeddings3'] = pd.DataFrame(PCA(n_components=2).fit_transform(
            np.vstack(Y['embeddings3'])),
                                        columns=['x', 'y'])  #'z'
        Y['embeddings']['color'] = Y['true']
        Y['embeddings2']['color'] = Y['true']
        Y['embeddings3']['color'] = module_names * ma_v.beta.shape[
            0]  #Y['true']
        Y['embeddings3']['name'] = list(
            reduce(lambda x, y: x + y, [[i] * n_primary for i in Y['true']]))
        fig = px.scatter(Y['embeddings3'],
                         x="x",
                         y="y",
                         color="color",
                         symbol='name')  #, text='name')
        py.plot(fig,
                filename='figures/embeddings3/embeddings3.{}.pos.html'.format(
                    epoch),
                auto_open=False)
        #Y['embeddings3']['color']=list(reduce(lambda x,y:x+y,[[i]*n_primary for i in Y['true']]))
        fig = px.scatter(Y['embeddings3'], x="x", y="y",
                         color="name")  #, text='color')
        py.plot(fig,
                filename='figures/embeddings3/embeddings3.{}.true.html'.format(
                    epoch),
                auto_open=False)
        fig = px.scatter(Y['embeddings'], x="x", y="y", color="color")
        py.plot(fig,
                filename='figures/embeddings/embeddings.{}.true.html'.format(
                    epoch),
                auto_open=False)
        fig = px.scatter(Y['embeddings2'], x="x", y="y", color="color")
        py.plot(fig,
                filename='figures/embeddings2/embeddings2.{}.true.html'.format(
                    epoch),
                auto_open=False)
        Y['embeddings'].loc[:, 'color'] = Y['pred']
        Y['embeddings2'].loc[:, 'color'] = Y['pred']
        fig = px.scatter(Y['embeddings'], x="x", y="y", color="color")
        py.plot(fig,
                filename='figures/embeddings/embeddings.{}.pred.html'.format(
                    epoch),
                auto_open=False)
        fig = px.scatter(Y['embeddings2'], x="x", y="y", color="color")
        py.plot(fig,
                filename='figures/embeddings2/embeddings2.{}.pred.html'.format(
                    epoch),
                auto_open=False)
        print(
            'Epoch {}: Val Loss {}, Margin Loss {}, Recon Loss {}, Val R2: {}, Val MAE: {}'
            .format(
                epoch, running_loss[0], running_loss[1], running_loss[2],
                r2_score(Y['true'].astype(int), Y['pred'].astype(int)),
                mean_absolute_error(Y['true'].astype(int),
                                    Y['pred'].astype(int))))
        print(classification_report(Y['true'], Y['pred']))
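The evaluation block above repeats one pattern several times: stack the per-batch capsule embeddings, project them to two dimensions with PCA, attach class labels, and write an offline Plotly scatter. A minimal, self-contained sketch of that pattern, using random arrays as stand-ins for the capsule outputs and labels:

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.offline as py
from sklearn.decomposition import PCA

emb = np.random.randn(100, 32)              # stand-in for stacked capsule embeddings
labels = np.random.choice(['a', 'b'], 100)  # stand-in for Y['true']

# project to 2-D and attach a color column, as done once per epoch above
df = pd.DataFrame(PCA(n_components=2).fit_transform(emb), columns=['x', 'y'])
df['color'] = labels

# write a static HTML figure without opening a browser window
fig = px.scatter(df, x='x', y='y', color='color')
py.plot(fig, filename='embeddings_sketch.html', auto_open=False)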
Exemple #10
0
def return_spw_importances_(train_methyl_array,
                            val_methyl_array,
                            interest_col,
                            select_subtypes,
                            capsules_pickle,
                            include_last,
                            n_bins,
                            spw_config,
                            model_state_dict_pkl,
                            batch_size,
                            by_subtype=False):
    ma = MethylationArray.from_pickle(train_methyl_array)
    ma_v = MethylationArray.from_pickle(val_methyl_array)

    try:
        ma.remove_na_samples(interest_col)
        ma_v.remove_na_samples(interest_col)
    except Exception:
        # ignore if NA-sample removal is unavailable or unnecessary
        pass

    if select_subtypes:
        ma.pheno = ma.pheno.loc[ma.pheno[interest_col].isin(select_subtypes)]
        ma.beta = ma.beta.loc[ma.pheno.index]
        ma_v.pheno = ma_v.pheno.loc[ma_v.pheno[interest_col].isin(
            select_subtypes)]
        ma_v.beta = ma_v.beta.loc[ma_v.pheno.index]

    capsules_dict = torch.load(capsules_pickle)

    final_modules, modulecpgs, module_names = capsules_dict[
        'final_modules'], capsules_dict['modulecpgs'], capsules_dict[
            'module_names']

    if not include_last:
        ma.beta = ma.beta.loc[:, modulecpgs]
        ma_v.beta = ma_v.beta.loc[:, modulecpgs]

    original_interest_col = interest_col

    if n_bins:
        new_interest_col = interest_col + '_binned'
        ma.pheno.loc[:,
                     new_interest_col], bins = pd.cut(ma.pheno[interest_col],
                                                      bins=n_bins,
                                                      retbins=True)
        ma_v.pheno.loc[:,
                       new_interest_col], _ = pd.cut(ma_v.pheno[interest_col],
                                                     bins=bins,
                                                     retbins=True)
        interest_col = new_interest_col

    datasets = dict()
    datasets['train'] = MethylationDataset(
        ma,
        interest_col,
        modules=final_modules,
        module_names=module_names,
        original_interest_col=original_interest_col,
        run_spw=True)
    datasets['val'] = MethylationDataset(
        ma_v,
        interest_col,
        modules=final_modules,
        module_names=module_names,
        original_interest_col=original_interest_col,
        run_spw=True)

    y_val = datasets['val'].y_label
    y_val_uniq = np.unique(y_val)

    dataloaders = dict()
    dataloaders['train'] = DataLoader(datasets['train'],
                                      batch_size=batch_size,
                                      shuffle=True,
                                      num_workers=8,
                                      pin_memory=True,
                                      drop_last=True)
    dataloaders['val'] = DataLoader(datasets['val'],
                                    batch_size=batch_size,
                                    shuffle=False,
                                    num_workers=8,
                                    pin_memory=True,
                                    drop_last=False)
    n_primary = len(final_modules)

    spw_config = torch.load(spw_config)
    spw_config.pop('module_names')

    model = MethylSPWNet(**spw_config)
    model.load_state_dict(torch.load(model_state_dict_pkl))

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    pathway_extractor = model.pathways

    #extract_pathways = lambda modules_x:torch.cat([pathway_extractor[i](module_x) for i,module_x in enumerate(modules_x)],dim=1)

    tensor_data = dict(train=dict(X=[], y=[]), val=dict(X=[], y=[]))

    for k in tensor_data:
        for i, (batch) in enumerate(dataloaders[k]):
            x = batch[0]
            y_true = batch[-1].argmax(1)
            # batch[1:-1] holds the per-capsule module tensors; take the first
            # so the CPU and GPU paths pass the same object type downstream
            modules_x = batch[1:-1][0]
            if torch.cuda.is_available():
                x = x.cuda()
                modules_x = modules_x.cuda()
            tensor_data[k]['X'].append(
                pathway_extractor(x, modules_x).detach().cpu()
            )  #extract_pathways(modules_x).detach().cpu())
            tensor_data[k]['y'].append(y_true.flatten().view(-1, 1))
        tensor_data[k]['X'] = torch.cat(tensor_data[k]['X'], dim=0)
        tensor_data[k]['y'] = torch.cat(tensor_data[k]['y'], dim=0)
        print(tensor_data[k]['X'].size(), tensor_data[k]['y'].size())
        tensor_data[k] = TensorDataset(tensor_data[k]['X'],
                                       tensor_data[k]['y'])
        dataloaders[k] = DataLoader(tensor_data[k],
                                    batch_size=32,
                                    sampler=ImbalancedDatasetSampler(
                                        tensor_data[k]))
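    # The loop above caches pathway-level activations for every sample as plain
    # tensors, so attribution below only has to run through the output head.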

    model = model.output_net
    to_cuda = lambda x: x.cuda() if torch.cuda.is_available() else x
    y = np.unique(tensor_data['train'].tensors[1].numpy().flatten())
    gs = GradientShap(model)
    X_train = torch.cat(
        [next(iter(dataloaders['train']))[0] for i in range(2)], dim=0)
    if torch.cuda.is_available():
        X_train = X_train.cuda()

    #val_loader=iter(dataloaders['val'])

    def return_importances(dataloaders, X_train):
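        # Draw 20 random validation batches, attribute each with GradientShap
        # against the training baseline, and rank pathway capsules by their
        # summed absolute attribution.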
        attributions = []
        for i in range(20):
            batch = next(iter(dataloaders['val']))
            X_test = to_cuda(batch[0])
            y_test = to_cuda(batch[1].flatten())
            attributions.append(
                torch.abs(
                    gs.attribute(
                        X_test,
                        stdevs=0.03,
                        n_samples=200,
                        baselines=X_train,
                        target=y_test,
                        return_convergence_delta=False)))  #torch.tensor(y_i)
        attributions = torch.sum(torch.cat(attributions, dim=0), dim=0)
        importances = pd.DataFrame(
            pd.Series(attributions.detach().cpu().numpy(),
                      index=module_names).sort_values(ascending=False),
            columns=['importances'])
        return importances

    if by_subtype:
        importances = []
        for k in y_val_uniq:
            idx = np.where(y_val == k)[0]
            if len(idx) > 2:
                val_dataset = Subset(tensor_data['val'], idx)
                n_concat = int(np.ceil(64. / len(idx)))
                if n_concat > 1:
                    val_dataset = ConcatDataset([val_dataset] * n_concat)
                #sampler=SubsetRandomSampler(idx)
                dataloaders['val'] = DataLoader(val_dataset,
                                                batch_size=32,
                                                shuffle=True)
                df = return_importances(dataloaders, X_train)
                df['subtype'] = k
                importances.append(df)
        importances = pd.concat(importances)
    else:
        importances = return_importances(dataloaders, X_train)

    return importances
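A sketch of how return_spw_importances_ might be invoked. Every path below is a placeholder assumption (the train/val pickles and the 'disease_only' column reuse the defaults that appear in fit_group_lasso further down), not the project's actual configuration:

importances = return_spw_importances_(
    train_methyl_array='train_val_test_sets/train_methyl_array.pkl',
    val_methyl_array='train_val_test_sets/val_methyl_array.pkl',
    interest_col='disease_only',
    select_subtypes=[],                            # keep all subtypes
    capsules_pickle='capsules.p',                  # hypothetical path
    include_last=False,
    n_bins=0,                                      # leave the outcome column unbinned
    spw_config='spw_config.pkl',                   # hypothetical path
    model_state_dict_pkl='spwnet_state_dict.pkl',  # hypothetical path
    batch_size=32,
    by_subtype=True)
importances.to_csv('spw_importances.csv')
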
def fit_group_lasso(
        train_methyl_array='train_val_test_sets/train_methyl_array.pkl',
        val_methyl_array='train_val_test_sets/val_methyl_array.pkl',
        test_methyl_array='train_val_test_sets/test_methyl_array.pkl',
        l1_vals=np.hstack((np.arange(0.01, 1.1,
                                     0.1), np.array([10., 20., 50.,
                                                     100.]))).tolist(),
        outcome_col='disease_only',
        min_capsule_len=5,
        capsule_choice=['gene'],
        n_jobs=0,
        n_epochs=10,
        output_results_pickle='group_lasso_model.pkl',
        output_file='group_lasso_importances.csv',
        batch_size=1280,
        lr=0.0001,
        predict=False):

    # if torch.cuda.is_available():
    # 	torch.set_default_tensor_type('torch.cuda.FloatTensor')

    datasets = dict(train=train_methyl_array,
                    val=val_methyl_array,
                    test=test_methyl_array)

    # LogisticRegression = lambda ne, lr: net = NeuralNetClassifier(LogisticRegressionModel,max_epochs=ne,lr=lr,iterator_train__shuffle=True, callbacks=[EpochScoring(LASSO)])

    X = dict()
    Y = dict()
    le = LabelEncoder()
    for k in ['train', 'val', 'test']:
        datasets[k] = MethylationArray.from_pickle(datasets[k])

    capsules, cpgs, names, cpg_arr = return_final_capsules(
        datasets['train'],
        capsule_choice,
        min_capsule_len,
        None,
        None,
        0,
        '',
        '',
        return_original_capsule_assignments=True)

    cpgs = np.unique(cpgs)

    cpg2idx = dict(zip(cpgs, np.arange(len(cpgs))))

    cpg_arr.loc[:, 'cpg'] = cpg_arr.loc[:, 'cpg'].map(cpg2idx)

    capsule_sizes = [len(capsule) for capsule in capsules]

    for k in ['train', 'val', 'test']:
        X[k] = datasets[k].beta.loc[:,
                                    cpgs]  #cudf.from_pandas(datasets[k].beta)#
        X[k].loc[:, :] = beta2M(X[k].loc[:, :].values)
        Y[k] = le.fit_transform(
            datasets[k].pheno[outcome_col]) if k == 'train' else le.transform(
                datasets[k].pheno[outcome_col]
            )  #cudf.Series(, dtype = np.float32 )
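    # beta2M is assumed to apply the usual logit transform of methylation beta
    # values, M = log2(beta / (1 - beta)), before model fitting.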

    n_classes = len(np.unique(Y['train']))
    n_cpgs = len(cpgs)

    class_weights = torch.tensor(
        compute_class_weight('balanced',
                             classes=np.unique(Y['train']),
                             y=Y['train'])).float()

    if torch.cuda.is_available():
        class_weights = class_weights.cuda()

    dataloaders = {}

    for k in ['train', 'val', 'test']:
        X[k] = torch.tensor(X[k].values).float()
        Y[k] = torch.tensor(Y[k]).long()
        dataloaders[k] = DataLoader(TensorDataset(X[k], Y[k]),
                                    batch_size=min(batch_size, X[k].shape[0]),
                                    shuffle=(k == 'train'),
                                    num_workers=n_jobs)

    def get_res(logreg_model,
                dataloader=dataloaders['test'],
                return_pred=False):
        y_res = {'pred': [], 'true': []}
        for i, (x, y) in enumerate(dataloader):
            if torch.cuda.is_available():
                x = x.cuda()
                y = y.cuda()
            y_res['true'].extend(y.detach().cpu().numpy().flatten().tolist())
            y_res['pred'].extend(
                logreg_model(x).argmax(
                    1).detach().cpu().numpy().flatten().tolist())
        y_true = le.inverse_transform(np.array(y_res['true']))
        y_pred = le.inverse_transform(np.array(y_res['pred']))
        print(classification_report(y_true, y_pred))
        if return_pred:
            return (y_true, y_pred)
        return f1_score(y_true, y_pred, average='macro')

    # @torchsnooper.snoop()
    def train_model(l1, return_model=False):
        logreg_model = GroupLasso(
            n_cpgs, n_classes, names, cpg_arr, capsule_sizes,
            l1)  #nn.Module(nn.Linear())#nn.Sequential(,nn.LogSoftmax())
        if torch.cuda.is_available():
            logreg_model = logreg_model.cuda()
            logreg_model.weights = logreg_model.weights.cuda()
            # logreg_model.groups=logreg_model.groups.cuda()
            # logreg_model.sqrt_group_sizes=logreg_model.sqrt_group_sizes.cuda()
        optimizer = optim.Adam(logreg_model.parameters(), lr=lr)
        # scheduler=
        criterion = nn.CrossEntropyLoss(weight=class_weights)

        for epoch in range(n_epochs):
            running_loss = {'train': [], 'val': []}
            for phase in ['train', 'val']:
                logreg_model.train(phase == 'train')
                for i, (x, y) in enumerate(dataloaders[phase]):
                    if torch.cuda.is_available():
                        x = x.cuda()
                        y = y.cuda()
                    optimizer.zero_grad()
                    #print(y.shape,logreg_model(x).shape)
                    y_pred = logreg_model(x)
                    loss = criterion(y_pred, y)
                    loss = loss + logreg_model.penalize()
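                    # penalize() is assumed to add the group-lasso term,
                    # roughly l1 * sum_g sqrt(|g|) * ||w_g||_2 over capsule groups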

                    #group_lasso()#group_lasso(logreg_model[0].weight)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    # 	optimizer.zero_grad()
                    # penalty=logreg_model.penalize()
                    # if phase=='train':
                    # 	penalty.backward(retain_graph=True)
                    # 	optimizer.step()
                    # loss=loss+penalty
                    item_loss = loss.item()
                    print("Epoch {}[Batch {}] - {} Loss {} ".format(
                        epoch, i, phase, item_loss),
                          flush=True)
                    running_loss[phase].append(item_loss)
                running_loss[phase] = np.mean(running_loss[phase])
            print("Epoch {} - Train Loss {} , Val Loss {}".format(
                epoch, running_loss['train'], running_loss['val']),
                  flush=True)
            if not epoch or running_loss['val'] <= min_val_loss:
                # snapshot the weights whenever validation loss improves
                min_val_loss = running_loss['val']
                best_model_weights = copy.deepcopy(logreg_model.state_dict())
        logreg_model.load_state_dict(best_model_weights)

        if not return_model:
            return l1, get_res(logreg_model, dataloader=dataloaders['val'])
        else:
            return get_res(logreg_model,
                           dataloader=dataloaders['val'],
                           return_pred=False), logreg_model

    # pool=ProcessPool(nodes=8)
    # l1_f1=np.array(pool.map(train_model, l1_vals))
    if len(l1_vals) > 1:
        l1_f1 = np.array(
            dask.compute(*[dask.delayed(train_model)(l1) for l1 in l1_vals],
                         scheduler='threading')
        )  #np.array([train_model(l1) for l1 in l1_vals])

        l1 = l1_f1[np.argmax(l1_f1[:, 1]), 0]
    else:
        l1_f1 = None
        l1 = l1_vals[0]

    # group_lasso=GroupLasso(len(cpgs),len(np.unique(Y['train'])),names,cpg_arr,capsule_sizes,l1)

    f1, logreg_model = train_model(l1, return_model=True)
    logreg_model.train(False)

    y_true, y_pred = get_res(logreg_model, return_pred=True)

    torch.save(
        dict(model=logreg_model, l1=l1_f1, y_true=y_true, y_pred=y_pred,
             f1=f1), output_results_pickle)

    weights = logreg_model.return_weights()
    pd.DataFrame(dict(zip(names, weights)),
                 index=['importances']).T.to_csv(output_file)
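A hypothetical call to fit_group_lasso; the signature defaults already point at the train/val/test pickles, so only a few arguments are overridden, and the values shown are illustrative rather than taken from the original project:

fit_group_lasso(outcome_col='disease_only',
                l1_vals=[0.1, 1., 10.],
                n_epochs=10,
                batch_size=256,
                lr=1e-4,
                output_results_pickle='group_lasso_model.pkl',
                output_file='group_lasso_importances.csv')
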
def fit_logreg(train_methyl_array='train_val_test_sets/train_methyl_array.pkl',
               val_methyl_array='train_val_test_sets/val_methyl_array.pkl',
               test_methyl_array='train_val_test_sets/test_methyl_array.pkl',
               l1_vals=np.hstack(
                   (np.arange(0.01, 1.1, 0.1), np.array([10., 20., 50.,
                                                         100.]))),
               outcome_col='disease_only',
               min_capsule_len=5,
               capsule_choice=['gene'],
               n_jobs=20):

    datasets = dict(train=train_methyl_array,
                    val=val_methyl_array,
                    test=test_methyl_array)

    # LogisticRegression = lambda ne, lr: net = NeuralNetClassifier(LogisticRegressionModel,max_epochs=ne,lr=lr,iterator_train__shuffle=True, callbacks=[EpochScoring(LASSO)])

    X = dict()
    Y = dict()
    le = LabelEncoder()
    for k in ['train', 'val', 'test']:
        datasets[k] = MethylationArray.from_pickle(datasets[k])
        X[k] = datasets[k].beta  #cudf.from_pandas(datasets[k].beta)#
        X[k].loc[:, :] = beta2M(X[k].loc[:, :].values)
        Y[k] = le.fit_transform(
            datasets[k].pheno[outcome_col]) if k == 'train' else le.transform(
                datasets[k].pheno[outcome_col]
            )  #cudf.Series(, dtype = np.float32 )

    capsules, _, names = return_final_capsules(datasets['train'],
                                               capsule_choice, min_capsule_len,
                                               None, None, 0, '', '')
    # make_pipeline(CapsuleSelection(capsule,name), LogisticRegression(penalty='l1', C=1./l1,class_weight='balanced'))
    # capsules=capsules[:2]#[capsule for capsule in capsules]
    # names=names[:2]#[name for name in names]
    def build_stacking_model(l1):
        return ParallelStackingClassifier(
            n_jobs=n_jobs,
            meta_classifier=LogisticRegression(penalty='l1',
                                               n_jobs=n_jobs,
                                               C=1. / l1,
                                               class_weight='balanced',
                                               solver='saga'),
            use_clones=False,
            classifiers=[
                make_pipeline(
                    CapsuleSelection(capsule, name),
                    LogisticRegression(penalty='l1',
                                       C=1. / l1,
                                       class_weight='balanced',
                                       solver='saga'))
                for capsule, name in zip(capsules, names) if len(capsule)
            ])

    def get_score(l1, capsules):
        print('Fitting l1: {}'.format(l1))
        model = build_stacking_model(l1).fit(X['train'],
                                             Y['train'],
                                             capsules=capsules)
        score = f1_score(Y['val'], model.predict(X['val']), average='macro')
        return l1, score

    scores = [get_score(l1, capsules) for l1 in l1_vals]
    # pool=ProcessPool(nodes=8)
    # scores=pool.map(lambda l1: get_score(l1,capsules), l1_vals)
    # for l1 in l1_vals:
    # 	scores.append(get_score(l1,capsules))#scores.append(dask.delayed(get_score)(l1))
    # scores.append((l1,f1_score(reg.predict(X['val']).to_pandas().values.flatten().astype(int),Y['val'].to_pandas().values.flatten().astype(int),average='macro')))
    scores = np.array(scores)  #dask.compute(*scores,scheduler='processes')
    np.save('l1_scores.npy', scores)
    l1 = scores[np.argmax(scores[:, 1]), 0]  # choose the l1 with the highest macro F1
    reg = build_stacking_model(
        l1
    )  #LogisticRegression(penalty='l1', C=1./l1)#LogisticRegression(ne,lr)#
    reg.fit(X['train'], Y['train'], capsules=capsules)
    print(
        classification_report(le.inverse_transform(Y['test']),
                              le.inverse_transform(reg.predict(X['test']))))
    # print(classification_report(le.inverse_transform(Y['test'].to_pandas().values.flatten().astype(int)),le.inverse_transform(reg.predict(X['test']).to_pandas().values.flatten().astype(int))))
    pickle.dump(dict(model=reg, features=names), open('stacked_model.pkl',
                                                      'wb'))
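A similarly hypothetical call to fit_logreg; the function fits a stacked per-capsule logistic-regression model over the L1 grid, prints a test-set classification report, and pickles the fitted model to stacked_model.pkl:

fit_logreg(outcome_col='disease_only',
           min_capsule_len=5,
           capsule_choice=['gene'],
           n_jobs=8)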