Example #1
        # randomly permute genes of each sample in the rnaseq matrix
        shuf_df = rnaseq_train_df.apply(
            lambda x: np.random.permutation(x.tolist()), axis=1)

        # Setup new pandas dataframe
        shuf_df = pd.DataFrame(shuf_df, columns=['gene_list'])
        shuf_df = pd.DataFrame(shuf_df.gene_list.values.tolist(),
                               columns=rnaseq_train_df.columns,
                               index=rnaseq_train_df.index)

        # Initialize a new DataModel, with different shuffling for each permutation
        dm = DataModel(df=shuf_df, test_df=rnaseq_test_df)
        dm.transform(how='zeroone')

    # Fit models
    dm.pca(n_components=num_components, transform_test_df=True)
    dm.ica(n_components=num_components, transform_test_df=True)
    dm.nmf(n_components=num_components, transform_test_df=True)

    dm.nn(n_components=num_components,
          model='tybalt',
          loss='binary_crossentropy',
          epochs=int(vae_epochs),
          batch_size=int(vae_batch_size),
          learning_rate=float(vae_lr),
          separate_loss=True,
          verbose=True,
          transform_test_df=True)

    dm.nn(n_components=num_components,
          model='adage',
          loss='binary_crossentropy',
          # remaining arguments assumed to mirror the full adage call
          # shown in Example #3
          epochs=int(dae_epochs),
          batch_size=int(dae_batch_size),
          learning_rate=float(dae_lr),
          noise=float(dae_noise),
          sparsity=float(dae_sparsity),
          verbose=True,
          transform_test_df=True)
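The shuffle step above permutes each sample's values across genes, destroying gene-gene structure while preserving each sample's value distribution. A minimal self-contained sketch of the same idea on toy data (all names are hypothetical):

import numpy as np
import pandas as pd

# Toy expression matrix: 2 samples (rows) x 4 genes (columns)
df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]],
                  index=['sample_a', 'sample_b'],
                  columns=['g1', 'g2', 'g3', 'g4'])

# Permute each row independently, then restore the original labels
rng = np.random.default_rng(1234)
shuf_df = pd.DataFrame(np.apply_along_axis(rng.permutation, 1, df.to_numpy()),
                       index=df.index, columns=df.columns)
print(shuf_df)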
Example #2

# Keep the most variably expressed genes, ranked by median absolute deviation
mad_genes = (
    med_dev.sort_values(by=0, ascending=False)
    .iloc[0:num_genes_kept]
    .index
    .tolist()
)

rnaseq_df = rnaseq_df.loc[:, mad_genes]

# Initialize DataModel class with PanCanAtlas RNAseq
dm = DataModel(df=rnaseq_df)

# Transform the input matrix into a range between zero and one
dm.transform(how='zeroone')

# Fit models
dm.pca(n_components=num_components)
dm.ica(n_components=num_components)
dm.nmf(n_components=num_components)

dm.nn(n_components=num_components,
      model='tybalt',
      loss='binary_crossentropy',
      epochs=int(vae_epochs),
      batch_size=int(vae_batch_size),
      learning_rate=float(vae_lr),
      separate_loss=False,
      verbose=False)

dm.nn(n_components=num_components,
      model='adage',
      loss='binary_crossentropy',
      # remaining arguments assumed to mirror the adage call in Example #3
      epochs=int(dae_epochs),
      batch_size=int(dae_batch_size),
      learning_rate=float(dae_lr),
      noise=float(dae_noise),
      sparsity=float(dae_sparsity),
      verbose=False)
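Both examples call dm.transform(how='zeroone') before fitting. Conceptually this is a per-gene min-max scale into [0, 1]; a hedged sketch of the idea (DataModel's internal implementation may differ in detail):

import pandas as pd

def zeroone(df: pd.DataFrame) -> pd.DataFrame:
    # Scale each column (gene) to the [0, 1] range
    return (df - df.min()) / (df.max() - df.min())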
Example #3

# Imports assumed by this snippet; read_counts_or_params,
# get_recon_correlation, and compile_corr_df are project helpers defined
# elsewhere, and the DataModel import path is assumed from the tybalt package
import os
import sys

import numpy as np
import pandas as pd

from tybalt.data_models import DataModel

def train_models(basename,
                 input_train,
                 input_test,
                 zdim,
                 paramsD,
                 out_dir,
                 num_seeds,
                 shuffle,
                 madfile,
                 num_mad_genes,
                 algorithms=['pca', 'ica', 'nmf', 'dae', 'vae']):

    # Set output directory and file names
    train_dir = os.path.join(out_dir, 'ensemble_z_results',
                             f'{zdim}_components')
    if shuffle:
        train_dir = f'{train_dir}_shuffled'

    if not os.path.exists(train_dir):
        os.makedirs(train_dir)

    if shuffle:
        file_pre = f'{basename}_{zdim}_components_shuffled_'
    else:
        file_pre = f'{basename}_{zdim}_components_'

    recon_file = os.path.join(train_dir, f'{file_pre}reconstruction.tsv')
    co_file = os.path.join(train_dir, f'{file_pre}sample_corr.tsv.gz')
    co_g_file = os.path.join(train_dir, f'{file_pre}gene_corr.tsv.gz')
    tybalt_hist_file = os.path.join(train_dir,
                                    f'{file_pre}tybalt_training_hist.tsv')
    adage_hist_file = os.path.join(train_dir,
                                   f'{file_pre}adage_training_hist.tsv')

    # Load preprocessed data. We could accept raw data and process it here,
    # but each zdim runs independently, so that would repeat the work.
    rnaseq_train_df = read_counts_or_params(input_train)
    rnaseq_test_df = read_counts_or_params(input_test)

    # Determine most variably expressed genes and subset
    if madfile is not None:
        mad_genes_df = read_counts_or_params(madfile)

        mad_genes = mad_genes_df.iloc[0:num_mad_genes].index.astype(str)
        rnaseq_train_df = rnaseq_train_df.reindex(mad_genes, axis='columns')
        rnaseq_test_df = rnaseq_test_df.reindex(mad_genes, axis='columns')
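        # Aside (not in the original): the ranking stored in madfile can be
        # reproduced from a raw expression frame with genes as columns, e.g.
        #   mad = (expr_df - expr_df.median()).abs().median()
        #   mad_genes_df = mad.sort_values(ascending=False).to_frame()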

    # Initialize DataModel class; normalization happens inside transform(),
    # so there is no need to feed in pre-normalized data
    dm = DataModel(df=rnaseq_train_df, test_df=rnaseq_test_df)
    dm.transform(how='zeroone')

    # Set a master seed and draw one random seed per model
    np.random.seed(1234)
    random_seeds = np.random.randint(0, high=1000000, size=num_seeds)

    # Save population of models in specific folder
    comp_out_dir = os.path.join(out_dir, 'ensemble_z_matrices',
                                f'{basename}_components_{zdim}')
    if not os.path.exists(comp_out_dir):
        os.makedirs(comp_out_dir)

    reconstruction_results = []
    test_reconstruction_results = []
    sample_correlation_results = []
    tybalt_training_histories = []
    adage_training_histories = []
    for seed in random_seeds:

        np.random.seed(seed)

        seed_file = os.path.join(comp_out_dir, f'model_{seed}')

        if shuffle:
            seed_file = f'{seed_file}_shuffled'

            # randomly permute genes of each sample in the rnaseq matrix
            shuf_df = rnaseq_train_df.apply(
                lambda x: np.random.permutation(x.tolist()), axis=1)

            # Setup new pandas dataframe
            shuf_df = pd.DataFrame(shuf_df, columns=['gene_list'])
            shuf_df = pd.DataFrame(shuf_df.gene_list.values.tolist(),
                                   columns=rnaseq_train_df.columns,
                                   index=rnaseq_train_df.index)

            # Initialize a new DataModel, with different shuffling for each permutation
            dm = DataModel(df=shuf_df, test_df=rnaseq_test_df)
            dm.transform(how='zeroone')

        # Fit models
        if "pca" in algorithms:
            sys.stdout.write("\nrunning pca\n")
            dm.pca(n_components=zdim, transform_test_df=True)
        if "ics" in algorithms:
            sys.stdout.write("\nrunning ics\n")
            dm.ica(n_components=zdim, transform_test_df=True)
        if "nmf" in algorithms:
            sys.stdout.write("\nrunning nmf\n")
            dm.nmf(n_components=zdim, transform_test_df=True)

        if "vae" in algorithms:
            # run tybalt vae
            sys.stdout.write("\nrunning tybalt vae\n")
            dm.nn(n_components=zdim,
                  model='tybalt',
                  loss='binary_crossentropy',
                  epochs=int(paramsD['vae_epochs']),
                  batch_size=int(paramsD['vae_batch_size']),
                  learning_rate=float(paramsD['vae_lr']),
                  separate_loss=True,
                  verbose=True,
                  transform_test_df=True)

        if "dae" in algorithms:
            # run adage dae
            sys.stdout.write("\nrunning adage dae\n")
            dm.nn(n_components=zdim,
                  model='adage',
                  loss='binary_crossentropy',
                  epochs=int(paramsD['dae_epochs']),
                  batch_size=int(paramsD['dae_batch_size']),
                  learning_rate=float(paramsD['dae_lr']),
                  noise=float(paramsD['dae_noise']),
                  sparsity=float(paramsD['dae_sparsity']),
                  verbose=True,
                  transform_test_df=True)

        # Obtain z matrix (sample scores per latent space feature) for all models
        full_z_file = f'{seed_file}_z_matrix.tsv.gz'
        dm.combine_models().to_csv(full_z_file, sep='\t', compression='gzip')

        full_test_z_file = f'{seed_file}_z_test_matrix.tsv.gz'

        sys.stdout.write("combining trained models")
        dm.combine_models(test_set=True).to_csv(full_test_z_file,
                                                sep='\t',
                                                compression='gzip')

        # Obtain weight matrices (gene by latent space feature) for all models
        full_weight_file = f'{seed_file}_weight_matrix.tsv.gz'
        dm.combine_weight_matrix().to_csv(full_weight_file,
                                          sep='\t',
                                          compression='gzip')

        # Store reconstruction costs and reconstructed input at training end
        sys.stdout.write("\ncompiling reconstruction costs\n")
        full_reconstruction, reconstructed_matrices = (
            dm.compile_reconstruction())

        # Store reconstruction evaluation and data for test set
        full_test_recon, test_recon_mat = dm.compile_reconstruction(
            test_set=True)

        # Get correlations across samples and genes between input and output data
        pearson_corr = []
        spearman_corr = []
        pearson_corr_test = []
        spearman_corr_test = []

        sys.stdout.write(
            "\ncalculating correlations across samples and genes\n")

        for algorithm in algorithms:
            # Training sample correlations
            sys.stdout.write(
                f"\ntraining: calculating pearson correlation for {algorithm}\n")
            pearson_corr.append(
                get_recon_correlation(df=dm.df,
                                      recon_mat_dict=reconstructed_matrices,
                                      algorithm=algorithm,
                                      cor_type='pearson',
                                      genes=False))

            sys.stdout.write(
                f"\ntraining: calculating spearman correlation for {algorithm}\n")
            spearman_corr.append(
                get_recon_correlation(df=dm.df,
                                      recon_mat_dict=reconstructed_matrices,
                                      algorithm=algorithm,
                                      cor_type='spearman',
                                      genes=False))

            # Testing sample correlations
            sys.stdout.write(
                f"\ntesting: calculating pearson correlation for {algorithm}\n")
            pearson_corr_test.append(
                get_recon_correlation(df=dm.test_df,
                                      recon_mat_dict=test_recon_mat,
                                      algorithm=algorithm,
                                      cor_type='pearson',
                                      genes=False))

            sys.stdout.write(
                f"\ntesting: calculating spearman correlation for {algorithm}\n")
            spearman_corr_test.append(
                get_recon_correlation(df=dm.test_df,
                                      recon_mat_dict=test_recon_mat,
                                      algorithm=algorithm,
                                      cor_type='spearman',
                                      genes=False))

        # Training - Sample correlations between input and reconstruction
        sys.stdout.write(
            f"\ntraining: compiling sample correlations for {algorithms}\n")
        sample_correlation_results.append(
            compile_corr_df(pearson_list=pearson_corr,
                            spearman_list=spearman_corr,
                            algorithm_list=algorithms,
                            column_names=dm.df.index,
                            seed=seed,
                            data_type='training'))

        # Testing - Sample correlations between input and reconstruction
        sys.stdout.write(
            f"\ntesting: compiling sample correlations for {algorithms}\n")
        sample_correlation_results.append(
            compile_corr_df(pearson_list=pearson_corr_test,
                            spearman_list=spearman_corr_test,
                            algorithm_list=algorithms,
                            column_names=dm.test_df.index,
                            seed=seed,
                            data_type='testing'))

        # Store training histories and intermediate results for neural
        # networks; the fit attributes only exist if those models were run
        reconstruction_results.append(
            full_reconstruction.assign(seed=seed, shuffled=shuffle))
        test_reconstruction_results.append(
            full_test_recon.assign(seed=seed, shuffled=shuffle))
        if "vae" in algorithms:
            tybalt_training_histories.append(
                dm.tybalt_fit.history_df.assign(seed=seed, shuffle=shuffle))
        if "dae" in algorithms:
            adage_training_histories.append(
                dm.adage_fit.history_df.assign(seed=seed, shuffle=shuffle))


    # Save reconstruction and neural network training results
    pd.concat([
        pd.concat(reconstruction_results).assign(data_type='training'),
        pd.concat(test_reconstruction_results).assign(data_type='testing')
    ]).reset_index(drop=True).to_csv(recon_file, sep='\t', index=False)

    pd.concat(sample_correlation_results).to_csv(co_file,
                                                 sep='\t',
                                                 index=False,
                                                 float_format='%.3f',
                                                 compression='gzip')

    if tybalt_training_histories:
        (pd.concat(tybalt_training_histories)
         .reset_index()
         .rename(columns={'index': 'epoch'})
         .to_csv(tybalt_hist_file, sep='\t', index=False))

    if adage_training_histories:
        (pd.concat(adage_training_histories)
         .reset_index()
         .rename(columns={'index': 'epoch'})
         .to_csv(adage_hist_file, sep='\t', index=False))
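get_recon_correlation and compile_corr_df are project helpers that this excerpt does not define. A minimal sketch of what the former plausibly does, correlating each sample (row) of the input with its reconstruction; the dict keying and return shape here are assumptions:

import pandas as pd
from scipy.stats import pearsonr, spearmanr

def get_recon_correlation(df, recon_mat_dict, algorithm, cor_type,
                          genes=False):
    # Reconstructed matrix for this algorithm (assumed keyed by name)
    recon_df = recon_mat_dict[algorithm]
    if genes:
        # Correlate gene profiles instead of sample profiles
        df, recon_df = df.T, recon_df.T
    corr_fn = pearsonr if cor_type == 'pearson' else spearmanr
    # One correlation per row: input sample vs. its reconstruction
    return [corr_fn(df.iloc[i], recon_df.iloc[i])[0]
            for i in range(df.shape[0])]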
Example #4
        # Retrieve optimized parameters for neural network models
        vae_epochs = param_df.loc['vae_epochs', str(k)]
        dae_epochs = param_df.loc['dae_epochs', str(k)]
        vae_lr = param_df.loc['vae_lr', str(k)]
        dae_lr = param_df.loc['dae_lr', str(k)]
        vae_batch_size = param_df.loc['vae_batch_size', str(k)]
        dae_batch_size = param_df.loc['dae_batch_size', str(k)]
        dae_noise = param_df.loc['dae_noise', str(k)]
        dae_sparsity = param_df.loc['dae_sparsity', str(k)]
        vae_kappa = param_df.loc['vae_kappa', str(k)]
        
        # Fit models
        # 1) PCA
        start = time.time()
        dm.pca(n_components=k, transform_test_df=False)
        end = time.time()
        total_time = end - start
        
        result = [dataset, k, "PCA", total_time]
        time_results.append(result)
        
        # 2) ICA
        start = time.time()
        dm.ica(n_components=k, transform_test_df=False)
        end = time.time()
        total_time = end - start
        
        result = [dataset, k, "ICA", total_time]
        time_results.append(result)
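The start/stop timing around each fit repeats verbatim per algorithm; a small context-manager sketch (helper name hypothetical) would factor it out:

import time
from contextlib import contextmanager

@contextmanager
def timed(results, dataset, k, label):
    # Append [dataset, k, label, elapsed_seconds] to results on exit
    start = time.time()
    yield
    results.append([dataset, k, label, time.time() - start])

# Usage:
# with timed(time_results, dataset, k, "PCA"):
#     dm.pca(n_components=k, transform_test_df=False)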
        
Example #5
data_model = DataModel(df=sim_sub_df,
                       select_columns=select_columns,
                       gene_modules=gene_modules)
data_model.transform(how=how_transform)

# Real target output: the samples belonging to group "B"
real_group_data = np.array(
    pd.concat([data_model.df, data_model.other_df],
              axis=1).query('groups == "B"').drop('groups', axis=1))

# Get random seeds
random_seeds = np.random.randint(0, high=1000000, size=num_seeds)

all_results = []
for seed in random_seeds:
    # Reseed per iteration so each model population differs (mirrors the
    # np.random.seed(seed) pattern in Example #3; otherwise seed is unused)
    np.random.seed(seed)

    data_model.pca(n_components=n_components)
    data_model.ica(n_components=n_components)
    data_model.nmf(n_components=n_components)
    data_model.nn(n_components=n_components,
                  model='tybalt',
                  loss=loss,
                  epochs=vae_epochs,
                  batch_size=batch_size,
                  learning_rate=vae_learning_rate,
                  verbose=verbose)
    data_model.nn(n_components=n_components,
                  model='adage',
                  loss=loss,
                  epochs=adage_epochs,
                  batch_size=batch_size,
                  noise=adage_noise,
                  # closing arguments assumed from the adage calls in
                  # Example #3; these variable names are hypothetical
                  sparsity=adage_sparsity,
                  learning_rate=adage_learning_rate,
                  verbose=verbose)
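The real_group_data extraction above joins the feature matrix with its metadata, filters rows with query(), and drops the label column. A toy, self-contained sketch of that pattern (data and names hypothetical):

import numpy as np
import pandas as pd

features = pd.DataFrame({'f1': [0.1, 0.2, 0.3], 'f2': [1.0, 0.5, 0.2]})
metadata = pd.DataFrame({'groups': ['A', 'B', 'B']})

# Join features with metadata, keep only group "B", then drop the label
group_b = np.array(
    pd.concat([features, metadata], axis=1)
      .query('groups == "B"')
      .drop('groups', axis=1))
print(group_b)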