Example #1

import os

import numpy as np
import pandas as pd
from statsmodels.robust.scale import mad  # assumed source of mad(); any per-column median-absolute-deviation works
from tybalt.data_models import DataModel
rnaseq_train_df = pd.read_table(rnaseq_train, index_col=0)
rnaseq_test_df = pd.read_table(rnaseq_test, index_col=0)

# Determine most variably expressed genes and subset
if subset_mad_genes is not None:
    data_base = os.path.join('..', '0.expression-download', 'data')
    mad_file = os.path.join(data_base, '{}_mad_genes.tsv'.format(dataset))

    mad_genes_df = pd.read_table(mad_file)
    mad_genes = mad_genes_df.iloc[0:subset_mad_genes, ].gene_id.astype(str)

    rnaseq_train_df = rnaseq_train_df.reindex(mad_genes, axis='columns')
    rnaseq_test_df = rnaseq_test_df.reindex(mad_genes, axis='columns')

# Initialize DataModel class with pancancer data
dm = DataModel(df=rnaseq_train_df, test_df=rnaseq_test_df)
dm.transform(how='zeroone')

# Set seed and list of algorithms for compression
np.random.seed(1234)
random_seeds = np.random.randint(0, high=1000000, size=num_seeds)

algorithms = ['pca', 'ica', 'nmf', 'dae', 'vae']

# Save population of models in specific folder
comp_out_dir = os.path.join(out_dir, 'ensemble_z_matrices',
                            '{}_components_{}'.format(dataset, num_components))
if not os.path.exists(comp_out_dir):
    os.makedirs(comp_out_dir)

reconstruction_results = []
expr_file = os.path.join('..', 'data', 'pancan_rnaseq_freeze.tsv.gz')
rnaseq_df = pd.read_table(expr_file, index_col=0)

# Subset x matrix to MAD genes
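# (MAD = median absolute deviation, a robust spread measure; high-MAD genes
# are the ones that vary most across samples)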
med_dev = pd.DataFrame(mad(rnaseq_df), index=rnaseq_df.columns)
mad_genes = (
    med_dev.sort_values(by=0, ascending=False)
    .iloc[0:num_genes_kept]
    .index
    .tolist()
)

rnaseq_df = rnaseq_df.loc[:, mad_genes]

# Initialize DataModel class with PanCanAtlas RNAseq
dm = DataModel(df=rnaseq_df)

# Transform the input matrix into a range between zero and one
dm.transform(how='zeroone')

# Fit models
dm.pca(n_components=num_components)
dm.ica(n_components=num_components)
dm.nmf(n_components=num_components)

dm.nn(n_components=num_components,
      model='tybalt',
      loss='binary_crossentropy',
      epochs=int(vae_epochs),
      batch_size=int(vae_batch_size),
      learning_rate=float(vae_lr),
      verbose=True)  # closing arguments assumed; the source snippet cuts off mid-call
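
transform(how='zeroone') scales every gene into [0, 1], which the binary cross-entropy loss used by the autoencoders expects. A minimal sketch of the same normalization with scikit-learn, assuming the 'zeroone' transform is a per-column min-max scaling:

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Toy expression matrix: rows are samples, columns are genes
expr = pd.DataFrame(np.random.rand(5, 3) * 100,
                    columns=['gene_a', 'gene_b', 'gene_c'])

# Scale each gene to [0, 1], as 'zeroone' is assumed to do internally
expr_zeroone = pd.DataFrame(MinMaxScaler().fit_transform(expr),
                            columns=expr.columns, index=expr.index)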
Example #2

import random

from tybalt.data_models import DataModel

# data_df is assumed to be a pandas DataFrame (samples x features) loaded earlier

# In[6]:

# Split into training and testing sets
# (For compatibility with tybalt.DataModel)
split_prop = 0.05
test_samples = random.sample(range(0, data_df.shape[0]),
                             int(data_df.shape[0] * split_prop))

test_df = data_df.iloc[test_samples, :]
train_df = data_df.drop(test_df.index, axis="index")

# In[7]:

# Initialize DataModel class with the input data
dm = DataModel(df=train_df, test_df=test_df)
dm.transform(how='zeroone')

# In[8]:

# Parameters selected to be similar to real data parameter sweep
epochs = 25
batch_size = 50
vae_learning_rate = 0.0015
dae_learning_rate = 0.0005
dae_noise = 0.01
dae_sparsity = 0

# In[9]:

# Loop over the latent dimensionalities
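
The snippet ends before the loop body. By analogy with the other examples, the sweep plausibly fits each model family once per latent dimensionality; the latent_dims values below are illustrative assumptions:

latent_dims = [2, 4, 8, 16, 32]  # hypothetical sweep values

for num_components in latent_dims:
    # Linear baselines at this dimensionality
    dm.pca(n_components=num_components)
    dm.ica(n_components=num_components)
    dm.nmf(n_components=num_components)

    # Variational autoencoder (tybalt) with the parameters defined above
    dm.nn(n_components=num_components,
          model='tybalt',
          loss='binary_crossentropy',
          epochs=epochs,
          batch_size=batch_size,
          learning_rate=vae_learning_rate)

    # Denoising autoencoder (adage) with the parameters defined above
    dm.nn(n_components=num_components,
          model='adage',
          loss='binary_crossentropy',
          epochs=epochs,
          batch_size=batch_size,
          learning_rate=dae_learning_rate,
          noise=dae_noise,
          sparsity=dae_sparsity)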
Example #4

import os
import sys

import numpy as np
import pandas as pd
from tybalt.data_models import DataModel

# read_counts_or_params, get_recon_correlation and compile_corr_df are
# project-specific helpers assumed to be defined elsewhere in the module
def train_models(basename,
                 input_train,
                 input_test,
                 zdim,
                 paramsD,
                 out_dir,
                 num_seeds,
                 shuffle,
                 madfile,
                 num_mad_genes,
                 algorithms=['pca', 'ica', 'nmf', 'dae', 'vae']):

    # Set output directory and file names
    train_dir = os.path.join(out_dir, 'ensemble_z_results',
                             f'{zdim}_components')
    if shuffle:
        train_dir = f'{train_dir}_shuffled'

    if not os.path.exists(train_dir):
        os.makedirs(train_dir)

    if shuffle:
        file_pre = f'{basename}_{zdim}_components_shuffled_'
    else:
        file_pre = f'{basename}_{zdim}_components_'

    recon_file = os.path.join(train_dir, f'{file_pre}reconstruction.tsv')
    co_file = os.path.join(train_dir, f'{file_pre}sample_corr.tsv.gz')
    co_g_file = os.path.join(train_dir, f'{file_pre}gene_corr.tsv.gz')
    tybalt_hist_file = os.path.join(train_dir,
                                    f'{file_pre}tybalt_training_hist.tsv')
    adage_hist_file = os.path.join(train_dir,
                                   f'{file_pre}adage_training_hist.tsv')

    # Load preprocessed data. We could accept raw data and process it here, but
    # we don't want to process it multiple times, especially since this runs
    # independently on each zdim.
    rnaseq_train_df = read_counts_or_params(input_train)
    rnaseq_test_df = read_counts_or_params(input_test)

    # Determine most variably expressed genes and subset
    if madfile is not None:
        mad_genes_df = read_counts_or_params(madfile)

        mad_genes = mad_genes_df.iloc[0:num_mad_genes, ].index.astype(str)
        rnaseq_train_df = rnaseq_train_df.reindex(mad_genes, axis='columns')
        rnaseq_test_df = rnaseq_test_df.reindex(mad_genes, axis='columns')

    # Initialize DataModel class; normalization happens inside transform(), so
    # there is no need to feed in pre-normalized data
    dm = DataModel(df=rnaseq_train_df, test_df=rnaseq_test_df)
    dm.transform(how='zeroone')

    # Set seed and list of algorithms for compression
    np.random.seed(1234)
    random_seeds = np.random.randint(0, high=1000000, size=num_seeds)


    # Save population of models in specific folder
    comp_out_dir = os.path.join(out_dir, 'ensemble_z_matrices',
                                f'{basename}_components_{zdim}')
    if not os.path.exists(comp_out_dir):
        os.makedirs(comp_out_dir)

    reconstruction_results = []
    test_reconstruction_results = []
    sample_correlation_results = []
    tybalt_training_histories = []
    adage_training_histories = []
    for seed in random_seeds:

        np.random.seed(seed)

        seed_file = os.path.join(comp_out_dir, f'model_{seed}')

        if shuffle:
            seed_file = f'{seed_file}_shuffled'

            # randomly permute genes of each sample in the rnaseq matrix
            shuf_df = rnaseq_train_df.apply(
                lambda x: np.random.permutation(x.tolist()), axis=1)

            # Setup new pandas dataframe
            shuf_df = pd.DataFrame(shuf_df, columns=['gene_list'])
            shuf_df = pd.DataFrame(shuf_df.gene_list.values.tolist(),
                                   columns=rnaseq_train_df.columns,
                                   index=rnaseq_train_df.index)

            # Initialize a new DataModel, with different shuffling each permutation
            dm = DataModel(df=shuf_df, test_df=rnaseq_test_df)
            dm.transform(how='zeroone')

        # Fit models
        if "pca" in algorithms:
            sys.stdout.write("\nrunning pca\n")
            dm.pca(n_components=zdim, transform_test_df=True)
        if "ics" in algorithms:
            sys.stdout.write("\nrunning ics\n")
            dm.ica(n_components=zdim, transform_test_df=True)
        if "nmf" in algorithms:
            sys.stdout.write("\nrunning nmf\n")
            dm.nmf(n_components=zdim, transform_test_df=True)

        if "vae" in algorithms:
            # run tybalt vae
            sys.stdout.write("\nrunning tybalt vae\n")
            dm.nn(n_components=zdim,
                  model='tybalt',
                  loss='binary_crossentropy',
                  epochs=int(paramsD['vae_epochs']),
                  batch_size=int(paramsD['vae_batch_size']),
                  learning_rate=float(paramsD['vae_lr']),
                  separate_loss=True,
                  verbose=True,
                  transform_test_df=True)

        if "dae" in algorithms:
            # run adage dae
            sys.stdout.write("\nrunning adage dae\n")
            dm.nn(n_components=zdim,
                  model='adage',
                  loss='binary_crossentropy',
                  epochs=int(paramsD['dae_epochs']),
                  batch_size=int(paramsD['dae_batch_size']),
                  learning_rate=float(paramsD['dae_lr']),
                  noise=float(paramsD['dae_noise']),
                  sparsity=float(paramsD['dae_sparsity']),
                  verbose=True,
                  transform_test_df=True)

        # Obtain z matrix (sample scores per latent space feature) for all models
        full_z_file = f'{seed_file}_z_matrix.tsv.gz'
        dm.combine_models().to_csv(full_z_file, sep='\t', compression='gzip')

        full_test_z_file = f'{seed_file}_z_test_matrix.tsv.gz'

        sys.stdout.write("combining trained models")
        dm.combine_models(test_set=True).to_csv(full_test_z_file,
                                                sep='\t',
                                                compression='gzip')

        # Obtain weight matrices (gene by latent space feature) for all models
        full_weight_file = f'{seed_file}_weight_matrix.tsv.gz'
        dm.combine_weight_matrix().to_csv(full_weight_file,
                                          sep='\t',
                                          compression='gzip')

        # Store reconstruction costs and reconstructed input at training end

        sys.stdout.write("compiling reconstruction costs")
        full_reconstruction, reconstructed_matrices = dm.compile_reconstruction(
        )

        # Store reconstruction evaluation and data for test set
        full_test_recon, test_recon_mat = dm.compile_reconstruction(
            test_set=True)

        # Get correlations across samples and genes between input and output data
        pearson_corr = []
        spearman_corr = []
        pearson_corr_test = []
        spearman_corr_test = []

        sys.stdout.write(
            "\ncalculating correlations across samples and genes\n")

        for algorithm in algorithms:
            # Training Sample Correlations
            sys.stdout.write(
                f"training: calculating pearson correlation for {algorithm}"
            )  # f string requires py >=3.6
            pearson_corr.append(
                get_recon_correlation(df=dm.df,
                                      recon_mat_dict=reconstructed_matrices,
                                      algorithm=algorithm,
                                      cor_type='pearson',
                                      genes=False))

            sys.stdout.write(
                f"training: calculating spearman correlation for {algorithm}"
            )  # f string requires py >=3.6
            spearman_corr.append(
                get_recon_correlation(df=dm.df,
                                      recon_mat_dict=reconstructed_matrices,
                                      algorithm=algorithm,
                                      cor_type='spearman',
                                      genes=False))

            # Testing Sample Correlations
            sys.stdout.write(
                f"testing: calculating pearson correlation for {algorithm}"
            )  # f string requires py >=3.6
            pearson_corr_test.append(
                get_recon_correlation(df=dm.test_df,
                                      recon_mat_dict=test_recon_mat,
                                      algorithm=algorithm,
                                      cor_type='pearson',
                                      genes=False))
            sys.stdout.write(
                f"testing: calculating spearman correlation for {algorithm}"
            )  # f string requires py >=3.6
            spearman_corr_test.append(
                get_recon_correlation(df=dm.test_df,
                                      recon_mat_dict=test_recon_mat,
                                      algorithm=algorithm,
                                      cor_type='spearman',
                                      genes=False))

        # Training - Sample correlations between input and reconstruction
        sys.stdout.write(
            f"training: calculating sample correlation for {algorithms}"
        )  # f string requires py >=3.6
        sample_correlation_results.append(
            compile_corr_df(pearson_list=pearson_corr,
                            spearman_list=spearman_corr,
                            algorithm_list=algorithms,
                            column_names=dm.df.index,
                            seed=seed,
                            data_type='training'))

        # Testing - Sample correlations between input and reconstruction
        sys.stdout.write(
            f"testing: calculating sample correlation for {algorithms}")
        sample_correlation_results.append(
            compile_corr_df(pearson_list=pearson_corr_test,
                            spearman_list=spearman_corr_test,
                            algorithm_list=algorithms,
                            column_names=dm.test_df.index,
                            seed=seed,
                            data_type='testing'))

        # Store training histories and intermediate results for neural networks
        reconstruction_results.append(
            full_reconstruction.assign(seed=seed, shuffled=shuffle))
        test_reconstruction_results.append(
            full_test_recon.assign(seed=seed, shuffled=shuffle))
        tybalt_training_histories.append(
            dm.tybalt_fit.history_df.assign(seed=seed, shuffle=shuffle))
        adage_training_histories.append(
            dm.adage_fit.history_df.assign(seed=seed, shuffle=shuffle))


    # Save reconstruction and neural network training results
    pd.concat([
        pd.concat(reconstruction_results).assign(data_type='training'),
        pd.concat(test_reconstruction_results).assign(data_type='testing')
    ]).reset_index(drop=True).to_csv(recon_file, sep='\t', index=False)
    pd.concat(sample_correlation_results).to_csv(co_file,
                                                 sep='\t',
                                                 index=False,
                                                 float_format='%.3f',
                                                 compression='gzip')
    (pd.concat(tybalt_training_histories)
     .reset_index()
     .rename({'index': 'epoch'}, axis='columns')
     .to_csv(tybalt_hist_file, sep='\t', index=False))
    (pd.concat(adage_training_histories)
     .reset_index()
     .rename({'index': 'epoch'}, axis='columns')
     .to_csv(adage_hist_file, sep='\t', index=False))
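
A hypothetical call to train_models; all literals below (paths, parameter values, dimensionality) are illustrative placeholders:

params = {
    'vae_epochs': 50, 'vae_batch_size': 50, 'vae_lr': 0.0005,
    'dae_epochs': 50, 'dae_batch_size': 50, 'dae_lr': 0.0005,
    'dae_noise': 0.05, 'dae_sparsity': 0,
}

train_models(basename='example_dataset',
             input_train='train_counts.tsv',
             input_test='test_counts.tsv',
             zdim=16,
             paramsD=params,
             out_dir='results',
             num_seeds=5,
             shuffle=False,
             madfile='mad_genes.tsv',
             num_mad_genes=5000)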
Example #5

import numpy as np
import pandas as pd
from tybalt.data_models import DataModel

# args, data_file, how_transform, num_seeds and n_components are assumed to
# come from the script's argument parsing, which is not shown
vae_learning_rate = float(args.vae_learning_rate)
adage_learning_rate = float(args.adage_learning_rate)
adage_noise = float(args.adage_noise)
loss = args.loss
verbose = args.verbose

# Load and process data
sim_df = pd.read_table(data_file)
select_columns = range(0, sim_df.shape[1] - 1)
groups = sim_df['groups']
gene_modules = sim_df.iloc[0, range(0, sim_df.shape[1] - 1)]

sim_sub_df = sim_df.iloc[range(1, sim_df.shape[0]), :]

data_model = DataModel(df=sim_sub_df,
                       select_columns=select_columns,
                       gene_modules=gene_modules)
data_model.transform(how=how_transform)

# Real target output
real_group_data = np.array(
    pd.concat([data_model.df, data_model.other_df],
              axis=1).query('groups == "B"').drop('groups', axis=1))

# Get random seeds
random_seeds = np.random.randint(0, high=1000000, size=num_seeds)

all_results = []
for seed in random_seeds:
    data_model.pca(n_components=n_components)
    data_model.ica(n_components=n_components)
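
The loop is truncated here, and its visible body never uses seed. A sketch of a plausible completion, by analogy with Example #4 (the bookkeeping step is an illustrative placeholder):

for seed in random_seeds:
    np.random.seed(seed)  # make each replicate reproducible

    data_model.pca(n_components=n_components)
    data_model.ica(n_components=n_components)
    data_model.nmf(n_components=n_components)

    # hypothetical bookkeeping: compare reconstructions against real_group_data
    all_results.append({'seed': seed})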
Example #6

import os

import numpy as np
import pandas as pd
from tybalt.data_models import DataModel

# variable name assumed; parallel to z_det and across_alg_det below
weight_det = os.path.join(out_dir,
                          '{}weight_matrix_corr_determinant.tsv'.format(file_pre))
z_det = os.path.join(out_dir,
                     '{}z_matrix_corr_determinant.tsv'.format(file_pre))
across_alg_det = os.path.join(out_dir,
                              '{}alg_corr_determinant.tsv'.format(file_pre))
tybalt_hist_file = os.path.join(out_dir,
                                '{}tybalt_training_hist.tsv'.format(file_pre))
adage_hist_file = os.path.join(out_dir,
                               '{}adage_training_hist.tsv'.format(file_pre))

# Load Data
rnaseq_file = os.path.join('data', 'pancan_scaled_zeroone_rnaseq.tsv.gz')
rnaseq_df = pd.read_table(rnaseq_file, index_col=0)

# Initialize DataModel class with pancancer data
dm = DataModel(df=rnaseq_df)

# Build models
random_seeds = np.random.randint(0, high=1000000, size=num_seeds)
algorithms = ['pca', 'ica', 'nmf', 'adage', 'tybalt']

z_matrices = []
weight_matrices = []
reconstruction_results = []
tybalt_training_histories = []
adage_training_histories = []
for seed in random_seeds:
    dm.pca(n_components=num_components)
    dm.ica(n_components=num_components)
    dm.nmf(n_components=num_components)
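
The loop is cut off here. The *_corr_determinant.tsv file names above suggest each z matrix is summarized by the determinant of its feature correlation matrix, which is near 0 when latent features are redundant and near 1 when they are independent. A minimal sketch of that statistic, assuming this is the metric being saved:

import numpy as np
import pandas as pd

def corr_determinant(z_df):
    """Determinant of the Pearson correlation matrix of latent features."""
    corr = np.corrcoef(z_df.values, rowvar=False)  # features are columns
    return np.linalg.det(corr)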