Ejemplo n.º 1
0
# Warn when the number of genes located in the data differs from the number
# requested; the message lists the contents of `missing_genes`.
if len(common_genes) != len(genes):
    warnings.warn(
        ('All input genes were not found in data. The missing genes '
         'are {}').format(missing_genes),
        category=Warning,
    )

# Incorporate copy number for gene activation/inactivation
if copy_number:
    # Read the loss table first, then the gain table; both use their first
    # column as the index.
    copy_loss_df, copy_gain_df = (
        pd.read_table(status_path, index_col=0)
        for status_path in (copy_loss_file, copy_gain_file)
    )

    # Load cancer gene classification table
    cancer_genes = pd.read_table(cancer_gene_file)

    # NOTE(review): integrate_copy_number is defined outside this snippet;
    # its result replaces the mutation status matrix.
    mut_subset_df = integrate_copy_number(
        y=mut_subset_df,
        cancer_genes_df=cancer_genes,
        genes=common_genes,
        loss_df=copy_loss_df,
        gain_df=copy_gain_df,
    )

# Add covariate info to y_matrix
# total_status is the row-wise maximum across the existing columns.
mut_subset_df = mut_subset_df.assign(total_status=mut_subset_df.max(axis=1))

# Move the index into a column so it can be matched on SAMPLE_BARCODE against
# the sample freeze table, then restore SAMPLE_BARCODE as the index.
mut_subset_df = (
    mut_subset_df.reset_index()
    .merge(sample_freeze_df,
           left_on='SAMPLE_BARCODE',
           right_on='SAMPLE_BARCODE')
    .set_index('SAMPLE_BARCODE')
)

# Match the SAMPLE_BARCODE column of the burden table against the index of
# the single-column total_status frame.
y_burden_matrix = (
    mut_burden_df
    .merge(pd.DataFrame(mut_subset_df.total_status),
           left_on='SAMPLE_BARCODE',
           right_index=True)
    .set_index('SAMPLE_BARCODE')
)

# DISEASE labels for exactly the rows present in y_burden_matrix
y_sub = mut_subset_df.loc[y_burden_matrix.index, 'DISEASE']
Ejemplo n.º 2
0
if copy_number:
    # Load copy number matrices (loss first, then gain), using the first
    # column of each file as the index
    copy_loss_df, copy_gain_df = (
        pd.read_table(os.path.join('data', status_file), index_col=0)
        for status_file in ('copy_number_loss_status.tsv',
                            'copy_number_gain_status.tsv')
    )

    # Load cancer gene classification table
    cancer_genes = pd.read_table(
        os.path.join('data', 'vogelstein_cancergenes.tsv')
    )

    # NOTE(review): `no_mutation` is passed as `include_mutation`; both it
    # and integrate_copy_number are defined outside this snippet - confirm
    # the flag's polarity against its definition.
    y = integrate_copy_number(
        y=y,
        cancer_genes_df=cancer_genes,
        genes=common_genes,
        loss_df=copy_loss_df,
        gain_df=copy_gain_df,
        include_mutation=no_mutation,
    )

# Process y matrix
# total_status is the row-wise maximum across the existing columns.
y = y.assign(total_status=y.max(axis=1))

# Left-merge the sample freeze table onto y (merge keys are left to pandas to
# infer), then index the result by SAMPLE_BARCODE.
y = (
    y.reset_index()
    .merge(sample_freeze, how='left')
    .set_index('SAMPLE_BARCODE')
)

# Per-disease column sums, and the same sums divided row-wise by the number
# of rows carrying each DISEASE label.
count_df = y.groupby('DISEASE').sum()
samples_per_disease = y['DISEASE'].value_counts(sort=False).sort_index()
prop_df = count_df.divide(samples_per_disease, axis=0)

# Combine counts and proportions on their shared DISEASE index and save
count_table = count_df.merge(prop_df, left_index=True, right_index=True)
count_table.to_csv(count_table_file)

# Filter diseases