# Warn when the number of genes found in the data differs from the number
# of genes requested
if len(genes) != len(common_genes):
    missing_msg = ('All input genes were not found in data. The missing genes '
                   'are {}'.format(missing_genes))
    warnings.warn(missing_msg, category=Warning)

# Incorporate copy number for gene activation/inactivation
if copy_number:
    copy_loss_df = pd.read_table(copy_loss_file, index_col=0)
    copy_gain_df = pd.read_table(copy_gain_file, index_col=0)

    # Load cancer gene classification table
    cancer_genes = pd.read_table(cancer_gene_file)

    mut_subset_df = integrate_copy_number(
        y=mut_subset_df,
        cancer_genes_df=cancer_genes,
        genes=common_genes,
        loss_df=copy_loss_df,
        gain_df=copy_gain_df,
    )

# Add covariate info to y_matrix.
# total_status is the row-wise maximum across the status columns.
mut_subset_df = mut_subset_df.assign(total_status=mut_subset_df.max(axis=1))

# Attach the sample freeze columns by barcode, then restore the barcode index
mut_subset_df = (
    mut_subset_df.reset_index()
    .merge(sample_freeze_df, on='SAMPLE_BARCODE')
    .set_index('SAMPLE_BARCODE')
)

# Pair each sample's mutation burden row with its total_status
y_burden_matrix = (
    mut_burden_df
    .merge(pd.DataFrame(mut_subset_df['total_status']),
           left_on='SAMPLE_BARCODE', right_index=True)
    .set_index('SAMPLE_BARCODE')
)

# Disease label for every sample kept in the burden matrix, in the same order
y_sub = mut_subset_df.loc[y_burden_matrix.index, 'DISEASE']
# Optionally fold copy number gain/loss status into the y matrix
if copy_number:
    # Load copy number matrices
    copy_loss_df = pd.read_table(
        os.path.join('data', 'copy_number_loss_status.tsv'), index_col=0)
    copy_gain_df = pd.read_table(
        os.path.join('data', 'copy_number_gain_status.tsv'), index_col=0)

    # Load cancer gene classification table
    cancer_genes = pd.read_table(
        os.path.join('data', 'vogelstein_cancergenes.tsv'))

    # NOTE(review): `no_mutation` is passed as `include_mutation`; the names
    # read as opposites -- confirm the flag's polarity where it is defined.
    y = integrate_copy_number(y=y, cancer_genes_df=cancer_genes,
                              genes=common_genes, loss_df=copy_loss_df,
                              gain_df=copy_gain_df,
                              include_mutation=no_mutation)

# Process y matrix: total_status is the row-wise maximum across the columns
y = y.assign(total_status=y.max(axis=1))

# Left merge keeps every row of y; the barcode is restored as the index
y = (y.reset_index()
      .merge(sample_freeze, how='left')
      .set_index('SAMPLE_BARCODE'))

# Per-disease totals. numeric_only=True is explicit because pandas >= 2.0 no
# longer drops non-numeric columns by default; any string column brought in
# by the merge would otherwise be concatenated and break the division below.
count_df = y.groupby('DISEASE').sum(numeric_only=True)

# Per-disease proportions: totals divided by the number of rows per disease
# (the division aligns on the DISEASE index)
disease_sizes = y['DISEASE'].value_counts(sort=False).sort_index()
prop_df = count_df.divide(disease_sizes, axis=0)

# Counts and proportions side by side; overlapping column names receive
# pandas' default merge suffixes
count_table = count_df.merge(prop_df, left_index=True, right_index=True)
count_table.to_csv(count_table_file)

# Filter diseases