def prep_data(mutation, clinical_data, key): df = pd.read_csv(mutation, sep='\t', low_memory=False, dtype=str) cancer_type = util.get_cancer_type(mutation) # remove column headers from combined mutation sheet df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')] df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip() number_barcodes_in_mutation_data = df[u'Tumor_Sample_Barcode'].unique().size print 'Number of total sequenced barcodes: ', number_barcodes_in_mutation_data df = util.maybe_clear_non_01s(df, u'Tumor_Sample_Barcode', cancer_type) df['VAF'] = variant_allele_freq.calculate_vaf(df, key.loc[cancer_type]) # Reduce mutation data to patients that also have clinical data df = util.add_identifier_column(df, u'Tumor_Sample_Barcode') df = df.join(clinical_data, on='identifier', how='inner') df.set_index([u'Hugo_Symbol', 'identifier'], inplace=True) # symmetrically filter clinical data down to patients that were also sequenced unique_patients = df.index.get_level_values('identifier').unique() unique_patients_df = pd.DataFrame(unique_patients, index=unique_patients) clinical_data_with_sequenced_patients = clinical_data.join(unique_patients_df, how='inner') num_patients = clinical_data_with_sequenced_patients.shape[0] print 'Number of patients with sequence and clinical data: ', num_patients return df, clinical_data_with_sequenced_patients, num_patients
def main(): copy_number_loc, clinical, outdir = get_options() cnas = os.listdir(copy_number_loc) cnas = util.remove_extraneous_files(cnas) results = pd.DataFrame() for c in cnas: cancer_type = util.get_cancer_type(c) print cancer_type clinical_file = glob.glob( os.path.join(clinical, '*' + cancer_type + '*.txt'))[0] clin = util.get_clinical_data(clinical_file) patient_breaks = count_breaks(os.path.join(copy_number_loc, c)) patient_breaks = patient_breaks.reset_index() patient_breaks = util.maybe_clear_non_01s(patient_breaks, 'Sample', cancer_type) patient_breaks = util.add_identifier_column(patient_breaks, 'Sample') patient_breaks = patient_breaks.set_index('identifier') patient_breaks = patient_breaks.drop('Sample', axis=1) breaks_and_clin = patient_breaks.join(clin, how='inner') breaks_and_clin.to_csv( os.path.join(outdir, cancer_type + '_breaks.csv')) cox = analysis.do_cox(breaks_and_clin.time, breaks_and_clin.censor, breaks_and_clin.breaks) cox['cancer_type'] = cancer_type results = results.append(cox, ignore_index=True) results.to_csv(os.path.join(outdir, 'cox_results.csv'))
def process_input_file(input_file):
    """Build per-patient, per-chromosome interval trees of copy number.

    Args:
        input_file: Path to a tab-separated segment file with 'Sample',
            'Chromosome', 'Start', 'End' and 'Segment_Mean' columns.

    Returns:
        A dict mapping patient identifier to a 24-slot list. Slot `c` holds
        an IntervalTree for chromosome `c` (mapping [Start, End] to
        Segment_Mean), or 0 if that patient has no segments on it.
    """
    segments = pd.read_csv(input_file, sep='\t')
    segments = util.maybe_clear_non_01s(segments, u'Sample', input_file)
    segments = util.add_identifier_column(segments, u'Sample')

    wanted = ('identifier', 'Chromosome', 'Start', 'End', 'Segment_Mean')
    patient_data = {}
    for _, segment in segments.iterrows():
        identifier, chromosome, start, end, copy_number = [
            segment[column] for column in wanted]

        # Add the new patient to the data dict, initializing the chromosome
        # list. Note the length of the list is 24, so we can use chromosome
        # number (and not worry about index 0).
        chromosome_trees = patient_data.setdefault(identifier, [0] * 24)

        # Initialize the interval tree for the new chromosome for this patient
        if chromosome_trees[chromosome] == 0:
            chromosome_trees[chromosome] = IntervalTree()

        # Add the range and copy number in this row to the correct
        # patient/chromosome location. The interval tree implementation uses
        # half-closed intervals, but copy number data uses closed intervals,
        # so we add 1 to the end to ensure our interval tree matches the data.
        chromosome_trees[chromosome][start:end + 1] = copy_number
    return patient_data
def make_corrs(copy_number, rnaseq, mutation, clinical, outdir, genes): cancer_type = util.get_cancer_type(copy_number) clinical_data = util.get_clinical_data(clinical) cnv = pd.read_csv(copy_number, index_col=0) cnv_by_patient = cnv.transpose() rnaseq = pd.read_csv(rnaseq, low_memory=False, sep='\t') rnaseq = rnaseq.drop([0]) rnaseq = rnaseq.set_index('Hybridization REF').astype(np.float) rnaseq = rnaseq.transpose().reset_index() rnaseq = util.maybe_clear_non_01s(rnaseq, 'index', cancer_type) rnaseq = util.add_identifier_column(rnaseq, 'index') rnaseq_clean = rnaseq.set_index('identifier').drop('index', 1).astype(np.float) rnaseq_log2 = rnaseq_clean.apply(np.log2) rnaseq_clipped_log2 = np.clip(rnaseq_log2, 0, np.inf) rna_cnv = cnv_by_patient[genes['Gene']].join(rnaseq_clipped_log2, how='inner') mutation = mutation_base.prep_mutation_data(mutation, clinical_data) print mutation.index included_patients = set(list(mutation.index)) & set(list(rna_cnv.index)) rna_cnv = rna_cnv.loc[included_patients] rna_cnv.T.to_csv(os.path.join(outdir, cancer_type + '_cnv_rnaseq_data.csv')) corr_dict = {} for gene in genes['Gene']: corr = rna_cnv.corrwith(rna_cnv[gene]).drop(genes['Gene']) corr_dict[cancer_type + '_' + gene] = corr return pd.DataFrame(corr_dict)
def prep_data(mutation, clinical_data): df = pd.read_csv(mutation, sep='\t', low_memory=False, dtype=str) cancer_type = util.get_cancer_type(mutation) # remove column headers from combined mutation sheet df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')] df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip() number_barcodes_in_mutation_data = df[u'Tumor_Sample_Barcode'].unique().size print 'Number of total sequenced barcodes: ', number_barcodes_in_mutation_data df = util.maybe_clear_non_01s(df, u'Tumor_Sample_Barcode', cancer_type) df = util.add_identifier_column(df, u'Tumor_Sample_Barcode') # include only nonsilent mutations non_silent = df.where(df[u'Variant_Classification'] != 'Silent') df = non_silent.dropna(subset=[u'Variant_Classification']) df = df.reset_index() df['Hugo_Symbol'] = '\'' + df['Hugo_Symbol'].astype(str) gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene) gene_mutation_df.index.set_names(['Hugo_Symbol', 'patient'], inplace=True) gene_mutation_df = gene_mutation_df.reset_index() gene_patient_mutations = gene_mutation_df.pivot(index='Hugo_Symbol', columns='patient', values='mutated') return gene_patient_mutations.transpose()
def prep_data(mutation, key): df = pd.read_csv(mutation, sep='\t', low_memory=False, dtype=str) cancer_type = util.get_cancer_type(mutation) # remove column headers from combined mutation sheet df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')] df['Hugo_Symbol'] = '\'' + df['Hugo_Symbol'].astype(str) df = df[df[u'Hugo_Symbol'].isin(COMMONLY_MUTATED)] df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip() number_barcodes_in_mutation_data = df[u'Tumor_Sample_Barcode'].unique( ).size print 'Number of total sequenced barcodes: ', number_barcodes_in_mutation_data df = util.maybe_clear_non_01s(df, u'Tumor_Sample_Barcode', cancer_type) df = util.add_identifier_column(df, u'Tumor_Sample_Barcode') # include only nonsilent mutations non_silent = df.where(df[u'Variant_Classification'] != 'Silent') df = non_silent.dropna(subset=[u'Variant_Classification']) df = df.reset_index() df['VAF'] = calculate_vaf(df, key.loc[cancer_type]) # use the largest VAF df = df.groupby(['Hugo_Symbol', 'identifier']).max() df = df.reset_index() pivoted = df.pivot(index='identifier', columns='Hugo_Symbol', values='VAF') minimum_vaf_count = MUTATION_PERCENT * number_barcodes_in_mutation_data enough_patients = pivoted.count() >= minimum_vaf_count too_few_patients = enough_patients[~enough_patients].index.values print 'Genes with too few patients:', too_few_patients pivoted = pivoted.drop(too_few_patients, axis=1) return pivoted
def get_metagene_data(metagene_file, cancer_type):
    """Compute a per-patient metagene value from RNASeq expression.

    Args:
        metagene_file: Path to a CSV with an 'RNASeq' column naming the rows
            of the RNASeq file that make up the metagene.
        cancer_type: Prefix used to locate exactly one file matching
            'rnaseq/<cancer_type>*.txt'.

    Returns:
        A Series named 'metagene', indexed by patient identifier.

    Raises:
        AssertionError: If the glob does not match exactly one RNASeq file.
    """
    matches = glob.glob(os.path.join('rnaseq', cancer_type + '*.txt'))
    assert len(matches) == 1

    expression = pd.read_csv(
        matches[0], sep='\t', low_memory=False, index_col=0)
    metagene_list = pd.read_csv(metagene_file)

    # Select the metagene's rows, then flip to one row per sample so the
    # sample names can be cleaned and converted to identifiers.
    by_sample = expression.loc[metagene_list['RNASeq']].astype(float)
    by_sample = by_sample.transpose().reset_index()
    by_sample = util.maybe_clear_non_01s(by_sample, 'index', cancer_type)
    by_sample = util.add_identifier_column(by_sample, 'index')
    by_sample = by_sample.drop('index', axis=1).set_index('identifier')
    by_gene = by_sample.transpose()

    # now we normalize. take the mean of the base 2 log, then subtract that
    # from each row. then take the average across genes to get the metagene
    # value
    log2_values = np.clip(by_gene.apply(np.log2), 0, np.inf)
    gene_means = log2_values.mean(axis=1)
    centered = log2_values.sub(gene_means, axis=0)

    metagene = centered.mean()
    metagene.name = 'metagene'
    return metagene
def prep_extra_data(extra_data_directory, cancer_type):
    """Load '<cancer_type>.txt' from a directory, indexed by identifier.

    Args:
        extra_data_directory: Directory containing the tab-separated file.
        cancer_type: File stem; also passed to `util.maybe_clear_non_01s`.

    Returns:
        The file's contents as a DataFrame indexed by 'identifier', with
        '-' read as missing and the 'SampleName' column removed.
    """
    path = os.path.join(extra_data_directory, cancer_type + '.txt')
    table = pd.read_csv(path, sep='\t', na_values=['-'])
    table = util.maybe_clear_non_01s(table, 'SampleName', cancer_type)
    table = util.add_identifier_column(table, 'SampleName')
    return table.drop('SampleName', axis=1).set_index('identifier')
def prep_tumor_stage_data(tumor_stage_data_dir, cancer_type):
    """Load '<cancer_type>_clinical.csv', indexed by identifier.

    Args:
        tumor_stage_data_dir: Directory expected to contain the CSV.
        cancer_type: Prefix of the CSV file name.

    Returns:
        `(None, None)` if the file does not exist. Otherwise a tuple of the
        DataFrame (indexed by 'identifier', 'patient_id' column removed) and
        the list of its column names that contain 'group'.
    """
    path = os.path.join(tumor_stage_data_dir, cancer_type + '_clinical.csv')
    if not os.path.isfile(path):
        return None, None

    stages = pd.read_csv(path, sep=',')
    stages = util.add_identifier_column(stages, 'patient_id')
    stages = stages.drop('patient_id', axis=1).set_index('identifier')

    group_columns = [name for name in stages.columns if 'group' in name]
    return stages, group_columns
def prep_mutation_data_alone(mutation): df = pd.read_csv(mutation, sep='\t', low_memory=False, dtype=str) cancer_type = util.get_cancer_type(mutation) # remove column headers from combined mutation sheet df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')] df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip() number_barcodes_in_mutation_data = df[u'Tumor_Sample_Barcode'].unique( ).size print 'Number of total sequenced barcodes: ', number_barcodes_in_mutation_data df = util.maybe_clear_non_01s(df, u'Tumor_Sample_Barcode', cancer_type) df = util.add_identifier_column(df, u'Tumor_Sample_Barcode') return df
def count_codons_in_file(f): cancer_type = util.get_cancer_type(f) print cancer_type df = pd.read_csv(f, sep='\t', low_memory=False) # Some of the columns are named Start_position. Others are Start_Position. some are start_position. :| upper_columns = [i.upper() for i in df.columns] start_pos_index = upper_columns.index('START_POSITION') start_pos = df.columns[start_pos_index] chromosome = u'Chromosome' ncbi_builds = df[u'NCBI_Build'].value_counts() if '36' in ncbi_builds.index: print 'Using translated NCBI build', cancer_type folder = os.path.dirname(f) new_path = os.path.join(folder, 'HG36_HG37', cancer_type + '_hg36_hg37.txt') print new_path df = pd.read_csv(new_path, sep='\t', dtype=str) start_pos = u'hg37_start' chromsome = u'hg37_chr' wild_type_allele_col = u'Reference_Allele' df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip() df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')] df = df[df[u'Variant_Classification'].str.contains( 'Missense')] # only include missense df[u'Hugo_Symbol'] = '\'' + df[u'Hugo_Symbol'].astype(str) df = util.add_identifier_column(df, u'Tumor_Sample_Barcode') # Some files have the same mutation from different samples listed under one patient, # we only care about the number of patients with a given mutation, so drop duplicates df = df.drop_duplicates( subset=[u'Hugo_Symbol', chromosome, start_pos, u'identifier'], keep='last') counts = df.groupby( [u'Hugo_Symbol', chromosome, start_pos, wild_type_allele_col]).size() count_df = pd.DataFrame(counts) count_df.columns = [cancer_type] count_df.index.rename( ['Gene', 'Chromosome', 'Start Position', 'Wild Type Allele'], inplace=True) return count_df