def main(): clinical_dir, output_dir, extra_data_dir = get_options() clinical_files = os.listdir(clinical_dir) clinical_files = util.remove_extraneous_files(clinical_files) zscore_data = {} for f in clinical_files: clinical_path = os.path.join(clinical_dir, f) cancer_type = util.get_cancer_type(f) if cancer_type == 'COADREAD': extra_data = prep_extra_data(extra_data_dir, 'COAD') else: extra_data = prep_extra_data(extra_data_dir, cancer_type) clinical = util.get_clinical_data(clinical_path) clinical = clinical.join(extra_data) purity_header = 'Purity_InfiniumPurify' cox_dict = analysis.do_cox(clinical.time, clinical.censor, clinical[purity_header]) zscore_data[cancer_type] = cox_dict purity_mean = clinical[purity_header].mean() print purity_mean clinical['km_split'] = np.where(clinical[purity_header] <= purity_mean, 0, 1) analysis.do_km(cancer_type, clinical.time, clinical.censor, clinical.km_split, output_dir) out_df = pd.DataFrame(zscore_data).transpose() out_df.to_csv( os.path.join(output_dir, 'add_data_simple_purity_zscores.csv'))
def make_zscores(mutation, clinical, breaks, outdir): clinical_data = util.get_clinical_data(clinical) mut = mutation_base.prep_mutation_data(mutation, clinical_data) cancer_type = util.get_cancer_type(mutation) print cancer_type structural_breaks = pd.read_csv(breaks, index_col=0) structural_breaks = structural_breaks.astype(int) mut_and_breaks = mut.join(structural_breaks, how='inner') num_patients = len(mut_and_breaks) results = [] for gene in mut_and_breaks: if gene in ('time', 'censor', 'breaks'): # skip metadata continue num_mutations = mut_and_breaks[gene].sum() if num_mutations >= MUTATION_PERCENT * num_patients: cox_dict = analysis.do_multivariate_cox(mut_and_breaks.time, mut_and_breaks.censor, mut_and_breaks[gene], mut_and_breaks[['breaks']]) cox_dict['gene'] = gene results.append(cox_dict) results_df = pd.DataFrame(results) results_df = results_df.set_index('gene') results_df.to_csv(os.path.join(outdir, cancer_type + '_mut_cox.csv'))
def main(argv=None): mutation_dir, clinical_dir, outdir, tumor_stage_dir = get_options() clinical_files = os.listdir(clinical_dir) clinical_files = util.remove_extraneous_files(clinical_files) clinical_files = [os.path.join(clinical_dir, f) for f in clinical_files] p = Pool(16) args = [] for clinical in clinical_files: cancer_type = util.get_cancer_type(clinical) print cancer_type mutation = glob.glob( os.path.join(mutation_dir, '*' + cancer_type + '*'))[0] tumor_stage = os.path.join(tumor_stage_dir, cancer_type + '_clinical.csv') if not os.path.isfile(tumor_stage): continue clinical_data = util.get_clinical_data(clinical) cancer_type_outdir = os.path.join(outdir, cancer_type) if not os.path.isdir(cancer_type_outdir): os.makedirs(cancer_type_outdir) args.append((mutation, clinical_data, tumor_stage, cancer_type_outdir)) # calculate_cox(mutation, clinical_data, tumor_stage, cancer_type_outdir) p.map(multiprocess_zscores, args)
def main(argv=None): mutation_dir, clinical_dir, outdir = get_options() clinical_files = os.listdir(clinical_dir) clinical_files = util.remove_extraneous_files(clinical_files) clinical_files = [os.path.join(clinical_dir, f) for f in clinical_files] p = Pool(1) args = [] pancan = {} for clinical in clinical_files: cancer_type = util.get_cancer_type(clinical) print cancer_type mutation = glob.glob( os.path.join(mutation_dir, '*' + cancer_type + '*'))[0] clinical_data = util.get_clinical_data(clinical) #args.append((mutation, clinical_data, outdir)) pancan[cancer_type] = calculate_cox(mutation, clinical_data, outdir) #print args #p.map(multiprocess_zscores, args) pancan_df = pd.DataFrame(pancan) pancan_df = pancan_df.transpose() pancan_df.to_csv(os.path.join(outdir, 'pancan.csv'))
def make_cn_zscores(copy_number, clinical, interesting_genes=None, outdir='.'):
    """Trichotomize copy number for selected genes and fit a Cox model each.

    The gene columns are the entries of ``interesting_genes.index`` prefixed
    with a single quote. For every gene with more than 10 non-null values a
    ``<gene>_split`` column is built (-1 / 0 / 1 around the +/-0.3 cutoffs)
    and passed to ``analysis.do_cox``. The augmented table is written to
    ``<outdir>/<cancer_type>_trichotomized.csv``.

    Returns:
        list of the Cox result objects, each tagged with ``gene`` and
        ``cancer_type``.
    """
    clinical_data = util.get_clinical_data(clinical)
    cnv_by_patient = pd.read_csv(copy_number, index_col=0).transpose()
    cancer_type = util.get_cancer_type(copy_number)

    # NOTE(review): interesting_genes is dereferenced unconditionally, so the
    # None default cannot actually be used; callers must pass a value.
    relevant_genes = list('\'' + interesting_genes.index)
    cnv = cnv_by_patient[relevant_genes].join(clinical_data, how='inner')

    results = []
    for gene in cnv:
        if gene in ('time', 'censor'):  # skip metadata
            continue
        if cnv[gene].count() <= 10:
            continue

        split_col = gene + '_split'
        cnv[split_col] = np.nan
        # Assignment order matters: later masks overwrite earlier ones, so a
        # value of exactly -0.3 ends up 0 and exactly 0.3 ends up 1.
        is_loss = cnv[gene] <= -0.3
        cnv.loc[is_loss, split_col] = -1
        is_neutral = cnv[gene].between(-0.3, 0.3)
        cnv.loc[is_neutral, split_col] = 0
        is_gain = cnv[gene] >= 0.3
        cnv.loc[is_gain, split_col] = 1

        cox_dict = analysis.do_cox(cnv.time, cnv.censor, cnv[split_col])
        cox_dict['gene'] = gene
        cox_dict['cancer_type'] = cancer_type
        results.append(cox_dict)

    cnv.to_csv(os.path.join(outdir, cancer_type + '_trichotomized.csv'))
    return results
def make_clinical_data(clinical_file, histologic_subtype_col, outdir):
    """Load clinical data with the subtype row and hand it to save_subtype_files.

    The subtype row is requested as a non-numeric extra row. Returns whatever
    ``save_subtype_files`` returns.
    """
    cancer_type = util.get_cancer_type(clinical_file)
    extra_rows = [histologic_subtype_col]
    clinical = util.get_clinical_data(clinical_file,
                                      extra_rows=extra_rows,
                                      extra_rows_numeric=False)
    result = save_subtype_files(clinical, histologic_subtype_col, cancer_type,
                                outdir)
    return result
def make_zscores(copy_number, mutation, clinical, outdir, genes):
    """Run calculate_cox per gene on copy number plus TP53 mutation status.

    Copy number (transposed to one row per patient) is inner-joined with the
    clinical data and with the ``'TP53`` mutation column, renamed to
    ``TP53_mutation``. For each entry of ``genes['Gene']`` the gene column,
    ``TP53_mutation``, ``time`` and ``censor`` are passed to
    ``calculate_cox`` and also saved as
    ``<cancer_type>_<gene>_p53_and_cna_data.csv``.

    Returns:
        dict keyed by gene name (first character stripped) holding each Cox
        result with an added ``mutation_count`` entry.
    """
    cancer_type = util.get_cancer_type(copy_number)
    clinical_data = util.get_clinical_data(clinical)
    cnv = pd.read_csv(copy_number, index_col=0)
    mutation = mutation_base.prep_mutation_data(mutation, clinical_data)
    p53_mutation = mutation['\'TP53'].rename('TP53_mutation')

    merged = cnv.transpose().join(clinical_data, how='inner')
    merged = merged.join(p53_mutation, how='inner')

    shared_columns = ['TP53_mutation', 'time', 'censor']
    cox_dicts = {}
    for gene in genes['Gene']:
        bare_name = gene[1:]  # drop the leading quote character
        per_gene = merged[[gene] + shared_columns]

        cox_dict = calculate_cox(per_gene, gene)
        cox_dict['mutation_count'] = per_gene['TP53_mutation'].sum()

        out_name = cancer_type + '_' + bare_name + '_p53_and_cna_data.csv'
        per_gene.to_csv(os.path.join(outdir, out_name))
        cox_dicts[bare_name] = cox_dict
    return cox_dicts
def make_clinical_data(clinical_file, tumor_group_file, outdir, grade): cancer_type = util.get_cancer_type(clinical_file) if grade: row = tumor_stage_util.TUMOR_GRADE if not cancer_type in tumor_stage_util.TUMOR_GRADE_TYPES: return else: row = tumor_stage_util.TUMOR_STAGE[cancer_type] if row: tumor_groups = pd.read_csv(tumor_group_file) tumor_groups = tumor_groups.dropna(how='all') clinical = util.get_clinical_data(clinical_file, extra_rows=[row], extra_rows_numeric=False) clinical[row] = clinical[row].str.strip() included_stages = [] for i, group in tumor_groups.iterrows(): tg = group.dropna().values if len(tg) > 0: print ', '.join(tg) + ': ', \ clinical[clinical[row].isin(tg)][row].count() included_stages.extend(tg) clinical['group_' + str(i)] = np.where( clinical[row].isin(included_stages), 0, 1) clinical = clinical.drop('group_' + str(i), axis=1) clinical = clinical[clinical[row].isin(included_stages)] clinical.to_csv(os.path.join(outdir, cancer_type + '_clinical.csv'), index_label='patient_id')
def make_clinical_data(clinical_file, clinical_variables, outdir):
    """Build the SKCM clinical table with grouped Clark level and gender.

    Loads the clinical file with the age, Breslow, gender, ulceration, stage
    and Clark rows as non-numeric extras, groups Clark and gender via
    ``tumor_stage_util.group_discontinuous_vars``, and writes the table to
    ``<cancer_type>_clinical.csv``. After writing, age and Breslow are
    coerced to numeric, rows without a Breslow value are dropped, and
    ``breslow_0`` is set to 0 where Breslow <= 1 and 1 otherwise.

    Returns:
        the post-processed clinical DataFrame (which differs from the CSV,
        written before the numeric conversion).
    """
    requested_rows = [
        age_r, breslow_r, gender_r, ulceration_r, stage_r, clark_r
    ]
    clinical = util.get_clinical_data(clinical_file,
                                      extra_rows=requested_rows,
                                      extra_rows_numeric=False)

    def read_groups(fname):
        return pd.read_csv(os.path.join(clinical_variables, fname), dtype=str)

    gender_groups = read_groups('SKCM_gender.csv')
    # NOTE(review): the stage and ulceration group files are read but never
    # applied below; the reads are kept so a missing file still fails here.
    stage_groups = read_groups('SKCM_stage.csv')
    clark_groups = read_groups('SKCM_clark.csv')
    ulceration_groups = read_groups('SKCM_ulceration.csv')

    clinical = tumor_stage_util.group_discontinuous_vars(
        clark_r, 'clark', clark_groups, clinical)
    clinical = tumor_stage_util.group_discontinuous_vars(
        gender_r, 'gender', gender_groups, clinical)

    # NOTE(review): cancer_type is not assigned in this function; it must be
    # provided at module level -- confirm.
    out_path = os.path.join(outdir, cancer_type + '_clinical.csv')
    clinical.to_csv(out_path, index_label='patient_id')

    for numeric_row in (age_r, breslow_r):
        clinical[numeric_row] = pd.to_numeric(clinical[numeric_row],
                                              errors='coerce')
    clinical = clinical.dropna(subset=[breslow_r])
    clinical['breslow_0'] = np.where(clinical[breslow_r] <= 1, 0, 1)
    return clinical
def make_corrs(copy_number, rnaseq, mutation, clinical, outdir, genes): cancer_type = util.get_cancer_type(copy_number) clinical_data = util.get_clinical_data(clinical) cnv = pd.read_csv(copy_number, index_col=0) cnv_by_patient = cnv.transpose() rnaseq = pd.read_csv(rnaseq, low_memory=False, sep='\t') rnaseq = rnaseq.drop([0]) rnaseq = rnaseq.set_index('Hybridization REF').astype(np.float) rnaseq = rnaseq.transpose().reset_index() rnaseq = util.maybe_clear_non_01s(rnaseq, 'index', cancer_type) rnaseq = util.add_identifier_column(rnaseq, 'index') rnaseq_clean = rnaseq.set_index('identifier').drop('index', 1).astype(np.float) rnaseq_log2 = rnaseq_clean.apply(np.log2) rnaseq_clipped_log2 = np.clip(rnaseq_log2, 0, np.inf) rna_cnv = cnv_by_patient[genes['Gene']].join(rnaseq_clipped_log2, how='inner') mutation = mutation_base.prep_mutation_data(mutation, clinical_data) print mutation.index included_patients = set(list(mutation.index)) & set(list(rna_cnv.index)) rna_cnv = rna_cnv.loc[included_patients] rna_cnv.T.to_csv(os.path.join(outdir, cancer_type + '_cnv_rnaseq_data.csv')) corr_dict = {} for gene in genes['Gene']: corr = rna_cnv.corrwith(rna_cnv[gene]).drop(genes['Gene']) corr_dict[cancer_type + '_' + gene] = corr return pd.DataFrame(corr_dict)
def main(): copy_number_loc, clinical, outdir = get_options() cnas = os.listdir(copy_number_loc) cnas = util.remove_extraneous_files(cnas) results = pd.DataFrame() for c in cnas: cancer_type = util.get_cancer_type(c) print cancer_type clinical_file = glob.glob( os.path.join(clinical, '*' + cancer_type + '*.txt'))[0] clin = util.get_clinical_data(clinical_file) patient_breaks = count_breaks(os.path.join(copy_number_loc, c)) patient_breaks = patient_breaks.reset_index() patient_breaks = util.maybe_clear_non_01s(patient_breaks, 'Sample', cancer_type) patient_breaks = util.add_identifier_column(patient_breaks, 'Sample') patient_breaks = patient_breaks.set_index('identifier') patient_breaks = patient_breaks.drop('Sample', axis=1) breaks_and_clin = patient_breaks.join(clin, how='inner') breaks_and_clin.to_csv( os.path.join(outdir, cancer_type + '_breaks.csv')) cox = analysis.do_cox(breaks_and_clin.time, breaks_and_clin.censor, breaks_and_clin.breaks) cox['cancer_type'] = cancer_type results = results.append(cox, ignore_index=True) results.to_csv(os.path.join(outdir, 'cox_results.csv'))
def make_clinical_data(clinical_file, clinical_variables, outdir):
    """Build the BRCA clinical table with grouped stage, ER, PR and HER2.

    Loads the clinical file with the age, ER, PR, HER2 and stage rows as
    non-numeric extras and applies
    ``tumor_stage_util.group_discontinuous_vars`` for stage, ER, PR and HER2
    in that order. ``combined_er_pr`` is 1 where ``er_0 & pr_0`` is truthy
    and 0 otherwise. The table is written to ``<cancer_type>_clinical.csv``
    and the age row is then coerced to numeric.

    Returns:
        the clinical DataFrame after the age conversion.
    """
    clinical = util.get_clinical_data(
        clinical_file,
        extra_rows=[age_r, er_r, pr_r, her2_r, stage_r],
        extra_rows_numeric=False)

    def read_groups(fname):
        return pd.read_csv(os.path.join(clinical_variables, fname), dtype=str)

    stage_groups = read_groups('BRCA_stage.csv')
    er_groups = read_groups('BRCA_er.csv')
    pr_groups = read_groups('BRCA_pr.csv')
    her2_groups = read_groups('BRCA_her2.csv')

    grouping_steps = (
        (stage_r, 'stage', stage_groups),
        (er_r, 'er', er_groups),
        (pr_r, 'pr', pr_groups),
        (her2_r, 'her2', her2_groups),
    )
    for source_row, label, groups in grouping_steps:
        clinical = tumor_stage_util.group_discontinuous_vars(
            source_row, label, groups, clinical)

    both_flags = clinical['er_0'] & clinical['pr_0']
    clinical['combined_er_pr'] = np.where(both_flags, 1, 0)

    # NOTE(review): cancer_type is not assigned in this function; it must be
    # provided at module level -- confirm.
    out_path = os.path.join(outdir, cancer_type + '_clinical.csv')
    clinical.to_csv(out_path, index_label='patient_id')

    clinical[age_r] = pd.to_numeric(clinical[age_r], errors='coerce')
    return clinical
def main(argv=None): mutation_dir, clinical_dir, outdir, univariate_output = get_options() clinical_files = os.listdir(clinical_dir) clinical_files = util.remove_extraneous_files(clinical_files) clinical_files = [os.path.join(clinical_dir, f) for f in clinical_files] p = Pool(16) args = [] for clinical in clinical_files: cancer_type = util.get_cancer_type(clinical) print cancer_type mutation = glob.glob(os.path.join(mutation_dir, '*' + cancer_type + '*'))[0] univariate_file = None if univariate_output: univariate_file = glob.glob(os.path.join(univariate_output, cancer_type, cancer_type + '.zscores.out.csv'))[0] print univariate_file clinical_data = util.get_clinical_data(clinical) cancer_type_outdir = os.path.join(outdir, cancer_type) if not os.path.isdir(cancer_type_outdir): os.makedirs(cancer_type_outdir) args.append((mutation, clinical_data, cancer_type_outdir, univariate_file)) # calculate_cox(mutation, clinical_data, cancer_type_outdir, univariate_file=univariate_file) print args p.map(multiprocess_zscores, args)
def main(): clinical_dir, output_dir, header_file = get_options() headers = pd.read_csv(header_file, index_col=0, header=None) clinical_files = os.listdir(clinical_dir) clinical_files = util.remove_extraneous_files(clinical_files) zscore_data = {} for f in clinical_files: clinical_path = os.path.join(clinical_dir, f) cancer_type = util.get_cancer_type(f) purity_header = headers.get_value(cancer_type, 1) clinical = util.get_clinical_data(clinical_path, extra_rows=[purity_header]) cox_dict = analysis.do_cox(clinical.time, clinical.censor, clinical[purity_header]) zscore_data[cancer_type] = cox_dict purity_mean = clinical[purity_header].mean() print purity_mean clinical['km_split'] = np.where(clinical[purity_header] <= purity_mean, 0, 1) analysis.do_km(cancer_type, clinical.time, clinical.censor, clinical.km_split, output_dir) out_df = pd.DataFrame(zscore_data).transpose() out_df.to_csv(os.path.join(output_dir, 'simple_purity_zscores.csv'))
def calculate_cox_for_cancer_type(requested_data, mutation_data, outdir): cancer_type = util.get_cancer_type(mutation_data) clinical = os.path.join('.', 'clinical', cancer_type + '.clin.merged.txt') clinical_data = util.get_clinical_data(clinical) start_pos = None if cancer_type in ['COADREAD', 'OV']: folder = os.path.dirname(mutation_data) mutation_data = os.path.join(folder, 'HG36_HG37', cancer_type + '_hg36_hg37.txt') start_pos = u'hg37_start' df, clinical_with_sequenced_patients, num_patients = zscores_for_mutants.prep_data( mutation_data, clinical_data) if not start_pos: upper_columns = [i.upper() for i in df.columns] start_pos_index = upper_columns.index('START_POSITION') start_pos = df.columns[start_pos_index] patients_with_gene = df.groupby(level=u'Hugo_Symbol') output_data = [] for i, request in requested_data.iteritems(): gene = i[1:] # print gene # print request if gene in patients_with_gene.groups.keys(): patients_with_requested_gene = patients_with_gene.get_group(gene) mutated_at_positions = patients_with_requested_gene[ start_pos].isin(request) # print mutated_at_positions patients_with_requested_positions = patients_with_requested_gene[ mutated_at_positions] ids_with_requested_positions = patients_with_requested_positions.index.get_level_values( 'identifier') if len( ids_with_requested_positions ) >= MUTATION_PERCENT * clinical_with_sequenced_patients.shape[0]: analysis_data = pd.DataFrame( {'mutated': np.ones(len(ids_with_requested_positions))}, index=ids_with_requested_positions) analysis_data = analysis_data.join( clinical_with_sequenced_patients, how='right') analysis_data['mutated'].fillna(0, inplace=True) cox_dict = analysis.do_cox(analysis_data['time'], analysis_data['censor'], analysis_data['mutated']) outdict = {cancer_type + ' p': cox_dict['p']} outdict[cancer_type + ' z'] = cox_dict['z'] outdict[cancer_type + ' mutants'] = len(ids_with_requested_positions) outdict[cancer_type + ' n'] = cox_dict['n'] outdict['gene'] = i outdict['positions'] 
= ':'.join(request) output_data.append(outdict) outdata = pd.DataFrame(output_data) print outdata if len(outdata): outdata = outdata.set_index(['gene', 'positions']) return outdata
def make_mutation_zscores(mutation, clinical, gene_list): cancer_type = util.get_cancer_type(mutation) # get mutation patients clinical_data = util.get_clinical_data(clinical) mutation = mutation_base.prep_mutation_data(mutation, clinical_data) present_gene_list = list( set(gene_list.values) & set(mutation.columns.values)) mutation_gene_list_only = mutation[present_gene_list] mutation_and_clinical = mutation_gene_list_only.join(clinical_data, how='inner') num_patients = len(mutation_and_clinical.index) results = pd.DataFrame() for gene in mutation_and_clinical: if gene in ['time', 'censor']: continue num_mutations = mutation_and_clinical[gene].sum() if num_mutations >= MUTATION_PERCENT * num_patients: cox_dict = analysis.do_cox(mutation_and_clinical.time, mutation_and_clinical.censor, mutation_and_clinical[gene]) cox_dict['cancer_type'] = cancer_type cox_dict['gene'] = gene cox_dict['num_mutations'] = num_mutations results = results.append(cox_dict, ignore_index=True) print results return results
def main(argv=None):
    """Run calculate_cox for one mutation file if its cancer type is in the key.

    The key file is read with its first column as index and ``-`` treated as
    missing; rows that are entirely missing are dropped. Nothing happens when
    the cancer type of the mutation file is not in the key's index. The
    output directory is created if needed.
    """
    args = sys.argv if argv is None else argv
    mutation, clinical, outdir, key_file = get_options(args)

    key = pd.read_csv(key_file, index_col=0, na_values=['-']).dropna(how='all')

    cancer_type = util.get_cancer_type(mutation)
    if cancer_type not in key.index:
        return

    clinical_data = util.get_clinical_data(clinical)
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    calculate_cox(mutation, clinical_data, key, outdir)
def make_zscores(copy_number, mutation, clinical, outdir):
    """Write copy number Cox results split by per-gene mutation status.

    Copy number (one row per patient) is inner-joined with clinical data and
    restricted to patients also present in the mutation table. For every
    gene column, ``calculate_cox`` is run on the "without mutation" subset
    first and the "with mutation" subset second. When the gene has no
    mutation column, the "with" subset is an empty frame and the "without"
    subset is every patient. One line per gene is written to
    ``<cancer_type>.cnv_with_mutation_zscores.csv``.
    """
    clinical_data = util.get_clinical_data(clinical)
    cnv = pd.read_csv(copy_number, index_col=0)
    mutation = prep_mutation_data(mutation, clinical_data)

    clinical_and_cnv = cnv.transpose().join(clinical_data, how='inner')
    shared_patients = list(
        set(mutation.index).intersection(set(clinical_and_cnv.index)))
    clinical_and_cnv_with_mutations = clinical_and_cnv.loc[shared_patients]

    cancer_type = util.get_cancer_type(copy_number)
    outfile = os.path.join(outdir,
                           cancer_type + '.cnv_with_mutation_zscores.csv')
    formatstring = '{0}, {1}, {2}, {3}, {4}, {5}, {6}\n'
    header = (
        'gene,mutated zscore,mutated pvalue,mutated patients,non-mutated zscore, non-mutated pvalue, non-mutated patients\n'
    )

    with open(outfile, 'w') as out:
        out.write(header)
        for gene in clinical_and_cnv_with_mutations:
            if gene in ('time', 'censor'):  # skip metadata
                continue

            clinical_gene = clinical_and_cnv_with_mutations[[
                gene, 'time', 'censor'
            ]]
            if gene in mutation:
                mutations_for_gene = mutation[gene].rename('mutation')
                with_mutation = clinical_gene.join(
                    mutations_for_gene.dropna(), how='inner')
                without_mutation = clinical_gene.join(
                    mutations_for_gene[mutations_for_gene != 1], how='inner')
            else:
                with_mutation = pd.DataFrame({gene: []})
                without_mutation = clinical_gene

            without_cox = calculate_cox(without_mutation, gene)
            with_cox = calculate_cox(with_mutation, gene)
            fields = (gene,
                      with_cox['z'], with_cox['p'], with_cox['n'],
                      without_cox['z'], without_cox['p'], without_cox['n'])
            out.write(formatstring.format(*fields))
def make_zscores(copy_number, clinical, outdir, metagene_file=None): clinical_data = util.get_clinical_data(clinical) df = pd.read_csv(copy_number) df = df.drop(['Chromosome', 'Location'], axis=1) df_by_patient = df.transpose() df_by_patient.columns = df_by_patient.loc['Symbol'] clinical_and_cnv = df_by_patient.join(clinical_data, how='inner') cancer_type = util.get_cancer_type(copy_number) if metagene_file: formatstring = '{0}, {1}, {2}, {3}, {4}, {5}\n' outfile = os.path.join(outdir, cancer_type + '_metagene_zscores.csv') print "Processing metagene..." metagene = metagene_lib.get_metagene_data(metagene_file, cancer_type) print "Complete" else: outfile = os.path.join(outdir, cancer_type + '_zscores.csv') formatstring = '{0}, {1}, {2}, {3}\n' with open(outfile, 'w') as out: if metagene_file: out.write( 'gene,zscore,pvalue,metagene-zscore,metagene-pvalue,num patients\n' ) else: out.write('gene,zscore,pvalue,num patients\n') for gene in clinical_and_cnv: if gene not in ('time', 'censor'): # skip metadata if clinical_and_cnv[gene].count() > 10: if metagene_file: cox_dict = analysis.do_metagene_cox( clinical_and_cnv.time, clinical_and_cnv.censor, clinical_and_cnv[gene], metagene) out.write( formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['metagene-z'], cox_dict['metagene-p'], cox_dict['n'])) else: cox_dict = analysis.do_cox(clinical_and_cnv.time, clinical_and_cnv.censor, clinical_and_cnv[gene]) out.write( formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['n']))
def copy_number_changes(cnv, clinical, outdir, cancer_type_genes): cancer_type = util.get_cancer_type(cnv) print cancer_type clinical = util.get_clinical_data(clinical) copy_numbers = pd.read_csv(cnv, index_col=0) for i, gene in cancer_type_genes.iterrows(): results = pd.DataFrame() gene_name = gene['Gene'] print gene_name gene_cnas = copy_numbers.loc[gene_name] chrom = gene_cnas['Chromosome'] gene_location = copy_numbers.loc[gene_name]['Location'] if gene['Type'] == 'Amplification': threshold_passed = gene_cnas > 0.3 else: threshold_passed = gene_cnas < -0.3 threshold_passed = threshold_passed.drop(['Chromosome', 'Location']) threshold_passed = threshold_passed[threshold_passed] copy_numbers_on_same_chrom = copy_numbers[copy_numbers['Chromosome'] == chrom] for patient in copy_numbers_on_same_chrom: if patient not in clinical.index: continue if patient in ['Chromosome', 'Location']: continue if patient in threshold_passed.index: patient_data = copy_numbers_on_same_chrom[['Location', patient]] patient_data = patient_data.reset_index().sort_values(by='Location') \ .set_index('Location').drop('Symbol') continuous, total = find_continuous_region(patient_data[patient], starting_at=gene_location, alteration_type=gene['Type']) else: continuous, total = (None, None) results[patient] = pd.Series({'continuous_len': continuous, 'chr_len': total, 'fraction': continuous/total if continuous else None, 'copy number': gene_cnas[patient], 'time': clinical.loc[patient].time, 'censor': clinical.loc[patient].censor}) results.transpose().to_csv(os.path.join(outdir, cancer_type + '_' + gene_name[1:] + '.cn_changes.csv'), columns=['time', 'censor', 'copy number', 'fraction', 'continuous_len', 'chr_len'])
def main(): indir, outdir = get_options() clinical_files = os.listdir(indir) clinical_files = util.remove_extraneous_files(clinical_files) stage_row = 'patient.stage_event.pathologic_stage' for clinical_f in clinical_files: f = os.path.join(indir, clinical_f) cancer_type = util.get_cancer_type(clinical_f) stage_row = tumor_stage_util.TUMOR_STAGE[cancer_type] if stage_row: clinical = util.get_clinical_data(f, extra_rows=[stage_row], extra_rows_numeric=False) clinical[stage_row] = clinical[stage_row].str.strip() print cancer_type print clinical[stage_row].value_counts()
def count_tumor_groups(clinical_file, tumor_group_file): cancer_type = util.get_cancer_type(clinical_file) stage_row = tumor_stage_util.TUMOR_STAGE[cancer_type] if stage_row: tumor_groups = pd.read_csv(tumor_group_file) clinical = util.get_clinical_data(clinical_file, extra_rows=[stage_row], extra_rows_numeric=False) clinical[stage_row] = clinical[stage_row].str.strip() included_stages = [] for i, group in tumor_groups.iterrows(): tg = group.dropna().values if len(tg) > 0: print ', '.join(tg) + ': ', \ clinical[clinical[stage_row].isin(tg)][stage_row].count() included_stages.extend(tg) excluded_patients = clinical[~clinical[stage_row].isin(included_stages)] print 'Excluded:' print excluded_patients[stage_row].value_counts()
def main(): clinical_dir, row_names_file, basedir, interesting_genes, outdir = get_options( ) files = os.listdir(clinical_dir) files = util.remove_extraneous_files(files) clinical_by_cancer_type = {util.get_cancer_type(f): f for f in files} row_names = pd.read_csv(row_names_file, header=0) interesting_genes = pd.read_csv(interesting_genes, header=0, index_col=1) for i, row in row_names.iterrows(): cancer_type = row['cancer_type'] cancer_type_fname = cancer_type print cancer_type clinical_file = clinical_by_cancer_type[cancer_type] clinical_file = os.path.join(clinical_dir, clinical_file) if row['histological_subtype_row'] != 'EXTERNAL': clinical_data = make_clinical_data(clinical_file, row['histological_subtype_row'], outdir) else: subtype_data = prep_BRCA_data(row['external_file'], cancer_type) # subtype_data.to_csv(os.path.join(outdir, 'BRCA_annotation_subtype_data.csv')) clinical = util.get_clinical_data(clinical_file) subtype_clinical = clinical.join(subtype_data['subtype'], how='outer') clinical_data = save_subtype_files(subtype_clinical, 'subtype', cancer_type, outdir) cancer_type_fname = 'BRCA_HER2' cna_file = glob.glob(os.path.join(basedir, cancer_type + '*.csv'))[0] cna = pd.read_csv(cna_file, header=0, index_col=0).T genes = '\'' + interesting_genes['Gene'] genes = genes.loc[cancer_type] print genes if type(genes) == str: print cna[[genes]] joined = cna[[genes]].join(clinical_data, how='outer') else: joined = cna[genes].join(clinical_data, how='outer') joined.to_csv(os.path.join(outdir, cancer_type_fname + '.csv'))
def main(argv=None): if argv is None: argv = sys.argv input_directory, clinical, outdir, extra_data_dir = get_options() clinical_files = os.listdir(clinical) clinical_files = util.remove_extraneous_files(clinical_files) args = [] for c in clinical_files: cancer_type = util.get_cancer_type(c) print cancer_type clinical_data = util.get_clinical_data(os.path.join(clinical, c)) copy_number = glob.glob( os.path.join(input_directory, cancer_type + '*.csv'))[0] args.append((copy_number, clinical_data, extra_data_dir, outdir)) # make_zscores(copy_number, clinical_data, extra_data_dir, outdir) p = Pool(4) p.map(multiprocess_zscores, args)
def make_zscores(copy_number, mutation, clinical, outdir, genes): cancer_type = util.get_cancer_type(copy_number) clinical_data = util.get_clinical_data(clinical) cnv = pd.read_csv(copy_number, index_col=0) mutation = mutation_base.prep_mutation_data(mutation, clinical_data) for g in genes['Gene']: if g not in mutation.columns: mutation[g] = 0 print mutation[g] mutations = mutation[genes['Gene']] # cox multivariate won't work if there's a quote in the multivar name, so remove it gene_names = [x[1:] + '_mutations' for x in genes['Gene']] mutations.columns = gene_names cnv_by_patient = cnv.transpose() clinical_and_cnv = cnv_by_patient.join(clinical_data, how='inner') clinical_mutations_and_cnv = clinical_and_cnv.join(mutations, how='inner') cox_dicts = {} for gene in gene_names: plain_gene_name = gene.split('_')[0] # little shenanigans to make the names work. CNAs still have a quote, and # mutations have a suffix clinical_gene = clinical_mutations_and_cnv[[ '\'' + plain_gene_name, gene, 'time', 'censor' ]] cox_dict = calculate_cox(clinical_gene, gene) cox_dict['mutation_count'] = clinical_gene[gene].sum() clinical_gene.to_csv( os.path.join( outdir, cancer_type + '_' + plain_gene_name + '_mutation_and_cna_data.csv')) cox_dicts[plain_gene_name] = cox_dict return cox_dicts
def make_zscores(data, clinical, hypermutated_patients, outdir): clinical_data = util.get_clinical_data(clinical) hypermutated = set(clinical_data.index).intersection(hypermutated_patients['patients']) print 'Hypermutated in clinical file:', len(hypermutated) clinical_data = clinical_data.drop(hypermutated) cancer_type = util.get_cancer_type(data) df = mb.prep_mutation_data(data, clinical_data) print 'Remaining hypermutated:', set(df.index).intersection(hypermutated) num_patients = len(set(clinical_data.index) & set(df.index)) print 'Number of patients present in both:', num_patients clinical_and_data = df.join(clinical_data, how='inner') print 'Num patients, other count:', len(df.index) outfile = os.path.join(outdir, cancer_type + '_non-hypermutated_zscores.csv') formatstring = '{0}, {1}, {2}, {3}, {4}\n' zscore_count = 0 zscore_skipped = 0 with open(outfile, 'w') as out: out.write('gene,zscore,pvalue,num patients,num mutations\n') for gene in clinical_and_data: if gene not in ('time', 'censor', 'index'): # skip metadata num_mutations = clinical_and_data[gene].sum() # print gene, num_mutations if num_mutations >= MUTATION_PERCENT * num_patients: try: cox_dict = analysis.do_cox(clinical_and_data.time, clinical_and_data.censor, clinical_and_data[gene]) out.write(formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['n'], num_mutations)) zscore_count += 1 except rpy2.rinterface.RRuntimeError as e: print 'WARN: skipped ', gene, ' due to R error' zscore_skipped += 1 continue else: zscore_skipped += 1 continue
def main(): clinical_dir, row_names_file, outdir = get_options() files = os.listdir(clinical_dir) files = util.remove_extraneous_files(files) clinical_by_cancer_type = {util.get_cancer_type(f): f for f in files} row_names = pd.read_csv(row_names_file, header=0) for i, row in row_names.iterrows(): cancer_type = row['cancer_type'] print cancer_type clinical_file = clinical_by_cancer_type[cancer_type] clinical_file = os.path.join(clinical_dir, clinical_file) if row['histological_subtype_row'] != 'EXTERNAL': make_clinical_data(clinical_file, row['histological_subtype_row'], outdir) else: subtype_data = prep_BRCA_data(row['external_file'], cancer_type) subtype_data.to_csv(os.path.join(outdir, 'BRCA_annotation_subtype_data.csv')) clinical = util.get_clinical_data(clinical_file) subtype_clinical = clinical.join(subtype_data['subtype'], how='outer') save_subtype_files(subtype_clinical, 'subtype', cancer_type, outdir)
def main(argv=None): if argv is None: argv = sys.argv input_directory, clinical, outdir, extra_clinical_rows_file = get_options() clinical_files = os.listdir(clinical) clinical_files = util.remove_extraneous_files(clinical_files) all_extra_clinical_rows = pd.read_csv(extra_clinical_rows_file, index_col=0, header=None) for c in clinical_files: cancer_type = util.get_cancer_type(c) extra_rows = [all_extra_clinical_rows.loc[cancer_type][1]] print cancer_type clinical_data = util.get_clinical_data(os.path.join(clinical, c), extra_rows=extra_rows) print clinical_data copy_number = glob.glob(os.path.join(input_directory, cancer_type + '*.csv'))[0] print copy_number make_zscores(copy_number, clinical_data, outdir, extra_rows)
def main(argv=None): if argv is None: argv = sys.argv input_directory, clinical, outdir, extra_data_dir = get_options() clinical_files = os.listdir(clinical) clinical_files = util.remove_extraneous_files(clinical_files) extra_data_col = 'Purity_InfiniumPurify' for c in clinical_files[3:]: cancer_type = util.get_cancer_type(c) print cancer_type if cancer_type == 'COADREAD': extra_data = prep_extra_data(extra_data_dir, 'COAD') else: extra_data = prep_extra_data(extra_data_dir, cancer_type) clinical_data = util.get_clinical_data(os.path.join(clinical, c)) copy_number = glob.glob( os.path.join(input_directory, cancer_type + '*.csv'))[0] make_zscores(copy_number, clinical_data, outdir, extra_data, extra_data_col)
def make_cnv_zscores(copy_number, clinical, gene_list):
    """Fit a Cox model on copy number for each gene in gene_list.

    The copy number file is transposed and takes its column labels from the
    ``Symbol`` row, is restricted to ``gene_list``, and is inner-joined with
    the clinical data.

    Returns:
        DataFrame with one row per gene, including ``cancer_type`` and
        ``gene``.
    """
    cancer_type = util.get_cancer_type(copy_number)

    cna_by_patient = pd.read_csv(copy_number).transpose()
    cna_by_patient.columns = cna_by_patient.loc['Symbol']
    selected_cna = cna_by_patient[gene_list]

    clinical_data = util.get_clinical_data(clinical)
    clinical_and_cnv = selected_cna.join(clinical_data, how='inner')

    results = pd.DataFrame()
    for gene in clinical_and_cnv:
        if gene not in ['time', 'censor']:
            cox_dict = analysis.do_cox(clinical_and_cnv.time,
                                       clinical_and_cnv.censor,
                                       clinical_and_cnv[gene])
            cox_dict['cancer_type'] = cancer_type
            cox_dict['gene'] = gene
            results = results.append(cox_dict, ignore_index=True)
    return results