def copy_number_changes(cnv, clinical, outdir, cancer_type_genes):
    """Write one copy-number-change table per gene of interest.

    For every row of `cancer_type_genes` a CSV named
    `<cancer type>_<gene name minus its first character>.cn_changes.csv`
    is written to `outdir`, holding one row per patient column that is also
    present in the clinical data.

    Args:
        cnv: path to the copy number CSV. Its first column becomes the index
            and it must contain 'Chromosome' and 'Location' columns.
        clinical: path passed to `util.get_clinical_data`.
        outdir: directory receiving the output files.
        cancer_type_genes: DataFrame with 'Gene' and 'Type' columns.
    """
    cancer_type = util.get_cancer_type(cnv)
    print(cancer_type)
    clinical = util.get_clinical_data(clinical)
    copy_numbers = pd.read_csv(cnv, index_col=0)

    annotation_cols = ['Chromosome', 'Location']
    out_columns = ['time', 'censor', 'copy number', 'fraction',
                   'continuous_len', 'chr_len']

    for _, gene in cancer_type_genes.iterrows():
        results = pd.DataFrame()
        gene_name = gene['Gene']
        print(gene_name)
        gene_cnas = copy_numbers.loc[gene_name]
        chrom = gene_cnas['Chromosome']
        gene_location = gene_cnas['Location']

        # Amplifications must exceed +0.3; every other type must fall
        # below -0.3.
        if gene['Type'] == 'Amplification':
            threshold_passed = gene_cnas > 0.3
        else:
            threshold_passed = gene_cnas < -0.3
        threshold_passed = threshold_passed.drop(annotation_cols)
        # Keep only the entries whose comparison came out True.
        threshold_passed = threshold_passed[threshold_passed]

        same_chrom = copy_numbers[copy_numbers['Chromosome'] == chrom]
        for patient in same_chrom:
            if patient not in clinical.index or patient in annotation_cols:
                continue

            continuous, total = (None, None)
            if patient in threshold_passed.index:
                by_location = same_chrom[['Location', patient]].reset_index()
                by_location = by_location.sort_values(by='Location')
                by_location = by_location.set_index('Location').drop('Symbol')
                continuous, total = find_continuous_region(
                    by_location[patient],
                    starting_at=gene_location,
                    alteration_type=gene['Type'])

            patient_clinical = clinical.loc[patient]
            results[patient] = pd.Series({
                'continuous_len': continuous,
                'chr_len': total,
                'fraction': continuous/total if continuous else None,
                'copy number': gene_cnas[patient],
                'time': patient_clinical.time,
                'censor': patient_clinical.censor,
            })

        out_name = cancer_type + '_' + gene_name[1:] + '.cn_changes.csv'
        results.transpose().to_csv(os.path.join(outdir, out_name),
                                   columns=out_columns)
def main(argv=None): cnv_dir, mutation_dir, clinical_dir, outdir, input_file = get_options() cnv_files = os.listdir(cnv_dir) cnv_files = util.remove_extraneous_files(cnv_files) cnv_files = [os.path.join(cnv_dir, i) for i in cnv_files] interesting_genes = pd.read_csv(input_file, comment='#') print interesting_genes interesting_genes['Gene'] = '\'' + interesting_genes['Gene'] zscore_inputs = [] for cnv in cnv_files: cancer_type = util.get_cancer_type(cnv) cancer_type_genes = interesting_genes[interesting_genes['Cancer Type'] == cancer_type] if len(cancer_type_genes) == 0: continue print cancer_type print cancer_type_genes mutation = glob.glob(os.path.join(mutation_dir, cancer_type + '*'))[0] clinical = glob.glob( os.path.join(clinical_dir, '*' + cancer_type + '*'))[0] zscore_inputs.append( [cnv, mutation, clinical, outdir, cancer_type_genes]) #multiprocess_zscores([cnv, mutation, clinical, outdir, cancer_type_genes]) p = Pool(4) results = p.map(multiprocess_zscores, zscore_inputs) print results with open(os.path.join(outdir, 'cox_results.csv'), 'w') as out: formatstr = '{},{},{},{},{},{},{},{}\n' out.write( 'Cancer Type,Gene,CNA Z Score, CNA P value, Mutation Z score, Mutation P Value, Mutation Count, n\n' ) for coxs in results: cancer_type = coxs.keys()[0] print cancer_type for gene, cox_dict in coxs[cancer_type].iteritems(): print gene, cox_dict out.write( formatstr.format(cancer_type, gene, cox_dict['var-z'], cox_dict['var-p'], cox_dict[gene + '_mutations-z'], cox_dict[gene + '_mutations-p'], cox_dict['mutation_count'], cox_dict['var-n']))
def pancan_fdr(directory, files, outname):
    """Join per-cancer-type FDR tables with the pan-cancer Stouffer table.

    Each file in `files` is run through `single_zscore_file_fdr` and its
    columns are prefixed with the cancer type (the part of
    `util.get_cancer_type(f)` before the first underscore). The result of
    `stouffer_fdr` on `<directory>/pancan.csv` is appended with a 'pancan '
    prefix, and the combined table is written to `<directory>/<outname>`.
    """
    combined = pd.DataFrame()
    for f in files:
        cancer_type = util.get_cancer_type(f).split('_')[0]
        print(cancer_type)
        per_type = single_zscore_file_fdr(f).add_prefix(cancer_type + ' ')
        combined = pd.concat((combined, per_type), axis=1)

    pancan_path = os.path.join(directory, 'pancan.csv')
    stouffer = stouffer_fdr(pancan_path).add_prefix('pancan ')
    combined = pd.concat((combined, stouffer), axis=1, verify_integrity=True)
    combined.to_csv(os.path.join(directory, outname))
def main(argv=None):
    """Compute variant allele distributions for every keyed cancer type.

    Options come from `get_options()`; `argv` is accepted but not read.
    Mutation files whose cancer type is absent from the key file's index
    are skipped.

    The original body also started a one-process `Pool` and an `args`
    list that were never used; both were removed so no idle worker process
    is spawned and left unclosed.
    """
    mutation_dir, key_file, outdir = get_options()
    # mutation_dir is concatenated directly with the pattern, so it acts as
    # a glob prefix and needs its own trailing separator.
    mutation_files = glob.glob(mutation_dir + '*txt')
    key = pd.read_csv(key_file, na_values=['-'], index_col=0)
    key = key.dropna(how='all')
    print(key)

    # Per-cancer-type results; collected here but not used further in this
    # function.
    pancan = {}
    for mutation in mutation_files:
        cancer_type = util.get_cancer_type(mutation)
        if cancer_type not in key.index:
            continue
        print(cancer_type)
        pancan[cancer_type] = calculate_variant_allele_distribution(
            cancer_type, mutation, key, outdir)
def main():
    """Print the tumor stage value counts for every clinical file.

    The stage column for each cancer type is looked up in
    `tumor_stage_util.TUMOR_STAGE`; cancer types whose entry is falsy are
    skipped. `outdir` is read from the options but not used here.
    """
    indir, outdir = get_options()
    clinical_files = util.remove_extraneous_files(os.listdir(indir))

    for clinical_f in clinical_files:
        path = os.path.join(indir, clinical_f)
        cancer_type = util.get_cancer_type(clinical_f)
        stage_row = tumor_stage_util.TUMOR_STAGE[cancer_type]
        if not stage_row:
            continue

        clinical = util.get_clinical_data(path,
                                          extra_rows=[stage_row],
                                          extra_rows_numeric=False)
        # Surrounding whitespace is stripped before counting.
        clinical[stage_row] = clinical[stage_row].str.strip()
        print(cancer_type)
        print(clinical[stage_row].value_counts())
def count_tumor_groups(clinical_file, tumor_group_file): cancer_type = util.get_cancer_type(clinical_file) stage_row = tumor_stage_util.TUMOR_STAGE[cancer_type] if stage_row: tumor_groups = pd.read_csv(tumor_group_file) clinical = util.get_clinical_data(clinical_file, extra_rows=[stage_row], extra_rows_numeric=False) clinical[stage_row] = clinical[stage_row].str.strip() included_stages = [] for i, group in tumor_groups.iterrows(): tg = group.dropna().values if len(tg) > 0: print ', '.join(tg) + ': ', \ clinical[clinical[stage_row].isin(tg)][stage_row].count() included_stages.extend(tg) excluded_patients = clinical[~clinical[stage_row].isin(included_stages)] print 'Excluded:' print excluded_patients[stage_row].value_counts()
def count_codons(data, outdir):
    """Count codons in every mutation file in `data` and write one table.

    The per-file results of `count_codons_in_file` are concatenated
    column-wise, a 'sum' column is added, and the table is written to
    'codon_counts.csv'.

    Args:
        data: directory containing the mutation files.
        outdir: currently unused.
            NOTE(review): the output goes to the working directory rather
            than `outdir` -- confirm whether that is intended.

    Changes from the original: the 'HG36_HG37' entry is filtered out
    instead of removed with `list.remove`, so a directory without that
    entry no longer raises ValueError; the unused locals were dropped.
    """
    files = util.remove_extraneous_files(os.listdir(data))
    # 'HG36_HG37' is not an input file; skip it whether or not it exists.
    files = [f for f in files if f != 'HG36_HG37']

    outdata = [count_codons_in_file(os.path.join(data, f)) for f in files]

    df = pd.concat(outdata, axis=1, verify_integrity=True)
    df['sum'] = df.sum(axis=1)
    df.to_csv('codon_counts.csv', index_label=[
        'Gene', 'Chromosome', 'Start Position', 'Wild Type Allele'
    ])
def main(argv=None):
    """Run `calculate_cox` for one mutation file if its cancer type is keyed.

    Args:
        argv: argument list forwarded to `get_options`; defaults to
            `sys.argv`.

    Nothing is computed when the cancer type of the mutation file is not
    in the key file's index. The output directory is created if missing.
    """
    argv = sys.argv if argv is None else argv
    mutation, clinical, outdir, metagene_file, key_file = get_options(argv)

    key = pd.read_csv(key_file, index_col=0, na_values=['-']).dropna(how='all')
    print(key)

    cancer_type = util.get_cancer_type(mutation)
    if cancer_type not in key.index:
        return

    clinical_data = util.get_clinical_data(clinical)
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    calculate_cox(mutation, clinical_data, key, outdir,
                  metagene_file=metagene_file)
def main(argv=None):
    """Run the copy number change analysis for each cancer type of interest.

    Options come from `get_options()`; `argv` is accepted but not read.
    For every copy number file whose cancer type has at least one row in
    the genes-of-interest file, `multiprocess_copy_number_changes` is
    called directly with `[cnv, clinical, outdir, genes]`.
    """
    cnv_dir, clinical_dir, outdir, input_file = get_options()
    cnv_names = util.remove_extraneous_files(os.listdir(cnv_dir))
    cnv_files = [os.path.join(cnv_dir, name) for name in cnv_names]

    interesting_genes = pd.read_csv(input_file, comment='#')
    # Gene names get a leading single quote prepended.
    interesting_genes['Gene'] = '\'' + interesting_genes['Gene']

    for cnv in cnv_files:
        cancer_type = util.get_cancer_type(cnv)
        wanted = interesting_genes['Cancer Type'] == cancer_type
        cancer_type_genes = interesting_genes[wanted]
        if len(cancer_type_genes) == 0:
            continue
        # The first glob match is used; no match raises IndexError.
        pattern = os.path.join(clinical_dir, cancer_type + '*')
        clinical = glob.glob(pattern)[0]
        multiprocess_copy_number_changes(
            [cnv, clinical, outdir, cancer_type_genes])
def make_zscores(data, clinical, outdir): subtype = clinical.split('.')[1] print clinical clinical_data = pd.read_csv(clinical, index_col=0, header=0) print clinical_data clinical_data = clinical_data.dropna(subset=['time', 'censor'], how='any') subtype_col = clinical_data.columns[-1] print subtype_col cancer_type = util.get_cancer_type(data) df = prep_data(data) print df print cancer_type print 'Number of patients present in both:', len(set(clinical_data.index) & set(df.index)) clinical_and_data = df.join(clinical_data, how='inner') outfile = os.path.join(outdir, cancer_type + '_' + subtype + '_zscores.csv') formatstring = '{0}, {1}, {2}, {3}\n' zscore_count = 0 zscore_skipped = 0 with open(outfile, 'w') as out: out.write('gene,zscore,pvalue,num patients\n') for gene in clinical_and_data: if gene not in ('time', 'censor', 'index', subtype_col): # skip metadata if clinical_and_data[gene].count() <= 10: zscore_skipped += 1 continue try: cox_dict = analysis.do_cox(clinical_and_data.time, clinical_and_data.censor, clinical_and_data[gene]) out.write(formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['n'])) zscore_count += 1 except rpy2.rinterface.RRuntimeError as e: print 'WARN: skipped ', gene, ' due to R error' zscore_skipped += 1 continue print 'Total:', clinical_and_data.shape[1] - 3 # minus time, censor, index print 'Output length:', zscore_count print 'Skipped:', zscore_skipped
def count_codons_in_file(f): cancer_type = util.get_cancer_type(f) print cancer_type df = pd.read_csv(f, sep='\t', low_memory=False) # Some of the columns are named Start_position. Others are Start_Position. some are start_position. :| upper_columns = [i.upper() for i in df.columns] start_pos_index = upper_columns.index('START_POSITION') start_pos = df.columns[start_pos_index] chromosome = u'Chromosome' ncbi_builds = df[u'NCBI_Build'].value_counts() if '36' in ncbi_builds.index: print 'Using translated NCBI build', cancer_type folder = os.path.dirname(f) new_path = os.path.join(folder, 'HG36_HG37', cancer_type + '_hg36_hg37.txt') print new_path df = pd.read_csv(new_path, sep='\t', dtype=str) start_pos = u'hg37_start' chromsome = u'hg37_chr' wild_type_allele_col = u'Reference_Allele' df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip() df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')] df = df[df[u'Variant_Classification'].str.contains( 'Missense')] # only include missense df[u'Hugo_Symbol'] = '\'' + df[u'Hugo_Symbol'].astype(str) df = util.add_identifier_column(df, u'Tumor_Sample_Barcode') # Some files have the same mutation from different samples listed under one patient, # we only care about the number of patients with a given mutation, so drop duplicates df = df.drop_duplicates( subset=[u'Hugo_Symbol', chromosome, start_pos, u'identifier'], keep='last') counts = df.groupby( [u'Hugo_Symbol', chromosome, start_pos, wild_type_allele_col]).size() count_df = pd.DataFrame(counts) count_df.columns = [cancer_type] count_df.index.rename( ['Gene', 'Chromosome', 'Start Position', 'Wild Type Allele'], inplace=True) return count_df
def main():
    """Join copy number data for genes of interest with subtype clinical data.

    For each row of the row-names file, clinical data is built either by
    `make_clinical_data` or, when the subtype row is 'EXTERNAL', from the
    external annotation via `prep_BRCA_data` and `save_subtype_files`. The
    copy number columns for that cancer type's genes are outer-joined with
    the clinical data and written to `<outdir>/<name>.csv`, where the name
    is the cancer type, or 'BRCA_HER2' for the external case.
    """
    clinical_dir, row_names_file, basedir, interesting_genes, outdir = get_options()
    files = util.remove_extraneous_files(os.listdir(clinical_dir))
    clinical_by_cancer_type = {util.get_cancer_type(f): f for f in files}

    row_names = pd.read_csv(row_names_file, header=0)
    interesting_genes = pd.read_csv(interesting_genes, header=0, index_col=1)

    for _, row in row_names.iterrows():
        cancer_type = row['cancer_type']
        cancer_type_fname = cancer_type
        print(cancer_type)
        clinical_file = os.path.join(clinical_dir,
                                     clinical_by_cancer_type[cancer_type])

        if row['histological_subtype_row'] == 'EXTERNAL':
            subtype_data = prep_BRCA_data(row['external_file'], cancer_type)
            clinical = util.get_clinical_data(clinical_file)
            subtype_clinical = clinical.join(subtype_data['subtype'],
                                             how='outer')
            clinical_data = save_subtype_files(subtype_clinical, 'subtype',
                                               cancer_type, outdir)
            cancer_type_fname = 'BRCA_HER2'
        else:
            clinical_data = make_clinical_data(
                clinical_file, row['histological_subtype_row'], outdir)

        # The first glob match is used; no match raises IndexError.
        cna_file = glob.glob(os.path.join(basedir, cancer_type + '*.csv'))[0]
        cna = pd.read_csv(cna_file, header=0, index_col=0).T

        genes = '\'' + interesting_genes['Gene']
        genes = genes.loc[cancer_type]
        print(genes)

        # A single matching row yields a plain string rather than a
        # Series; wrap it so the selection below is still a DataFrame.
        single_gene = type(genes) == str
        columns = [genes] if single_gene else genes
        if single_gene:
            print(cna[columns])
        joined = cna[columns].join(clinical_data, how='outer')
        joined.to_csv(os.path.join(outdir, cancer_type_fname + '.csv'))
def main(argv=None):
    """Build one work item per clinical file and run them in a 4-process pool.

    Options come from `get_options()`. Each work item is the tuple
    `(copy_number_path, clinical_data, extra_data_dir, outdir)` and is
    handed to `multiprocess_zscores`.
    """
    if argv is None:
        argv = sys.argv
    input_directory, clinical, outdir, extra_data_dir = get_options()
    clinical_files = util.remove_extraneous_files(os.listdir(clinical))

    args = []
    for c in clinical_files:
        cancer_type = util.get_cancer_type(c)
        print(cancer_type)
        clinical_data = util.get_clinical_data(os.path.join(clinical, c))
        # The first glob match is used; no match raises IndexError.
        pattern = os.path.join(input_directory, cancer_type + '*.csv')
        copy_number = glob.glob(pattern)[0]
        args.append((copy_number, clinical_data, extra_data_dir, outdir))

    p = Pool(4)
    p.map(multiprocess_zscores, args)
def main(argv=None):
    """Run the copy number z-score analysis for every copy number file.

    Options come from `get_options()`; `argv` is accepted but not read.
    The genes-of-interest file is optional: when no path is given, `None`
    is passed through instead of a DataFrame.

    Change from the original: every file used to be processed twice, once
    inline through `make_cn_zscores` while the work list was built and
    again through the pool. Only the pooled run is kept.
    NOTE(review): this assumes `multiprocess_cn_zscores` unpacks its
    argument list and calls `make_cn_zscores` -- confirm.
    The pool is now closed and joined once the map has finished.
    """
    cnv_dir, structural_breaks, interesting_genes_file, outdir = get_options()
    cnv_names = util.remove_extraneous_files(os.listdir(cnv_dir))
    cnv_files = [os.path.join(cnv_dir, name) for name in cnv_names]

    interesting_genes = None
    if interesting_genes_file:
        interesting_genes = pd.read_csv(interesting_genes_file)

    zscore_inputs = []
    for cnv in cnv_files:
        cancer_type = util.get_cancer_type(cnv)
        # The first glob match is used; no match raises IndexError.
        breaks = glob.glob(
            os.path.join(structural_breaks, '*' + cancer_type + '*'))[0]
        zscore_inputs.append([cnv, breaks, interesting_genes, outdir])

    p = Pool(4)
    p.map(multiprocess_cn_zscores, zscore_inputs)
    p.close()
    p.join()
def main(argv=None):
    """Compute per-codon Cox z-scores for every file in the input directory.

    Options come from `get_options()`. The per-file results of
    `calculate_cox_for_cancer_type` are concatenated column-wise and
    written to 'scratch/zscores_by_codon_2_percent.csv', a path relative
    to the working directory.

    Change from the original: '.DS_Store' and 'HG36_HG37' are filtered out
    of the listing rather than removed with `list.remove`, which raised
    ValueError whenever either entry was absent.
    """
    if argv is None:
        argv = sys.argv
    infile, indir, outdir = get_options()
    requested_data = read_requested_data(infile)

    not_inputs = ('.DS_Store', 'HG36_HG37')
    files = [f for f in os.listdir(indir) if f not in not_inputs]

    output_data = []
    for f in files:
        cancer_type = util.get_cancer_type(f)
        print(cancer_type)
        zscores = calculate_cox_for_cancer_type(
            requested_data, os.path.join(indir, f), outdir)
        output_data.append(zscores)

    df = pd.concat(output_data, axis=1)
    df.to_csv('scratch/zscores_by_codon_2_percent.csv')
def make_zscores(data, clinical, hypermutated_patients, outdir): clinical_data = util.get_clinical_data(clinical) hypermutated = set(clinical_data.index).intersection(hypermutated_patients['patients']) print 'Hypermutated in clinical file:', len(hypermutated) clinical_data = clinical_data.drop(hypermutated) cancer_type = util.get_cancer_type(data) df = mb.prep_mutation_data(data, clinical_data) print 'Remaining hypermutated:', set(df.index).intersection(hypermutated) num_patients = len(set(clinical_data.index) & set(df.index)) print 'Number of patients present in both:', num_patients clinical_and_data = df.join(clinical_data, how='inner') print 'Num patients, other count:', len(df.index) outfile = os.path.join(outdir, cancer_type + '_non-hypermutated_zscores.csv') formatstring = '{0}, {1}, {2}, {3}, {4}\n' zscore_count = 0 zscore_skipped = 0 with open(outfile, 'w') as out: out.write('gene,zscore,pvalue,num patients,num mutations\n') for gene in clinical_and_data: if gene not in ('time', 'censor', 'index'): # skip metadata num_mutations = clinical_and_data[gene].sum() # print gene, num_mutations if num_mutations >= MUTATION_PERCENT * num_patients: try: cox_dict = analysis.do_cox(clinical_and_data.time, clinical_and_data.censor, clinical_and_data[gene]) out.write(formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['n'], num_mutations)) zscore_count += 1 except rpy2.rinterface.RRuntimeError as e: print 'WARN: skipped ', gene, ' due to R error' zscore_skipped += 1 continue else: zscore_skipped += 1 continue
def make_zscores(copy_number, mutation, clinical, outdir, genes):
    """Run a combined CNA + mutation Cox model for each gene of interest.

    Args:
        copy_number: path to a copy number CSV (first column is the index).
        mutation: mutation file path for
            `mutation_base.prep_mutation_data`.
        clinical: path passed to `util.get_clinical_data`.
        outdir: directory for the per-gene
            `<cancer type>_<gene>_mutation_and_cna_data.csv` files.
        genes: DataFrame whose 'Gene' column holds the gene names, each
            with a one-character prefix that is stripped for output.

    Returns:
        dict mapping the plain gene name to the dict returned by
        `calculate_cox`, extended with a 'mutation_count' entry.
    """
    cancer_type = util.get_cancer_type(copy_number)
    clinical_data = util.get_clinical_data(clinical)
    cnv = pd.read_csv(copy_number, index_col=0)
    mutation = mutation_base.prep_mutation_data(mutation, clinical_data)

    # Genes with no mutation column get an all-zero one.
    for g in genes['Gene']:
        if g not in mutation.columns:
            mutation[g] = 0
        print(mutation[g])
    mutations = mutation[genes['Gene']]

    # Cox multivariate won't work if there's a quote in the multivar name,
    # so the leading character is removed and a suffix added instead.
    gene_names = [x[1:] + '_mutations' for x in genes['Gene']]
    mutations.columns = gene_names

    clinical_and_cnv = cnv.transpose().join(clinical_data, how='inner')
    clinical_mutations_and_cnv = clinical_and_cnv.join(mutations, how='inner')

    cox_dicts = {}
    for gene in gene_names:
        plain_gene_name = gene.split('_')[0]
        # CNA columns still carry the leading quote, while mutation columns
        # carry the suffix.
        wanted = ['\'' + plain_gene_name, gene, 'time', 'censor']
        clinical_gene = clinical_mutations_and_cnv[wanted]

        cox_dict = calculate_cox(clinical_gene, gene)
        cox_dict['mutation_count'] = clinical_gene[gene].sum()

        out_name = (cancer_type + '_' + plain_gene_name +
                    '_mutation_and_cna_data.csv')
        clinical_gene.to_csv(os.path.join(outdir, out_name))
        cox_dicts[plain_gene_name] = cox_dict
    return cox_dicts
def main(argv=None):
    """Run `make_zscores` with one extra clinical row for every clinical file.

    Options come from `get_options()`. The extra-rows file is read without
    a header and indexed by its first column; the value in column 1 for
    each cancer type is the extra clinical row requested from
    `util.get_clinical_data`.
    """
    if argv is None:
        argv = sys.argv
    input_directory, clinical, outdir, extra_clinical_rows_file = get_options()
    clinical_files = util.remove_extraneous_files(os.listdir(clinical))
    all_extra_clinical_rows = pd.read_csv(extra_clinical_rows_file,
                                          index_col=0, header=None)

    for c in clinical_files:
        cancer_type = util.get_cancer_type(c)
        extra_rows = [all_extra_clinical_rows.loc[cancer_type][1]]
        print(cancer_type)

        clinical_path = os.path.join(clinical, c)
        clinical_data = util.get_clinical_data(clinical_path,
                                               extra_rows=extra_rows)
        print(clinical_data)

        # The first glob match is used; no match raises IndexError.
        pattern = os.path.join(input_directory, cancer_type + '*.csv')
        copy_number = glob.glob(pattern)[0]
        print(copy_number)

        make_zscores(copy_number, clinical_data, outdir, extra_rows)
def all_cancer_types(copy_number_dir, clinical_dir, outdir, parallel_workers=0):
    """Run `make_zscores` for every copy number file, inline or in a pool.

    Args:
        copy_number_dir: directory of copy number files.
        clinical_dir: directory searched for a clinical file whose name
            contains the cancer type.
        outdir: output directory passed through to the workers.
        parallel_workers: 0 (the default) processes each file inline; any
            other value collects the work and maps it over a
            `multiprocessing.Pool` of that size.

    Fix: the pool used to be created unconditionally, so the default
    `parallel_workers=0` raised ValueError from `Pool(0)` after the inline
    run. The pool is now only built when work was deferred to it, and it
    is closed and joined afterwards.
    """
    copy_number_files = util.remove_extraneous_files(
        os.listdir(copy_number_dir))

    args = []
    for c in copy_number_files:
        infile = os.path.join(copy_number_dir, c)
        cancer_type = util.get_cancer_type(infile)
        # The first glob match is used; no match raises IndexError.
        clinical_file = glob.glob(
            os.path.join(clinical_dir, '*' + cancer_type + '*'))[0]

        if parallel_workers == 0:
            make_zscores(infile, clinical_file, outdir)
        else:
            args.append((infile, clinical_file, outdir))

    # Same condition as inside the loop, so an invalid (negative) worker
    # count still surfaces as an error from Pool.
    if parallel_workers != 0:
        p = multiprocessing.Pool(parallel_workers)
        p.map(multiprocess, args)
        p.close()
        p.join()
def main(argv=None):
    """Queue one z-score job per mutation file and run them in a pool of 4.

    Options come from `get_options()`; `argv` is accepted but not read.
    Each job is `[mutation_path, clinical_path, breaks_path, outdir]` and
    is handed to `multiprocess_zscores`.
    """
    mutation_dir, clinical_dir, structural_breaks, outdir = get_options()
    mut_names = util.remove_extraneous_files(os.listdir(mutation_dir))
    mut_files = [os.path.join(mutation_dir, name) for name in mut_names]

    zscore_inputs = []
    for mut in mut_files:
        # NOTE(review): the underscore test sees the full joined path, so
        # an underscore anywhere in mutation_dir skips every file.
        if '_' in mut:
            continue
        cancer_type = util.get_cancer_type(mut)
        print(cancer_type)

        # The first glob match is used in both cases; no match raises
        # IndexError.
        name_pattern = '*' + cancer_type + '*'
        clinical = glob.glob(os.path.join(clinical_dir, name_pattern))[0]
        breaks = glob.glob(os.path.join(structural_breaks, name_pattern))[0]
        zscore_inputs.append([mut, clinical, breaks, outdir])

    p = Pool(4)
    p.map(multiprocess_zscores, zscore_inputs)
def main():
    """Produce subtype clinical files for every row of the row-names file.

    Rows whose 'histological_subtype_row' is anything but 'EXTERNAL' go
    straight to `make_clinical_data`. For 'EXTERNAL' rows the external
    annotation is loaded with `prep_BRCA_data`, saved as
    'BRCA_annotation_subtype_data.csv', outer-joined onto the clinical
    data and passed to `save_subtype_files`.
    """
    clinical_dir, row_names_file, outdir = get_options()
    files = util.remove_extraneous_files(os.listdir(clinical_dir))
    clinical_by_cancer_type = {util.get_cancer_type(f): f for f in files}
    row_names = pd.read_csv(row_names_file, header=0)

    for _, row in row_names.iterrows():
        cancer_type = row['cancer_type']
        print(cancer_type)
        clinical_file = os.path.join(clinical_dir,
                                     clinical_by_cancer_type[cancer_type])

        subtype_row = row['histological_subtype_row']
        if subtype_row != 'EXTERNAL':
            make_clinical_data(clinical_file, subtype_row, outdir)
            continue

        subtype_data = prep_BRCA_data(row['external_file'], cancer_type)
        subtype_data.to_csv(
            os.path.join(outdir, 'BRCA_annotation_subtype_data.csv'))
        clinical = util.get_clinical_data(clinical_file)
        subtype_clinical = clinical.join(subtype_data['subtype'], how='outer')
        save_subtype_files(subtype_clinical, 'subtype', cancer_type, outdir)
def main():
    """Run `make_zscores` for each clinical file against its data file.

    The hypermutated patient list is read as a headerless CSV with one
    column named 'patients'. Data files are looked up by the cancer type
    from `util.get_cancer_type`; for clinical files the cancer type is the
    text before the first '.' in the file name.
    """
    basedir, clinical_dir, hypermutated_patients, outdir = get_options()
    hypermutated = pd.read_csv(hypermutated_patients, header=None,
                               names=['patients'])

    data_files = util.remove_extraneous_files(os.listdir(basedir))
    data_files_by_cancer_type = {util.get_cancer_type(f): f for f in data_files}

    clinical_files = util.remove_extraneous_files(os.listdir(clinical_dir))
    for clinical in clinical_files:
        cancer_type = clinical.split('.')[0]
        data_path = os.path.join(basedir,
                                 data_files_by_cancer_type[cancer_type])
        clinical_path = os.path.join(clinical_dir, clinical)
        make_zscores(data_path, clinical_path, hypermutated, outdir)
def all_cancer_types(mutation_dir, clinical_dir, outdir, metagene=None, parallel_workers=0):
    """Run `calculate_cox` for every mutation file, inline or in a pool.

    Args:
        mutation_dir: directory of mutation files.
        clinical_dir: directory searched for a clinical file whose name
            contains the cancer type.
        outdir: output directory passed through to the workers.
        metagene: forwarded as `metagene_file` (inline) or as the last
            element of each pooled argument list.
        parallel_workers: 0 runs every file inline; a positive value maps
            the collected arguments over a pool of that many processes.
    """
    names = util.remove_extraneous_files(os.listdir(mutation_dir))
    mutation_files = [os.path.join(mutation_dir, name) for name in names]

    args = []
    for m in mutation_files:
        cancer_type = util.get_cancer_type(m)
        # The first glob match is used; no match raises IndexError.
        pattern = os.path.join(clinical_dir, '*' + cancer_type + '*')
        clinical = glob.glob(pattern)[0]
        if parallel_workers == 0:
            calculate_cox(m, clinical, outdir, metagene_file=metagene)
        else:
            args.append([m, clinical, outdir, metagene])

    if parallel_workers <= 0:
        return
    p = multiprocessing.Pool(parallel_workers)
    p.map(multiprocess_zscores, args)
def main():
    """Compare the 'breaks' column between TP53 wild-type and mutant patients.

    For each mutation file without an underscore in its name, the 'breaks'
    values are split by whether the `'TP53` column is 0. Five quantiles
    are taken for each group, and a Mann-Whitney U p-value is stored on
    the wild-type row. All rows are written to
    `<outdir>/breaks_and_p53_quantiles.csv`, indexed by cancer type and
    mutation status.
    """
    mutation_dir, clinical_dir, outdir = get_options()
    mutation_files = util.remove_extraneous_files(os.listdir(mutation_dir))

    tp53 = '\'TP53'
    results = pd.DataFrame()
    for mut in mutation_files:
        if '_' in mut:
            continue
        cancer_type = util.get_cancer_type(mut)
        print(cancer_type)

        # The first glob match is used; no match raises IndexError.
        pattern = os.path.join(clinical_dir, '*' + cancer_type + '*')
        clinical = glob.glob(pattern)[0]
        clinical_data = pd.read_csv(clinical, index_col=0)
        mutation = mutation_base.prep_mutation_data(
            os.path.join(mutation_dir, mut), clinical_data)

        data = mutation[[tp53]].join(clinical_data, how='inner')
        print(data)

        wt_as = data[data[tp53] == 0]['breaks']
        mut_as = data[data[tp53] != 0]['breaks']
        wt_q = wt_as.quantile([0.10, 0.25, 0.50, 0.75, 0.90])
        mut_q = mut_as.quantile([0.10, 0.25, 0.50, 0.75, 0.90])
        statistic, p = stats.mannwhitneyu(wt_as, mut_as)

        wt_q['cancer_type'] = cancer_type
        wt_q['mut?'] = 'wt'
        mut_q['cancer_type'] = cancer_type
        mut_q['mut?'] = 'mut'
        # The p-value is recorded on the wild-type row only.
        wt_q['mann-whitney-p'] = p

        for row in (wt_q, mut_q):
            results = results.append(row)

    results = results.set_index(['cancer_type', 'mut?'])
    results.to_csv(os.path.join(outdir, 'breaks_and_p53_quantiles.csv'))
def make_zscores(copy_number, clinical_data, tumor_stage_data_dir, outdir):
    """Write multivariate Cox z-scores (gene + tumor stage) for one cancer type.

    Args:
        copy_number: path to a copy number CSV with a 'Symbol' column.
        clinical_data: clinical DataFrame joined on the patient index.
        tumor_stage_data_dir: directory passed to
            `tumor_stage.prep_tumor_stage_data`.
        outdir: directory for `<cancer type>_extra_clinical_zscores.csv`.

    Returns early, writing nothing, when no tumor stage data is available.
    Only columns with more than 10 non-null values are analysed.
    """
    cancer_type = util.get_cancer_type(copy_number)
    df_by_patient = pd.read_csv(copy_number).transpose()
    df_by_patient.columns = df_by_patient.loc['Symbol']
    clinical_and_cnv = df_by_patient.join(clinical_data, how='inner')

    tumor_stage_data, tumor_stage_cols = tumor_stage.prep_tumor_stage_data(
        tumor_stage_data_dir, cancer_type)
    if tumor_stage_data is None:
        return

    merged = clinical_and_cnv.join(tumor_stage_data[tumor_stage_cols],
                                   how='inner')

    outfile = os.path.join(outdir, cancer_type + '_extra_clinical_zscores.csv')
    header, formatstring = tumor_stage.tumor_stage_output_header_and_format(
        4, tumor_stage_cols)
    metadata = ['time', 'censor'] + tumor_stage_cols

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients')
        out.write(header)
        out.write('\n')
        for gene in merged:
            if gene in metadata:
                continue
            if merged[gene].count() <= 10:
                continue
            cox_dict = analysis.do_multivariate_cox(
                merged.time, merged.censor, merged[gene],
                merged[tumor_stage_cols])
            group_zscores = tumor_stage.zscores_for_tumor_stage_cols(
                cox_dict, tumor_stage_cols)
            out.write(formatstring.format(gene,
                                          cox_dict['var-z'],
                                          cox_dict['var-p'],
                                          cox_dict['var-n'],
                                          *group_zscores))
def main(argv=None):
    """Compute per-gene hotspot Cox z-scores for every file in the input dir.

    Options come from `get_options()`. The requested positions are grouped
    by gene, flattening each gene's position lists into one list. The
    per-file results of `calculate_cox_for_cancer_type` are concatenated
    column-wise and written to 'scratch/zscores_by_gene_hotspot.csv', a
    path relative to the working directory.

    Change from the original: '.DS_Store' and 'HG36_HG37' are filtered out
    of the listing rather than removed with `list.remove`, which raised
    ValueError whenever either entry was absent.
    """
    if argv is None:
        argv = sys.argv
    infile, indir, outdir = get_options()
    requested_data = read_requested_data(infile)
    requested_data = requested_data.groupby('gene')['positions'].apply(
        lambda l: [item for sublist in l for item in sublist])

    not_inputs = ('.DS_Store', 'HG36_HG37')
    files = [f for f in os.listdir(indir) if f not in not_inputs]

    output_data = []
    for f in files:
        cancer_type = util.get_cancer_type(f)
        print(cancer_type)
        zscores = calculate_cox_for_cancer_type(
            requested_data, os.path.join(indir, f), outdir)
        output_data.append(zscores)

    df = pd.concat(output_data, axis=1)
    df.to_csv('scratch/zscores_by_gene_hotspot.csv')
def main(argv=None):
    """Collect trichotomized copy number z-scores across all cancer types.

    Options come from `get_options()`; `argv` is accepted but not read.
    The rows returned by `make_cn_zscores` for every copy number file are
    gathered into one table indexed by ('cancer_type', 'gene') and written
    to `<outdir>/trichotomized_copy_number_zscores.csv`.
    """
    cnv_dir, clinical, interesting_genes_file, outdir = get_options()
    cnv_names = util.remove_extraneous_files(os.listdir(cnv_dir))
    cnv_files = [os.path.join(cnv_dir, name) for name in cnv_names]
    interesting_genes = pd.read_csv(interesting_genes_file, index_col=0,
                                    header=None)

    results = []
    for cnv in cnv_files:
        cancer_type = util.get_cancer_type(cnv)
        # The first glob match is used; no match raises IndexError.
        pattern = os.path.join(clinical, '*' + cancer_type + '*')
        clinical_file = glob.glob(pattern)[0]
        results.extend(
            make_cn_zscores(cnv, clinical_file, interesting_genes, outdir))

    results_df = pd.DataFrame(results).set_index(['cancer_type', 'gene'])
    results_df.to_csv(
        os.path.join(outdir, 'trichotomized_copy_number_zscores.csv'))
def main(argv=None):
    """Run `make_zscores` with purity data as the extra clinical column.

    Options come from `get_options()`. The extra data for 'COADREAD' is
    loaded under the name 'COAD'; every other cancer type uses its own
    name. The extra column is always 'Purity_InfiniumPurify'.
    """
    if argv is None:
        argv = sys.argv
    input_directory, clinical, outdir, extra_data_dir = get_options()
    clinical_files = util.remove_extraneous_files(os.listdir(clinical))
    extra_data_col = 'Purity_InfiniumPurify'

    # NOTE(review): the slice skips the first three clinical files. This
    # may be a leftover from resuming an interrupted run -- confirm it is
    # intended.
    for c in clinical_files[3:]:
        cancer_type = util.get_cancer_type(c)
        print(cancer_type)

        extra_type = 'COAD' if cancer_type == 'COADREAD' else cancer_type
        extra_data = prep_extra_data(extra_data_dir, extra_type)

        clinical_data = util.get_clinical_data(os.path.join(clinical, c))
        # The first glob match is used; no match raises IndexError.
        pattern = os.path.join(input_directory, cancer_type + '*.csv')
        copy_number = glob.glob(pattern)[0]

        make_zscores(copy_number, clinical_data, outdir, extra_data,
                     extra_data_col)
def make_zscores(copy_number, clinical_data, outdir, extra_clinical_rows=None):
    """Write Cox z-scores per gene with one extra clinical covariate.

    Args:
        copy_number: path to a copy number CSV with a 'Symbol' column.
        clinical_data: clinical DataFrame joined on the patient index.
        outdir: directory for `<cancer type>_extra_clinical_zscores.csv`.
        extra_clinical_rows: sequence whose first element names the
            clinical column used as the covariate. It is indexed
            unconditionally, so the `None` default fails as soon as a
            column qualifies for analysis.

    Only 'time' and 'censor' are skipped as metadata, and only columns
    with more than 10 non-null values are analysed.
    """
    df_by_patient = pd.read_csv(copy_number).transpose()
    df_by_patient.columns = df_by_patient.loc['Symbol']
    clinical_and_cnv = df_by_patient.join(clinical_data, how='inner')

    cancer_type = util.get_cancer_type(copy_number)
    outfile = os.path.join(outdir, cancer_type + '_extra_clinical_zscores.csv')
    row_format = '{0}, {1}, {2}, {3}, {4}, {5}\n'

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,clinical-row-zscore,clinical-row-pvalue,num patients\n')
        for gene in clinical_and_cnv:
            if gene in ('time', 'censor'):
                continue
            if clinical_and_cnv[gene].count() <= 10:
                continue
            # The covariate is renamed so the model reports it under the
            # 'metagene-*' keys read below.
            covariate = clinical_and_cnv[extra_clinical_rows[0]].rename('metagene')
            cox_dict = analysis.do_metagene_cox(clinical_and_cnv.time,
                                                clinical_and_cnv.censor,
                                                clinical_and_cnv[gene],
                                                covariate)
            out.write(row_format.format(gene,
                                        cox_dict['z'],
                                        cox_dict['p'],
                                        cox_dict['metagene-z'],
                                        cox_dict['metagene-p'],
                                        cox_dict['n']))
def make_cnv_zscores(copy_number, clinical, gene_list):
    """Run a univariate Cox model for each listed gene's copy number.

    Args:
        copy_number: path to a copy number CSV with a 'Symbol' column.
        clinical: path passed to `util.get_clinical_data`.
        gene_list: column labels (gene symbols) to keep.

    Returns:
        DataFrame with one row per gene, built from the dict returned by
        `analysis.do_cox` plus 'cancer_type' and 'gene' entries.
    """
    cancer_type = util.get_cancer_type(copy_number)

    cna_by_patient = pd.read_csv(copy_number).transpose()
    cna_by_patient.columns = cna_by_patient.loc['Symbol']
    selected = cna_by_patient[gene_list]

    clinical_data = util.get_clinical_data(clinical)
    clinical_and_cnv = selected.join(clinical_data, how='inner')

    results = pd.DataFrame()
    for gene in clinical_and_cnv:
        if gene in ['time', 'censor']:
            continue
        cox_dict = analysis.do_cox(clinical_and_cnv.time,
                                   clinical_and_cnv.censor,
                                   clinical_and_cnv[gene])
        cox_dict['cancer_type'] = cancer_type
        cox_dict['gene'] = gene
        results = results.append(cox_dict, ignore_index=True)
    return results