def make_zscores(mutation, clinical, breaks, outdir):
    """Fit one Cox model per mutated gene, with ``breaks`` as the covariate.

    The mutation table from ``mutation_base.prep_mutation_data`` is
    inner-joined with the table read from ``breaks``. For every gene column
    whose sum is at least ``MUTATION_PERCENT * num_patients``,
    ``analysis.do_multivariate_cox`` is called with the ``time`` and
    ``censor`` columns, the gene column, and the ``breaks`` column as the
    extra covariate. The returned dicts are written, one row per gene, to
    ``<outdir>/<cancer_type>_mut_cox.csv``.

    Args:
        mutation: Mutation input passed to
            ``mutation_base.prep_mutation_data`` and
            ``util.get_cancer_type`` (presumably a file path -- confirm).
        clinical: Clinical input passed to ``util.get_clinical_data``.
        breaks: CSV whose first column is the index; all values are cast to
            int. The joined table must contain a ``breaks`` column
            (presumably supplied by this file -- confirm).
        outdir: Directory the result CSV is written to.
    """
    clinical_data = util.get_clinical_data(clinical)
    mut = mutation_base.prep_mutation_data(mutation, clinical_data)
    cancer_type = util.get_cancer_type(mutation)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(cancer_type)

    structural_breaks = pd.read_csv(breaks, index_col=0)
    structural_breaks = structural_breaks.astype(int)

    mut_and_breaks = mut.join(structural_breaks, how='inner')
    num_patients = len(mut_and_breaks)

    results = []
    for gene in mut_and_breaks:
        if gene in ('time', 'censor', 'breaks'):  # skip metadata
            continue
        num_mutations = mut_and_breaks[gene].sum()
        if num_mutations >= MUTATION_PERCENT * num_patients:
            cox_dict = analysis.do_multivariate_cox(
                mut_and_breaks.time,
                mut_and_breaks.censor,
                mut_and_breaks[gene],
                mut_and_breaks[['breaks']])
            cox_dict['gene'] = gene
            results.append(cox_dict)

    if results:
        results_df = pd.DataFrame(results).set_index('gene')
    else:
        # pd.DataFrame([]) has no 'gene' column, so set_index('gene') would
        # raise KeyError when no gene reaches the threshold. Emit a
        # header-only CSV instead.
        results_df = pd.DataFrame(columns=['gene']).set_index('gene')
    results_df.to_csv(os.path.join(outdir, cancer_type + '_mut_cox.csv'))
def make_cn_zscores(copy_number, breaks, interesting_genes=None, outdir='.'):
    """Fit one Cox model per copy-number column, with ``breaks`` as covariate.

    ``copy_number`` and ``breaks`` are both read as CSVs whose first column
    is the index. The copy-number table is transposed and inner-joined with
    the ``breaks`` table.

    If ``interesting_genes`` is given, no model is fitted. The rows whose
    ``'Cancer Type'`` equals the detected cancer type are selected, each
    ``'Gene'`` value is prefixed with a single quote, and those columns plus
    ``breaks``, ``time`` and ``censor`` are written to
    ``<outdir>/<cancer_type>_interesting_genes_data.csv``.

    Otherwise ``analysis.do_multivariate_cox`` is called for every column
    (other than ``time``, ``censor`` and ``breaks``) that has more than 10
    non-null values, and the results are written to
    ``<outdir>/<cancer_type>_cn_cox.csv``.

    Args:
        copy_number: Path of the copy-number CSV; also passed to
            ``util.get_cancer_type``.
        breaks: Path of the CSV joined onto the copy-number data. The code
            reads ``time``, ``censor`` and ``breaks`` columns from the
            joined table (presumably all supplied by this file -- confirm).
        interesting_genes: Optional table with ``'Cancer Type'`` and
            ``'Gene'`` columns; switches the function to data-export mode.
        outdir: Directory the output CSV is written to.
    """
    clinical_data = pd.read_csv(breaks, index_col=0)
    cnv = pd.read_csv(copy_number, index_col=0)
    cnv_by_patient = cnv.transpose()

    clinical_and_cnv = cnv_by_patient.join(clinical_data, how='inner')

    cancer_type = util.get_cancer_type(copy_number)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(cancer_type)

    if interesting_genes is not None:
        # NOTE(review): the source lost its indentation; the early return is
        # assumed to belong to this branch, otherwise everything below it
        # would be unreachable.
        for_this_cancer = interesting_genes['Cancer Type'] == cancer_type
        relevant_genes = (
            '\'' + interesting_genes[for_this_cancer]['Gene']).values
        relevant_genes = list(relevant_genes) + ['breaks', 'time', 'censor']
        print(relevant_genes)
        clinical_and_cnv = clinical_and_cnv[relevant_genes]
        clinical_and_cnv.to_csv(
            os.path.join(outdir, cancer_type + '_interesting_genes_data.csv'))
        return

    results = []
    for gene in clinical_and_cnv:
        # 'breaks' is the covariate passed to every model below, so it is
        # metadata just like 'time' and 'censor'; modelling it against
        # itself would add a meaningless row to the output.
        if gene in ('time', 'censor', 'breaks'):
            continue
        if clinical_and_cnv[gene].count() > 10:
            cox_dict = analysis.do_multivariate_cox(
                clinical_and_cnv.time,
                clinical_and_cnv.censor,
                clinical_and_cnv[gene],
                clinical_and_cnv[['breaks']])
            cox_dict['gene'] = gene
            results.append(cox_dict)

    if results:
        results_df = pd.DataFrame(results).set_index('gene')
    else:
        # pd.DataFrame([]) has no 'gene' column, so set_index('gene') would
        # raise KeyError when no column has enough data. Emit a header-only
        # CSV instead.
        results_df = pd.DataFrame(columns=['gene']).set_index('gene')
    results_df.to_csv(os.path.join(outdir, cancer_type + '_cn_cox.csv'))
def do_cox_models(clinical, cn_file, outdir):
    """Fit the MYC copy-number Cox model(s) and write them to a CSV.

    ``cn_file`` is read as a tab-separated table with its first column as
    the index. The table is transposed, the ``Entrez_Gene_Id`` row is
    dropped, and the ``MYC`` column is inner-joined with ``clinical``. For
    each entry of ``analyses``, ``analysis.do_multivariate_cox`` is called
    with ``data.Time``, ``data.Censor``, the MYC column and the listed
    covariate columns. The collected results are transposed and written to
    ``<outdir>/breast_analysis.csv``.

    Args:
        clinical: Table joined onto the copy-number data; must provide
            ``Time``, ``Censor`` and the covariate columns named below.
        cn_file: Path of the tab-separated copy-number file.
        outdir: Directory the result CSV is written to.
    """
    cn = pd.read_csv(cn_file, sep='\t', index_col=0)
    cn_by_patient = cn.transpose()
    cn_by_patient = cn_by_patient.drop(['Entrez_Gene_Id'])
    cn = cn_by_patient[['MYC']]

    data = cn.join(clinical, how='inner')

    # Analysis label -> covariate columns passed alongside the gene column.
    # `age_r` is a name defined elsewhere in the module.
    analyses = {
        'CNA only': [age_r, 'her2_0', 'combined_er_pr', 'stage_0', 'stage_1'],
    }

    # NOTE(review): DataFrame.append was removed in pandas 2.0; collect the
    # dicts in a list instead if pandas is upgraded.
    results = pd.DataFrame()
    for g in cn:
        # .items() works on Python 2 and 3 (iteritems() is Python 2 only).
        for name, a in analyses.items():
            cox_dict = analysis.do_multivariate_cox(
                data.Time, data.Censor, data[g], data[a], float_vars=True)
            cox_dict['gene'] = name + ' ' + g
            results = results.append(cox_dict, ignore_index=True)

    results = results.set_index('gene')
    results.T.to_csv(os.path.join(outdir, 'breast_analysis.csv'))
def do_cox_models(clinical, cn_file, mut_file, outdir):
    """Fit MYC copy-number and TP53 mutation Cox models and write a CSV.

    ``cn_file`` is read as a CSV and transposed. The ``Chromosome`` and
    ``Location`` rows are dropped and the ``Symbol`` row becomes the column
    labels. The ``'MYC`` column is joined with ``clinical`` and with the
    ``'TP53`` mutation column (renamed ``TP53``) from
    ``mutation_base.prep_mutation_data``. One model per entry of
    ``analyses`` is fitted for MYC, then one model with TP53 as the variable
    and the 'CNA only' covariates. The collected results are transposed and
    written to ``<outdir>/breast_analysis.csv``.

    Args:
        clinical: Table joined onto the copy-number data and also passed to
            ``mutation_base.prep_mutation_data``; must provide ``time``,
            ``censor`` and the covariate columns named below.
        cn_file: Path of the copy-number CSV.
        mut_file: Mutation input for ``mutation_base.prep_mutation_data``.
        outdir: Directory the result CSV is written to.
    """
    cn = pd.read_csv(cn_file)
    cn_by_patient = cn.transpose()
    cn_by_patient = cn_by_patient.drop(['Chromosome', 'Location'])
    cn_by_patient.columns = cn_by_patient.loc['Symbol']
    cn = cn_by_patient[['\'MYC']]

    mut = mutation_base.prep_mutation_data(mut_file, clinical)
    p53_mut = mut[['\'TP53']]
    p53_mut.columns = ['TP53']

    data = cn.join(clinical, how='inner')
    data = data.join(p53_mut, how='inner')

    # Analysis label -> covariate columns passed alongside the gene column.
    # `age_r` is a name defined elsewhere in the module.
    analyses = {
        'CNA only': [age_r, 'her2_0', 'combined_er_pr', 'stage_0', 'stage_1'],
        'CNA + P53': ['TP53', age_r, 'her2_0', 'combined_er_pr', 'stage_0',
                      'stage_1'],
    }

    # NOTE(review): DataFrame.append was removed in pandas 2.0; collect the
    # dicts in a list instead if pandas is upgraded.
    results = pd.DataFrame()
    for g in cn:
        # .items() works on Python 2 and 3 (iteritems() is Python 2 only).
        for name, a in analyses.items():
            cox_dict = analysis.do_multivariate_cox(
                data.time, data.censor, data[g], data[a], float_vars=True)
            cox_dict['gene'] = name + ' ' + g
            results = results.append(cox_dict, ignore_index=True)

    # NOTE(review): the source lost its indentation; this TP53-only model is
    # assumed to run once, after the loops above.
    cox_dict = analysis.do_multivariate_cox(
        data.time, data.censor, data['TP53'], data[analyses['CNA only']],
        float_vars=True)
    cox_dict['gene'] = 'TP53 mut'
    results = results.append(cox_dict, ignore_index=True)

    results = results.set_index('gene')
    results.T.to_csv(os.path.join(outdir, 'breast_analysis.csv'))
def calculate_cox(data, gene):
    """Run the Cox model for one column, adjusting for ``TP53_mutation``.

    Calls ``analysis.do_multivariate_cox`` with ``data.time``,
    ``data.censor``, ``data[gene]`` and the ``TP53_mutation`` column as the
    extra covariate (``float_vars=True``).

    Args:
        data: Table providing ``time``, ``censor``, ``TP53_mutation`` and
            the ``gene`` column.
        gene: Label of the column to model.

    Returns:
        The value returned by ``analysis.do_multivariate_cox``, or an empty
        dict when it raises ``rpy2.rinterface.RRuntimeError`` (a warning
        line is printed in that case).
    """
    try:
        fitted = analysis.do_multivariate_cox(
            data.time,
            data.censor,
            data[gene],
            data[['TP53_mutation']],
            float_vars=True)
    except rpy2.rinterface.RRuntimeError:
        # Best effort: report the failing column and let the caller go on.
        print('WARN: skipped %s due to R error' % (gene,))
        return {}
    return fitted
def calculate_cox(mutation, clinical_data, tumor_stage_file, outdir):
    """Fit a per-gene Cox model with tumor-stage group columns as covariates.

    The table from ``prep_data`` is inner-joined with ``clinical_data`` and
    with every column of ``tumor_stage_file`` whose name contains
    ``'group'``. For each remaining column whose sum is at least
    ``MUTATION_PERCENT * num_patients``, ``analysis.do_multivariate_cox`` is
    run on a frame holding ``time``, ``censor``, the gene column (missing
    values filled with 0) and the group columns.

    One line per gene is written to
    ``<outdir>/<cancer_type>_mutation-fraction-<MUTATION_PERCENT>.zscores.out.csv``.
    The per-gene input frame is also saved as ``<outdir>/<gene[1:]>_data.csv``.

    Args:
        mutation: Path of the mutation file; the cancer type is the part of
            its basename before the first ``_`` or ``.``.
        clinical_data: Table passed to ``prep_data`` and joined onto its
            result; the joined table must provide ``time`` and ``censor``.
        tumor_stage_file: CSV whose first column is the index.
        outdir: Directory all output files are written to.
    """
    merged = prep_data(mutation, clinical_data)
    merged = merged.join(clinical_data, how='inner')

    stage_table = pd.read_csv(tumor_stage_file, index_col=0)
    tumor_stage_cols = [col for col in stage_table if 'group' in col]
    merged = merged.join(stage_table[tumor_stage_cols], how='inner')
    num_patients = len(merged.index)

    # Output file name is derived from the mutation file's basename.
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    outfile_name = (cancer_type + '_mutation-fraction-' +
                    str(MUTATION_PERCENT) + '.zscores.out.csv')
    outfile = os.path.join(outdir, outfile_name)
    header, formatstring = tumor_stage.tumor_stage_output_header_and_format(
        5, tumor_stage_cols)

    non_gene_cols = ['time', 'censor'] + tumor_stage_cols
    min_mutations = MUTATION_PERCENT * num_patients
    per_gene_csv_cols = ['time', 'censor', 'mutated'] + tumor_stage_cols

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients')
        out.write(header)
        out.write('\n')

        for gene in merged:
            if gene in non_gene_cols:
                continue
            num_mutations = merged[gene].sum()
            # Written as `not >=` so that a NaN sum is skipped as well.
            if not (num_mutations >= min_mutations):
                continue

            analysis_data = pd.DataFrame()
            for col in ('time', 'censor'):
                analysis_data[col] = merged[col]
            analysis_data['mutated'] = merged[gene].fillna(0)
            analysis_data[tumor_stage_cols] = merged[tumor_stage_cols]

            cox_dict = analysis.do_multivariate_cox(
                analysis_data['time'],
                analysis_data['censor'],
                analysis_data['mutated'],
                analysis_data[tumor_stage_cols])
            tumor_stage_zscores = tumor_stage.zscores_for_tumor_stage_cols(
                cox_dict, tumor_stage_cols)

            row = formatstring.format(
                gene, cox_dict['var-z'], cox_dict['var-p'], num_mutations,
                cox_dict['var-n'], *tumor_stage_zscores)
            out.write(row)

            # gene[1:] drops the first character of the column label for
            # the per-gene file name.
            analysis_data.to_csv(
                os.path.join(outdir, gene[1:] + '_data.csv'),
                columns=per_gene_csv_cols)
def make_zscores(copy_number, clinical_data, tumor_stage_data_dir, outdir):
    """Fit a per-gene copy-number Cox model with tumor-stage covariates.

    ``copy_number`` is read as a CSV and transposed, and its ``Symbol`` row
    becomes the column labels. The result is inner-joined with
    ``clinical_data`` and with the tumor-stage columns returned by
    ``tumor_stage.prep_tumor_stage_data``. The function returns without
    writing anything when that helper returns ``None`` as the data.

    For every column other than ``time``, ``censor`` and the tumor-stage
    columns that has more than 10 non-null values,
    ``analysis.do_multivariate_cox`` is run and one line is written to
    ``<outdir>/<cancer_type>_extra_clinical_zscores.csv``.

    Args:
        copy_number: Path of the copy-number CSV; also passed to
            ``util.get_cancer_type``.
        clinical_data: Table joined onto the copy-number data; the joined
            table must provide ``time`` and ``censor``.
        tumor_stage_data_dir: Passed to ``tumor_stage.prep_tumor_stage_data``.
        outdir: Directory the result CSV is written to.
    """
    cancer_type = util.get_cancer_type(copy_number)

    raw_cnv = pd.read_csv(copy_number)
    cnv_by_patient = raw_cnv.transpose()
    cnv_by_patient.columns = cnv_by_patient.loc['Symbol']
    clinical_and_cnv = cnv_by_patient.join(clinical_data, how='inner')

    tumor_stage_data, tumor_stage_cols = tumor_stage.prep_tumor_stage_data(
        tumor_stage_data_dir, cancer_type)
    if tumor_stage_data is None:
        return

    full_table = clinical_and_cnv.join(
        tumor_stage_data[tumor_stage_cols], how='inner')

    outfile = os.path.join(outdir, cancer_type + '_extra_clinical_zscores.csv')
    header, formatstring = tumor_stage.tumor_stage_output_header_and_format(
        4, tumor_stage_cols)

    metadata_cols = ['time', 'censor'] + tumor_stage_cols

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients')
        out.write(header)
        out.write('\n')

        for gene in full_table:
            if gene in metadata_cols:
                continue
            # Only model columns with more than 10 non-null values.
            if full_table[gene].count() <= 10:
                continue

            cox_dict = analysis.do_multivariate_cox(
                full_table.time,
                full_table.censor,
                full_table[gene],
                full_table[tumor_stage_cols])
            group_zscores = tumor_stage.zscores_for_tumor_stage_cols(
                cox_dict, tumor_stage_cols)

            row = formatstring.format(
                gene, cox_dict['var-z'], cox_dict['var-p'], cox_dict['var-n'],
                *group_zscores)
            out.write(row)