def main():
    """Run a tumor-purity Cox regression and KM split for every clinical file.

    Reads the clinical directory, output directory and header file from
    ``get_options()``. The header file is loaded without a header row and
    with its first column as the index; the value in the column labelled
    ``1`` of the row for a cancer type names the purity column of that
    cancer type's clinical data.

    For each clinical file the Cox result dictionary from
    ``analysis.do_cox`` is collected under the cancer type, and
    ``analysis.do_km`` is called with patients split at the mean purity
    (0 when purity <= mean, 1 otherwise). The collected Cox results are
    written, one row per cancer type, to ``simple_purity_zscores.csv`` in
    the output directory.
    """
    clinical_dir, output_dir, header_file = get_options()
    headers = pd.read_csv(header_file, index_col=0, header=None)
    clinical_files = os.listdir(clinical_dir)
    clinical_files = util.remove_extraneous_files(clinical_files)

    zscore_data = {}
    for f in clinical_files:
        clinical_path = os.path.join(clinical_dir, f)
        cancer_type = util.get_cancer_type(f)
        # DataFrame.get_value was removed in pandas 1.0; .at is the
        # equivalent label-based scalar accessor and exists in old
        # releases too.
        purity_header = headers.at[cancer_type, 1]
        clinical = util.get_clinical_data(
            clinical_path, extra_rows=[purity_header])

        purity = clinical[purity_header]
        cox_dict = analysis.do_cox(clinical.time, clinical.censor, purity)
        zscore_data[cancer_type] = cox_dict

        purity_mean = purity.mean()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(purity_mean)

        # A comparison against NaN is False, so a patient with no purity
        # value would silently land in group 1. Leave such rows out of the
        # Kaplan-Meier split instead of mislabelling them.
        km_data = clinical[purity.notnull()].copy()
        km_data['km_split'] = np.where(
            km_data[purity_header] <= purity_mean, 0, 1)
        analysis.do_km(cancer_type, km_data.time, km_data.censor,
                       km_data.km_split, output_dir)

    out_df = pd.DataFrame(zscore_data).transpose()
    out_df.to_csv(os.path.join(output_dir, 'simple_purity_zscores.csv'))
def main():
    """Run an InfiniumPurify-purity Cox regression and KM split per cancer type.

    Reads the clinical directory, output directory and extra-data directory
    from ``get_options()``. For each clinical file the matching extra data
    (from ``prep_extra_data``) is joined onto the clinical data, and the
    ``Purity_InfiniumPurify`` column is used as the covariate.

    The Cox result dictionary from ``analysis.do_cox`` is collected under
    the cancer type, and ``analysis.do_km`` is called with patients split
    at the mean purity (0 when purity <= mean, 1 otherwise). The collected
    Cox results are written, one row per cancer type, to
    ``add_data_simple_purity_zscores.csv`` in the output directory.
    """
    clinical_dir, output_dir, extra_data_dir = get_options()
    clinical_files = os.listdir(clinical_dir)
    clinical_files = util.remove_extraneous_files(clinical_files)

    purity_header = 'Purity_InfiniumPurify'
    zscore_data = {}
    for f in clinical_files:
        clinical_path = os.path.join(clinical_dir, f)
        cancer_type = util.get_cancer_type(f)

        # COADREAD clinical files are paired with the COAD extra data;
        # every other cancer type uses its own name.
        extra_key = 'COAD' if cancer_type == 'COADREAD' else cancer_type
        extra_data = prep_extra_data(extra_data_dir, extra_key)

        clinical = util.get_clinical_data(clinical_path)
        clinical = clinical.join(extra_data)

        purity = clinical[purity_header]
        cox_dict = analysis.do_cox(clinical.time, clinical.censor, purity)
        zscore_data[cancer_type] = cox_dict

        purity_mean = purity.mean()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(purity_mean)

        # The join above is a left join, so patients absent from the extra
        # data have NaN purity. A comparison against NaN is False, which
        # would silently put them in group 1; leave them out of the
        # Kaplan-Meier split instead.
        km_data = clinical[purity.notnull()].copy()
        km_data['km_split'] = np.where(
            km_data[purity_header] <= purity_mean, 0, 1)
        analysis.do_km(cancer_type, km_data.time, km_data.censor,
                       km_data.km_split, output_dir)

    out_df = pd.DataFrame(zscore_data).transpose()
    out_df.to_csv(
        os.path.join(output_dir, 'add_data_simple_purity_zscores.csv'))
def calculate_cox(mutation, clinical_data, key, outdir):
    """Run per-gene Cox and Kaplan-Meier analyses on above-median-VAF mutations.

    ``prep_data(mutation, clinical_data, key)`` supplies the mutation frame
    (grouped here by its ``Hugo_Symbol`` index level), the clinical data
    restricted to sequenced patients, and the patient count.

    A gene is analysed when the number of distinct patients carrying a
    non-silent mutation in it is at least ``MUTATION_PERCENT * num_patients``.
    Within such a gene, a patient counts as mutated (1) only if one of
    their non-silent mutations has a VAF at or above the gene's median VAF;
    every other sequenced patient is 0.

    Args:
        mutation: Path of the mutation file; the cancer type is the part of
            its basename before the first ``_`` or ``.``.
        clinical_data: Passed through to ``prep_data``.
        key: Passed through to ``prep_data``.
        outdir: Directory receiving the z-score summary CSV, the per-gene
            ``<cancer>_<gene>_data.csv`` files, and whatever
            ``analysis.do_km`` writes.
    """
    df, clinical_data_with_sequenced_patients, num_patients = prep_data(
        mutation, clinical_data, key)

    # prep output file
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(cancer_type)
    outfile = os.path.join(
        outdir,
        (cancer_type + '_mutation-fraction-' + str(MUTATION_PERCENT) +
         '_vaf_cutoff-' + str(VARIANT_ALLELE_FREQ_CUTOFF) +
         '.zscores.out.csv'))
    # Each row starts with a literal apostrophe before the gene name.
    # NOTE(review): presumably to stop spreadsheet tools reinterpreting
    # gene symbols -- confirm before changing.
    formatstring = '\'{0}, {1}, {2}, {3}, {4}\n'

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients\n')

        # for every gene, collect the clinical data with the mutation data.
        # only for non-silent mutations
        patients_with_gene = df.groupby(level=u'Hugo_Symbol')
        for gene, gene_df in patients_with_gene:
            # Remove silent mutations
            non_silent = gene_df.where(
                gene_df[u'Variant_Classification'] != 'Silent')
            non_silent = non_silent.dropna(subset=[u'Variant_Classification'])
            mutated_patient_list = non_silent.index.get_level_values(
                'identifier').unique()

            num_mutations = len(mutated_patient_list)
            if num_mutations < MUTATION_PERCENT * num_patients:
                continue

            # Get "effectively mutated" patients: those whose VAF >= median
            median_vaf = non_silent['VAF'].median()
            greater_than_median = non_silent[non_silent['VAF'] >= median_vaf]
            effectively_mutated_patients = (
                greater_than_median.index.get_level_values(
                    'identifier').unique())
            num_effective_mutations = len(effectively_mutated_patients)

            # take the patients with mutations and without, and build an
            # analysis dataframe with time and censor.
            analysis_data = pd.DataFrame(
                {'mutated': np.ones(num_effective_mutations)},
                index=effectively_mutated_patients)
            analysis_data = analysis_data.join(
                clinical_data_with_sequenced_patients, how='right')
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the column selection: under pandas
            # Copy-on-Write the chained in-place form does not update the
            # frame, leaving unmutated patients as NaN.
            analysis_data['mutated'] = analysis_data['mutated'].fillna(0)

            # Do analysis!
            # Two spaces after "for" reproduce the Python 2 print-statement
            # output of the original message.
            print('Doing analysis for  {0} {1}'.format(gene, num_mutations))
            time = analysis_data['time']
            censor = analysis_data['censor']
            split = analysis_data['mutated']

            name = cancer_type + '_' + gene
            analysis.do_km(name, time, censor, split, outdir)
            cox_dict = analysis.do_cox(time, censor, split)
            # Flag when the Cox fit reports a different patient count than
            # the number of rows handed to it.
            if cox_dict['n'] != len(analysis_data['time']):
                print('ERROR')
            out.write(
                formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                    num_mutations, cox_dict['n']))
            analysis_data.to_csv(os.path.join(outdir, name + '_data.csv'),
                                 columns=['time', 'censor', 'mutated'])
def calculate_cox(mutation, clinical, outdir, metagene_file=None,
                  make_km=False):
    """Run a per-gene Cox regression on mutation status, optionally with a metagene.

    Clinical data from ``util.get_clinical_data(clinical)`` is inner-joined
    with the frame from ``mutation_base.prep_mutation_data``. Every column
    of the joined frame other than ``time`` and ``censor`` is treated as a
    gene, and a gene is analysed when its column sum is at least
    ``MUTATION_PERCENT`` times the number of joined rows.

    Args:
        mutation: Path of the mutation file; the cancer type is the part of
            its basename before the first ``_`` or ``.``.
        clinical: Passed to ``util.get_clinical_data``.
        outdir: Directory receiving the z-score CSV and, with ``make_km``,
            the per-gene KM output and data dumps.
        metagene_file: When given, metagene data is loaded and each gene is
            fitted with ``analysis.do_metagene_cox``; the output file name
            and columns change accordingly.
        make_km: In non-metagene mode, also call ``analysis.do_km`` for each
            analysed gene and write ``<cancer>_<gene>_data.csv``.
    """
    clinical_data = util.get_clinical_data(clinical)
    df = mutation_base.prep_mutation_data(mutation, clinical_data)
    clinical_and_data = df.join(clinical_data, how='inner')
    num_patients = len(clinical_and_data)

    # prep output file
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    if metagene_file:
        formatstring = '{0}, {1}, {2}, {3}, {4}, {5}\n'
        outfile = os.path.join(
            outdir, cancer_type + '_mutation-fraction-' +
            str(MUTATION_PERCENT) + '_metagene_zscores.csv')
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Processing metagene...')
        metagene = metagene_lib.get_metagene_data(metagene_file, cancer_type)
        print('Complete')
    else:
        outfile = os.path.join(
            outdir, cancer_type + '_mutation-fraction-' +
            str(MUTATION_PERCENT) + '.zscores.out.csv')
        formatstring = '{0}, {1}, {2}, {3}, {4}\n'

    # Survival columns do not depend on the gene, so look them up once.
    time = clinical_and_data['time']
    censor = clinical_and_data['censor']

    with open(outfile, 'w') as out:
        if metagene_file:
            out.write(
                'gene,zscore,pvalue,metagene-zscore,metagene-pvalue,num patients\n'
            )
        else:
            out.write('gene,zscore,pvalue,num mutations,num patients\n')

        for gene in clinical_and_data:
            if gene in ['time', 'censor']:
                continue

            num_mutations = int(clinical_and_data[gene].sum())
            if num_mutations < MUTATION_PERCENT * num_patients:
                continue

            data = clinical_and_data[gene]

            if metagene_file:
                cox_dict = analysis.do_metagene_cox(
                    time, censor, data, metagene)
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        cox_dict['metagene-z'],
                                        cox_dict['metagene-p'],
                                        cox_dict['n']))
                continue

            if make_km:
                name = cancer_type + '_' + gene
                analysis.do_km(name, time, censor, data, outdir)
                # The original indexed the frame with the tuple
                # ('time', 'censor', gene), which raises KeyError, and then
                # asked to_csv for a 'mutated' column that does not exist.
                # Select the three real columns and alias the gene column
                # to 'mutated' in the written header.
                # NOTE(review): the dump is kept under make_km alongside the
                # KM call; confirm that matches the intended nesting.
                clinical_and_data.to_csv(
                    os.path.join(outdir, name + '_data.csv'),
                    columns=['time', 'censor', gene],
                    header=['time', 'censor', 'mutated'])

            cox_dict = analysis.do_cox(time, censor, data)
            out.write(
                formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                    num_mutations, cox_dict['n']))