def main():
    """Cox regression and KM split on InfiniumPurify purity, per cancer type.

    For every clinical file in the clinical directory, joins the extra data
    for that cancer type, runs ``analysis.do_cox`` on the purity column,
    draws a KM plot split at the mean purity, and finally writes all Cox
    results to ``add_data_simple_purity_zscores.csv`` in the output directory.
    """
    clinical_dir, output_dir, extra_data_dir = get_options()
    purity_header = 'Purity_InfiniumPurify'
    zscore_data = {}

    clinical_files = util.remove_extraneous_files(os.listdir(clinical_dir))
    for filename in clinical_files:
        cancer_type = util.get_cancer_type(filename)
        # COADREAD extra data is looked up under the 'COAD' key.
        extra_key = 'COAD' if cancer_type == 'COADREAD' else cancer_type
        extra_data = prep_extra_data(extra_data_dir, extra_key)

        clinical = util.get_clinical_data(os.path.join(clinical_dir, filename))
        clinical = clinical.join(extra_data)

        zscore_data[cancer_type] = analysis.do_cox(
            clinical.time, clinical.censor, clinical[purity_header])

        purity_mean = clinical[purity_header].mean()
        print(purity_mean)
        # 0 = at or below mean purity, 1 = above mean purity.
        clinical['km_split'] = np.where(
            clinical[purity_header] <= purity_mean, 0, 1)
        analysis.do_km(cancer_type, clinical.time, clinical.censor,
                       clinical.km_split, output_dir)

    out_df = pd.DataFrame(zscore_data).transpose()
    out_df.to_csv(
        os.path.join(output_dir, 'add_data_simple_purity_zscores.csv'))
def make_zscores(copy_number, clinical, gene_list):
    """Run a Cox regression per gene for a restricted list of genes.

    Args:
        copy_number: path to a delimited copy-number file that has a
            ``Hugo_Symbol`` column. The file is transposed and joined on the
            clinical ``PATIENT_ID`` index, so the remaining columns are
            presumably patient identifiers.
        clinical: path to a delimited clinical file with ``PATIENT_ID``,
            ``Time`` and ``Censor`` columns.
        gene_list: gene symbols to keep.

    Returns:
        A list with one ``analysis.do_cox`` result per gene that has more
        than 10 non-null values, each extended with a ``'gene'`` key.
    """
    clinical_data = pd.read_csv(clinical,
                                sep=util.get_sep_from_filename(clinical))
    clinical_data = clinical_data.set_index('PATIENT_ID')
    relevant_clinical = clinical_data[[u'Time', u'Censor']].astype(float)
    relevant_clinical = relevant_clinical.dropna()

    df = pd.read_csv(copy_number, sep=util.get_sep_from_filename(copy_number))
    df = df.drop_duplicates(subset=['Hugo_Symbol'], keep='first')
    df = df.dropna(subset=['Hugo_Symbol'])
    df_by_patient = df.transpose()
    df_by_patient.columns = df_by_patient.loc['Hugo_Symbol']
    df_by_patient = df_by_patient[gene_list]
    print(df_by_patient)

    # The inner join keeps only rows whose label is a patient with clinical data.
    clinical_and_cnv = df_by_patient.join(relevant_clinical, how='inner')

    results = []
    for gene in clinical_and_cnv:
        if gene in ('Time', 'Censor'):  # skip metadata
            continue
        if clinical_and_cnv[gene].count() <= 10:
            continue
        cox_dict = analysis.do_cox(clinical_and_cnv.Time,
                                   clinical_and_cnv.Censor,
                                   clinical_and_cnv[gene],
                                   float_time=True)
        cox_dict['gene'] = gene
        results.append(cox_dict)
    return results
def calculate_cox_for_cancer_type(requested_data, mutation_data, outdir):
    """Run a Cox regression for each requested gene/position set.

    Args:
        requested_data: mapping (iterated with ``iteritems``) from a gene key
            to the collection of start positions of interest. The first
            character of each key is stripped to obtain the gene symbol.
        mutation_data: path to the mutation file; the cancer type is derived
            from it with ``util.get_cancer_type``. For COADREAD and OV the
            ``HG36_HG37`` variant of the file is used instead.
        outdir: unused; kept so existing callers keep working.

    Returns:
        DataFrame with one row per analysed request (p, z, mutant count and n,
        each prefixed with the cancer type), indexed by
        ``['gene', 'positions']`` when it is non-empty.
    """
    cancer_type = util.get_cancer_type(mutation_data)
    clinical = os.path.join('.', 'clinical', cancer_type + '.clin.merged.txt')
    clinical_data = util.get_clinical_data(clinical)

    start_pos = None
    if cancer_type in ['COADREAD', 'OV']:
        folder = os.path.dirname(mutation_data)
        mutation_data = os.path.join(folder, 'HG36_HG37',
                                     cancer_type + '_hg36_hg37.txt')
        start_pos = u'hg37_start'

    df, clinical_with_sequenced_patients, _ = zscores_for_mutants.prep_data(
        mutation_data, clinical_data)
    if not start_pos:
        # Locate the start-position column case-insensitively.
        upper_columns = [i.upper() for i in df.columns]
        start_pos = df.columns[upper_columns.index('START_POSITION')]

    patients_with_gene = df.groupby(level=u'Hugo_Symbol')
    gene_groups = patients_with_gene.groups
    min_mutants = MUTATION_PERCENT * clinical_with_sequenced_patients.shape[0]

    output_data = []
    for i, request in requested_data.iteritems():
        gene = i[1:]
        if gene not in gene_groups:
            continue
        patients_with_requested_gene = patients_with_gene.get_group(gene)
        mutated_at_positions = patients_with_requested_gene[start_pos].isin(
            request)
        patients_with_requested_positions = patients_with_requested_gene[
            mutated_at_positions]
        # De-duplicate: a patient with several matching mutations must count
        # once, otherwise the right join below duplicates their clinical row.
        ids_with_requested_positions = (
            patients_with_requested_positions.index.get_level_values(
                'identifier').unique())
        if len(ids_with_requested_positions) < min_mutants:
            continue

        analysis_data = pd.DataFrame(
            {'mutated': np.ones(len(ids_with_requested_positions))},
            index=ids_with_requested_positions)
        analysis_data = analysis_data.join(clinical_with_sequenced_patients,
                                           how='right')
        # Patients absent from the mutated list are the unmutated group.
        analysis_data['mutated'] = analysis_data['mutated'].fillna(0)

        cox_dict = analysis.do_cox(analysis_data['time'],
                                   analysis_data['censor'],
                                   analysis_data['mutated'])
        outdict = {cancer_type + ' p': cox_dict['p']}
        outdict[cancer_type + ' z'] = cox_dict['z']
        outdict[cancer_type + ' mutants'] = len(ids_with_requested_positions)
        outdict[cancer_type + ' n'] = cox_dict['n']
        outdict['gene'] = i
        outdict['positions'] = ':'.join(request)
        output_data.append(outdict)

    outdata = pd.DataFrame(output_data)
    print(outdata)
    if len(outdata):
        outdata = outdata.set_index(['gene', 'positions'])
    return outdata
def calculate_cox(mutation, clinical_data, outdir, univariate_file=None):
    """Cox regression on co-mutation of every pair of commonly mutated genes.

    Args:
        mutation: path to the mutation file; the cancer type is the part of
            its basename before the first ``_`` / ``.``.
        clinical_data: clinical DataFrame with ``time`` and ``censor`` columns.
        outdir: directory receiving the z-score CSV and per-pair data CSVs.
        univariate_file: optional CSV (first column as index, with a
            ``zscore`` column). When given, a pair is skipped if either gene
            has \|z\| > 1.96 on its own, and a different output name is used.
    """
    df = prep_data(mutation, clinical_data)
    df = df.join(clinical_data, how='inner')
    num_patients = len(df.index)
    gene_pairs = itertools.combinations(COMMONLY_MUTATED, 2)

    # prep output file
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    print(cancer_type)
    out_prefix = cancer_type + '_mutation-fraction-' + str(MUTATION_PERCENT)
    outfile = os.path.join(outdir, out_prefix + '.zscores.out.csv')
    if univariate_file:
        univariate_data = pd.read_csv(univariate_file, index_col=0)
        outfile = os.path.join(
            outdir, out_prefix + '.notalonesignificant.zscores.out.csv')

    formatstring = '{0}, {1}, {2}, {3}, {4}\n'
    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients\n')
        for pair in gene_pairs:
            gene_pair = pd.Series(pair)
            gene_pair_str = '-'.join(gene_pair)
            # Both genes must be present as columns.
            if gene_pair.isin(df.columns.values).sum() < 2:
                continue

            paired_mutations = df[list(gene_pair)]
            both_mutated = paired_mutations.sum(axis=1) == 2
            double_mutated_patients = paired_mutations[both_mutated].index
            num_mutations = len(double_mutated_patients)
            print('%s %s' % (gene_pair_str, num_mutations))
            if num_mutations < MUTATION_PERCENT * num_patients:
                continue

            if univariate_file:
                # Skip pairs where either gene is significant for survival
                # on its own.
                significant_position = None
                for position in (0, 1):
                    zscore = univariate_data.loc[gene_pair[position]].zscore
                    if zscore < -1.96 or zscore > 1.96:
                        significant_position = position
                        break
                if significant_position is not None:
                    print('Skipping pair %s for gene %d' %
                          (gene_pair_str, significant_position))
                    continue

            analysis_data = pd.DataFrame()
            analysis_data['time'] = df['time']
            analysis_data['censor'] = df['censor']
            analysis_data['mutated'] = 0
            analysis_data.loc[double_mutated_patients, 'mutated'] = 1

            print('Doing analysis for %s with %s double mutations of %s' %
                  (gene_pair_str, num_mutations, num_patients))
            name = cancer_type + '_' + gene_pair_str.replace('\'', '')
            print(name)
            cox_dict = analysis.do_cox(analysis_data['time'],
                                       analysis_data['censor'],
                                       analysis_data['mutated'])
            out.write(
                formatstring.format(gene_pair_str, cox_dict['z'],
                                    cox_dict['p'], num_mutations,
                                    cox_dict['n']))
            analysis_data.to_csv(os.path.join(outdir, name + '_data.csv'),
                                 columns=['time', 'censor', 'mutated'])
def make_zscores(copy_number, clinical, outdir):
    """Write per-gene Cox z-scores for a copy-number file.

    Args:
        copy_number: path to a delimited copy-number file with a
            ``Hugo_Symbol`` column; the cancer type is derived from this path
            with ``util.get_cancer_type``.
        clinical: path to a delimited clinical file with ``PATIENT_ID``,
            ``Time`` and ``Censor`` columns.
        outdir: directory receiving ``<cancer_type>.cbioportal_zscores.csv``.

    Genes with 10 or fewer non-null values are skipped. Gene names are written
    with a leading single quote.
    """
    clinical_data = pd.read_csv(clinical,
                                sep=util.get_sep_from_filename(clinical))
    clinical_data = clinical_data.set_index('PATIENT_ID')
    relevant_clinical = clinical_data[[u'Time', u'Censor']].astype(float)
    relevant_clinical = relevant_clinical.dropna()

    df = pd.read_csv(copy_number, sep=util.get_sep_from_filename(copy_number))
    df = df.drop_duplicates(subset=['Hugo_Symbol'], keep='first')
    df = df.dropna(subset=['Hugo_Symbol'])
    df_by_patient = df.transpose()
    df_by_patient.columns = df_by_patient.loc['Hugo_Symbol']

    clinical_and_cnv = df_by_patient.join(relevant_clinical, how='inner')

    cancer_type = util.get_cancer_type(copy_number)
    outfile = os.path.join(outdir, cancer_type + '.cbioportal_zscores.csv')
    formatstring = '{0}, {1}, {2}, {3}\n'

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients\n')
        for gene in clinical_and_cnv:
            if gene in ('Time', 'Censor'):  # skip metadata
                continue
            if clinical_and_cnv[gene].count() <= 10:
                continue
            cox_dict = analysis.do_cox(clinical_and_cnv.Time,
                                       clinical_and_cnv.Censor,
                                       clinical_and_cnv[gene],
                                       float_time=True)
            # Ensure the written gene name starts with a single quote.
            if gene[0] != '\'':
                gene = '\'' + gene
            out.write(
                formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                    cox_dict['n']))
def make_mutation_zscores(mutation, clinical, gene_list):
    """Run a Cox regression per gene for the listed genes in a mutation file.

    Args:
        mutation: path to the mutation file; the cancer type is derived from
            it with ``util.get_cancer_type``.
        clinical: path to the clinical file (read with
            ``util.get_clinical_data``).
        gene_list: object exposing ``.values`` with the gene symbols to keep
            (e.g. a pandas Series).

    Returns:
        DataFrame with one row per gene whose mutation count is at least
        ``MUTATION_PERCENT`` of the patients, holding the ``analysis.do_cox``
        result plus ``cancer_type``, ``gene`` and ``num_mutations``.
    """
    cancer_type = util.get_cancer_type(mutation)

    clinical_data = util.get_clinical_data(clinical)
    mutation_data = mutation_base.prep_mutation_data(mutation, clinical_data)

    present_gene_list = list(
        set(gene_list.values) & set(mutation_data.columns.values))
    mutation_and_clinical = mutation_data[present_gene_list].join(
        clinical_data, how='inner')

    num_patients = len(mutation_and_clinical.index)
    min_mutations = MUTATION_PERCENT * num_patients

    # Collect rows and build the frame once; DataFrame.append in a loop copies
    # the whole frame each time and no longer exists in pandas 2.
    rows = []
    for gene in mutation_and_clinical:
        if gene in ['time', 'censor']:
            continue
        num_mutations = mutation_and_clinical[gene].sum()
        if num_mutations < min_mutations:
            continue
        cox_dict = analysis.do_cox(mutation_and_clinical.time,
                                   mutation_and_clinical.censor,
                                   mutation_and_clinical[gene])
        cox_dict['cancer_type'] = cancer_type
        cox_dict['gene'] = gene
        cox_dict['num_mutations'] = num_mutations
        rows.append(cox_dict)

    results = pd.DataFrame(rows)
    print(results)
    return results
def main():
    """Cox regression on per-patient copy-number break counts, per cancer type.

    For each copy-number file, counts breaks per sample, joins them with the
    matching clinical file, writes ``<cancer_type>_breaks.csv`` and collects
    the ``analysis.do_cox`` result. All results are written to
    ``cox_results.csv`` in the output directory.

    Raises:
        IndexError: if no clinical file matches ``*<cancer_type>*.txt``.
    """
    copy_number_loc, clinical, outdir = get_options()
    cnas = util.remove_extraneous_files(os.listdir(copy_number_loc))

    # Rows are collected in a list and framed once; DataFrame.append in a loop
    # is quadratic and was removed in pandas 2.
    rows = []
    for c in cnas:
        cancer_type = util.get_cancer_type(c)
        print(cancer_type)

        clinical_file = glob.glob(
            os.path.join(clinical, '*' + cancer_type + '*.txt'))[0]
        clin = util.get_clinical_data(clinical_file)

        patient_breaks = count_breaks(os.path.join(copy_number_loc, c))
        patient_breaks = patient_breaks.reset_index()
        patient_breaks = util.maybe_clear_non_01s(patient_breaks, 'Sample',
                                                  cancer_type)
        patient_breaks = util.add_identifier_column(patient_breaks, 'Sample')
        patient_breaks = patient_breaks.set_index('identifier')
        patient_breaks = patient_breaks.drop('Sample', axis=1)

        breaks_and_clin = patient_breaks.join(clin, how='inner')
        breaks_and_clin.to_csv(
            os.path.join(outdir, cancer_type + '_breaks.csv'))

        cox = analysis.do_cox(breaks_and_clin.time, breaks_and_clin.censor,
                              breaks_and_clin.breaks)
        cox['cancer_type'] = cancer_type
        rows.append(cox)

    results = pd.DataFrame(rows)
    results.to_csv(os.path.join(outdir, 'cox_results.csv'))
def make_cox(g, clinical_data, cancer_type, cutoff_percent, seq_patients,
             outdir):
    """Cox regression on "any mutation" for one gene-codon group.

    Args:
        g: DataFrame for a single group, with ``Gene-Codon``, ``index`` and
            ``mutated`` columns; after ``reset_index`` its fourth index level
            (``level_3``) is used as the patient key.
        clinical_data: clinical DataFrame with ``time`` and ``censor``.
        cancer_type: label stored in the result and used in the file name.
        cutoff_percent: minimum fraction of sequenced patients that must be
            mutated (strictly greater than) for the group to be analysed.
        seq_patients: index/array of sequenced patient ids.
        outdir: directory receiving the per-group clinical CSV.

    Returns:
        ``None`` when the group is below the cutoff, otherwise a Series of the
        ``analysis.do_cox`` result with extra bookkeeping fields.
    """
    gene_codon = g['Gene-Codon'].iloc[0]
    by_patient = g.reset_index().pivot(index='level_3',
                                       columns='index',
                                       values='mutated')
    clinical_data = clinical_data.loc[seq_patients]

    num_seq_mutated = len(by_patient)
    min_mutated = cutoff_percent * seq_patients.size
    if num_seq_mutated <= min_mutated:
        return None

    print(gene_codon)
    print('%s %s %s' % (num_seq_mutated, seq_patients.size, min_mutated))
    mutations_per_patient = by_patient.sum(axis=1)
    print('num patients w mut in more than 1 codon: %s' %
          ((mutations_per_patient > 1).sum(),))

    by_patient['any_mut'] = mutations_per_patient >= 1
    # Outer join brings in unmutated sequenced patients; they get any_mut = 0.
    by_patient = by_patient.join(clinical_data, how='outer')
    by_patient['any_mut'] = by_patient[['any_mut']].fillna(0).astype(int)
    by_patient = by_patient.dropna(subset=['time', 'censor'], how='any')

    cox_dict = analysis.do_cox(by_patient.time, by_patient.censor,
                               by_patient.any_mut)
    cox_dict['cancer_type'] = cancer_type
    cox_dict['num_mutated_w_clinical'] = by_patient['any_mut'].sum()
    cox_dict['num_sequence_mutated'] = num_seq_mutated

    csv_name = (cancer_type + '_' + gene_codon + '_' + str(cutoff_percent) +
                'cutoff_clinical.csv')
    by_patient.to_csv(os.path.join(outdir, csv_name))
    return pd.Series(cox_dict)
def main():
    """Cox regression and KM split on tumor purity, per cancer type.

    The header file (no header row, first column as index) maps each cancer
    type to the name of its purity column, stored in column ``1``. For every
    clinical file, runs ``analysis.do_cox`` on that purity column, draws a KM
    plot split at the mean purity, and finally writes all Cox results to
    ``simple_purity_zscores.csv`` in the output directory.
    """
    clinical_dir, output_dir, header_file = get_options()
    headers = pd.read_csv(header_file, index_col=0, header=None)

    clinical_files = util.remove_extraneous_files(os.listdir(clinical_dir))
    zscore_data = {}
    for f in clinical_files:
        clinical_path = os.path.join(clinical_dir, f)
        cancer_type = util.get_cancer_type(f)

        # .loc replaces DataFrame.get_value, which was deprecated and later
        # removed from pandas.
        purity_header = headers.loc[cancer_type, 1]
        clinical = util.get_clinical_data(clinical_path,
                                          extra_rows=[purity_header])

        zscore_data[cancer_type] = analysis.do_cox(clinical.time,
                                                   clinical.censor,
                                                   clinical[purity_header])

        purity_mean = clinical[purity_header].mean()
        print(purity_mean)
        # 0 = at or below mean purity, 1 = above mean purity.
        clinical['km_split'] = np.where(
            clinical[purity_header] <= purity_mean, 0, 1)
        analysis.do_km(cancer_type, clinical.time, clinical.censor,
                       clinical.km_split, output_dir)

    out_df = pd.DataFrame(zscore_data).transpose()
    out_df.to_csv(os.path.join(output_dir, 'simple_purity_zscores.csv'))
def make_cn_zscores(copy_number, clinical, interesting_genes=None,
                    outdir='.'):
    """Cox regression on trichotomized copy number for selected genes.

    Each gene's copy number is mapped to -1 / 0 / 1 around the +/-0.3
    thresholds and stored in a ``<gene>_split`` column, which is the Cox
    covariate. The augmented frame is written to
    ``<cancer_type>_trichotomized.csv`` in ``outdir``.

    Args:
        copy_number: path to a CSV whose first column is the index; it is
            transposed before joining with the clinical data.
        clinical: path to the clinical file.
        interesting_genes: object whose ``.index`` holds gene names without
            the leading single quote used in the copy-number file. ``None``
            (the default) analyses every gene in the file; previously the
            default raised ``AttributeError``.
        outdir: directory for the trichotomized CSV.

    Returns:
        List of ``analysis.do_cox`` results (with ``gene`` and
        ``cancer_type`` added) for genes with more than 10 non-null values.
    """
    clinical_data = util.get_clinical_data(clinical)
    cnv_by_patient = pd.read_csv(copy_number, index_col=0).transpose()
    cancer_type = util.get_cancer_type(copy_number)

    if interesting_genes is not None:
        # Gene columns in the copy-number file carry a leading single quote.
        relevant_genes = list('\'' + interesting_genes.index)
        cnv_by_patient = cnv_by_patient[relevant_genes]
    cnv = cnv_by_patient.join(clinical_data, how='inner')

    results = []
    # Snapshot the columns: the loop adds '<gene>_split' columns to cnv.
    for gene in list(cnv.columns):
        if gene in ('time', 'censor'):  # skip metadata
            continue
        if cnv[gene].count() <= 10:
            continue

        split_col = gene + '_split'
        cnv[split_col] = np.nan
        # Assignment order matters at the boundaries: -0.3 ends up 0 and
        # 0.3 ends up 1.
        cnv.loc[cnv[gene] <= -0.3, split_col] = -1
        cnv.loc[cnv[gene].between(-0.3, 0.3), split_col] = 0
        cnv.loc[cnv[gene] >= 0.3, split_col] = 1

        cox_dict = analysis.do_cox(cnv.time, cnv.censor, cnv[split_col])
        cox_dict['gene'] = gene
        cox_dict['cancer_type'] = cancer_type
        results.append(cox_dict)

    cnv.to_csv(os.path.join(outdir, cancer_type + '_trichotomized.csv'))
    return results
def do_single_cancer_type_cna(name, clinical, cna, outdir):
    """Write per-gene Cox z-scores for one cancer type's copy-number data.

    Args:
        name: cancer type name; spaces become ``-`` in the output file name.
        clinical: clinical DataFrame with ``Time`` and ``Censor`` columns.
        cna: copy-number DataFrame; it is transposed before being joined on
            the clinical index.
        outdir: directory receiving ``<name>.cnas.out.csv``.

    Genes for which R raises an error are reported and skipped.
    """
    cna = cna.T
    # Remove the annotation fields from whichever axis holds them. The
    # original tested the columns but dropped from the index, so the two
    # never agreed.
    for axis in (0, 1):
        present = [label for label in ('Chromosome', 'Location')
                   if label in cna.axes[axis]]
        if present:
            cna = cna.drop(present, axis=axis)
    print('Patient count for CNAs: %s' % (cna.shape,))

    cnas_and_clinical = cna.join(clinical, how='inner')
    num_patients = cnas_and_clinical.shape[0]
    print('num patients: %s' % (num_patients,))

    formatstring = '{0}, {1}, {2}, {3}\n'
    outfile = os.path.join(outdir, name.replace(' ', '-') + '.cnas.out.csv')
    print(outfile)

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients\n')
        for gene in cnas_and_clinical:
            if gene in ['Time', 'Censor']:
                continue
            try:
                cox_dict = analysis.do_cox(cnas_and_clinical.Time,
                                           cnas_and_clinical.Censor,
                                           cnas_and_clinical[gene])
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        cox_dict['n']))
            except rpy2.rinterface.RRuntimeError:
                print('Skipped  %s due to R error.' % (gene,))
def calculate_cox(data, gene):
    """Run a Cox regression for one column, tolerating sparse data and R errors.

    Args:
        data: DataFrame with ``time`` and ``censor`` columns plus ``gene``.
        gene: name of the covariate column.

    Returns:
        The ``analysis.do_cox`` result when the column has more than 10
        non-null values and R succeeds; otherwise a ``defaultdict`` that
        yields NaN for every key.
    """
    if data[gene].count() > 10:
        try:
            return analysis.do_cox(data.time, data.censor, data[gene])
        except rpy2.rinterface.RRuntimeError:
            print('WARN: skipped %s due to R error' % (gene,))
    return collections.defaultdict(lambda: np.nan)
def calculate_any_change_zscores(input_file):
    """Cox regression on whether a patient has any recorded change.

    Reads ``input_file`` (first column as index), drops rows missing
    ``time``, ``censor`` or ``copy number``, and flags a row as changed when
    ``continuous_len`` is not NaN.

    Returns:
        The ``analysis.do_cox`` result with ``any_change_count`` added.
    """
    required = ['time', 'censor', 'copy number']
    input_data = pd.read_csv(input_file, index_col=0)
    input_data = input_data.dropna(subset=required, how='any')
    print(input_data.shape)

    input_data['any_change'] = ~np.isnan(input_data.continuous_len)
    any_change_zscore = analysis.do_cox(input_data.time, input_data.censor,
                                        input_data['any_change'])
    any_change_zscore['any_change_count'] = input_data.any_change.sum()
    print(any_change_zscore)
    return any_change_zscore
def calculate_broad_change_zscores(input_file):
    """Cox regression on whether a patient has a broad change.

    Reads ``input_file`` (first column as index), drops rows missing
    ``time``, ``censor`` or ``copy number``, and flags a row as broad when
    ``continuous_len`` exceeds ``FOCAL_CUTOFF``.

    Returns:
        The ``analysis.do_cox`` result with ``broad_count`` added.
    """
    required = ['time', 'censor', 'copy number']
    input_data = pd.read_csv(input_file, index_col=0)
    input_data = input_data.dropna(subset=required, how='any')

    input_data['broad'] = input_data.continuous_len > FOCAL_CUTOFF
    broad_zscore = analysis.do_cox(input_data.time, input_data.censor,
                                   input_data['broad'])
    broad_zscore['broad_count'] = input_data.broad.sum()
    print(broad_zscore)
    return broad_zscore
def make_zscores(data, clinical, outdir):
    """Write per-gene mutation Cox z-scores for one subtype clinical file.

    Args:
        data: path to the mutation file; the cancer type is derived from it.
        clinical: path to a clinical CSV whose name has the subtype as its
            second dot-separated field and whose last column is the subtype
            column.
        outdir: directory receiving ``<cancer_type>_<subtype>_zscores.csv``.

    Genes mutated in fewer than ``MUTATION_PERCENT`` of the shared patients,
    or for which R raises an error, are counted as skipped.
    """
    subtype = clinical.split('.')[1]
    clinical_data = pd.read_csv(clinical, index_col=0, header=0)
    clinical_data = clinical_data.dropna(subset=['time', 'censor'], how='any')
    subtype_col = clinical_data.columns[-1]

    cancer_type = util.get_cancer_type(data)
    df = mb.prep_mutation_data(data, clinical_data)
    print(cancer_type)

    num_patients = len(set(clinical_data.index) & set(df.index))
    print('Number of patients present in both: %s' % (num_patients,))

    clinical_and_data = df.join(clinical_data, how='inner')
    print('Num patients, other count: %s' % (len(df.index),))

    outfile = os.path.join(outdir,
                           cancer_type + '_' + subtype + '_zscores.csv')
    formatstring = '{0}, {1}, {2}, {3}, {4}\n'
    metadata_columns = ('time', 'censor', 'index', subtype_col)

    zscore_count = 0
    zscore_skipped = 0
    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients,num mutations\n')
        for gene in clinical_and_data:
            if gene in metadata_columns:  # skip metadata
                continue
            num_mutations = clinical_and_data[gene].sum()
            if num_mutations < MUTATION_PERCENT * num_patients:
                zscore_skipped += 1
                continue
            try:
                cox_dict = analysis.do_cox(clinical_and_data.time,
                                           clinical_and_data.censor,
                                           clinical_and_data[gene])
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        cox_dict['n'], num_mutations))
                zscore_count += 1
            except rpy2.rinterface.RRuntimeError:
                print('WARN: skipped  %s  due to R error' % (gene,))
                zscore_skipped += 1

    # minus time, censor, index
    print('Total: %s' % (clinical_and_data.shape[1] - 3,))
    print('Output length: %s' % (zscore_count,))
    print('Skipped: %s' % (zscore_skipped,))
def calculate_zscores_for_file(mutation_file, clinical_file, outdir, hgnc):
    """Write per-gene mutation Cox z-scores for one ICGC mutation file.

    Args:
        mutation_file: path to the mutation file; also determines the cancer
            type via ``get_icgc_cancer_type``.
        clinical_file: clinical input passed through to ``prep_data``.
        outdir: directory receiving the z-score CSV and one
            ``<gene>_data.csv`` per analysed gene.
        hgnc: passed through to ``prep_data``.

    Only genes mutated in at least ``MUTATION_PERCENT`` of the patients are
    analysed. Gene names are written with a leading single quote.
    """
    df, clinical_data_with_sequenced_patients, num_patients = prep_data(
        mutation_file, clinical_file, hgnc)
    cancer_type = get_icgc_cancer_type(mutation_file)
    print(cancer_type)

    formatstring = '{0}, {1}, {2}, {3}, {4}\n'
    outfile = os.path.join(
        outdir, cancer_type + '_mutation_percent_' + str(MUTATION_PERCENT) +
        '.icgc_zscores.out.csv')

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients\n')

        # For every gene, combine the clinical data with the mutation data.
        for gene, gene_df in df.groupby(level=u'gene_affected'):
            mutated_patient_list = gene_df.index.get_level_values(
                'icgc_donor_id').unique()
            num_mutations = len(mutated_patient_list)
            if num_mutations < MUTATION_PERCENT * num_patients:
                continue

            # Mutated patients get 1; the right join adds every other
            # sequenced patient, who then gets 0.
            analysis_data = pd.DataFrame({'mutated': np.ones(num_mutations)},
                                         index=mutated_patient_list)
            analysis_data = analysis_data.join(
                clinical_data_with_sequenced_patients, how='right')
            analysis_data['mutated'] = analysis_data['mutated'].fillna(0)

            print('Doing analysis for %s: mutated %d of %d' %
                  (gene, num_mutations, num_patients))
            cox_dict = analysis.do_cox(analysis_data['Time'],
                                       analysis_data['Censor'],
                                       analysis_data['mutated'])
            if cox_dict['n'] != len(analysis_data['Time']):
                print('ERROR')

            # Ensure the written gene name starts with a single quote; the
            # data file name uses the name without it.
            if gene[0] != '\'':
                gene = '\'' + gene
            out.write(
                formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                    num_mutations, cox_dict['n']))
            analysis_data.to_csv(os.path.join(outdir, gene[1:] + '_data.csv'),
                                 columns=['Time', 'Censor', 'mutated'],
                                 index_label='patient')
def make_zscores(copy_number, clinical, outdir, metagene_file=None):
    """Write per-gene copy-number Cox z-scores, optionally with a metagene.

    Args:
        copy_number: path to a CSV with ``Chromosome``, ``Location`` and
            ``Symbol`` fields; the cancer type is derived from this path.
        clinical: path to the clinical file.
        outdir: directory receiving the z-score CSV.
        metagene_file: optional metagene input. When given,
            ``analysis.do_metagene_cox`` is used and the metagene z/p values
            are written too, to ``<cancer_type>_metagene_zscores.csv``.

    Genes with 10 or fewer non-null values are skipped.
    """
    clinical_data = util.get_clinical_data(clinical)
    df = pd.read_csv(copy_number)
    df = df.drop(['Chromosome', 'Location'], axis=1)
    df_by_patient = df.transpose()
    df_by_patient.columns = df_by_patient.loc['Symbol']

    clinical_and_cnv = df_by_patient.join(clinical_data, how='inner')
    cancer_type = util.get_cancer_type(copy_number)

    if metagene_file:
        formatstring = '{0}, {1}, {2}, {3}, {4}, {5}\n'
        header = ('gene,zscore,pvalue,metagene-zscore,metagene-pvalue,'
                  'num patients\n')
        outfile = os.path.join(outdir, cancer_type + '_metagene_zscores.csv')
        print("Processing metagene...")
        metagene = metagene_lib.get_metagene_data(metagene_file, cancer_type)
        print("Complete")
    else:
        formatstring = '{0}, {1}, {2}, {3}\n'
        header = 'gene,zscore,pvalue,num patients\n'
        outfile = os.path.join(outdir, cancer_type + '_zscores.csv')

    with open(outfile, 'w') as out:
        out.write(header)
        for gene in clinical_and_cnv:
            if gene in ('time', 'censor'):  # skip metadata
                continue
            if clinical_and_cnv[gene].count() <= 10:
                continue
            if metagene_file:
                cox_dict = analysis.do_metagene_cox(clinical_and_cnv.time,
                                                    clinical_and_cnv.censor,
                                                    clinical_and_cnv[gene],
                                                    metagene)
                row = formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                          cox_dict['metagene-z'],
                                          cox_dict['metagene-p'],
                                          cox_dict['n'])
            else:
                cox_dict = analysis.do_cox(clinical_and_cnv.time,
                                           clinical_and_cnv.censor,
                                           clinical_and_cnv[gene])
                row = formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                          cox_dict['n'])
            out.write(row)
def calculate_broad_change_restricted_zscores(input_file):
    """Cox regression on broad changes, excluding patients with focal changes.

    Reads ``input_file`` (first column as index), drops rows missing
    ``time``, ``censor`` or ``copy number``, removes rows whose
    ``continuous_len`` is at or below ``FOCAL_CUTOFF``, and flags the
    remaining rows as broad when ``continuous_len`` exceeds the cutoff.

    Returns:
        The ``analysis.do_cox`` result with ``broad_count`` added.
    """
    required = ['time', 'censor', 'copy number']
    input_data = pd.read_csv(input_file, index_col=0)
    input_data = input_data.dropna(subset=required, how='any')
    print(input_data.shape)

    # ignore patients that have a focal change
    focal_rows = input_data[input_data.continuous_len <= FOCAL_CUTOFF].index
    input_data = input_data.drop(focal_rows)

    input_data['broad'] = input_data.continuous_len > FOCAL_CUTOFF
    broad_restricted_zscore = analysis.do_cox(input_data.time,
                                              input_data.censor,
                                              input_data['broad'])
    broad_restricted_zscore['broad_count'] = input_data.broad.sum()
    print(broad_restricted_zscore)
    return broad_restricted_zscore
def do_single_cancer_type_mutation(cancer_type, cancer_type_clinical,
                                   mutation_file, name_conversions, outdir):
    """Write per-gene mutation Cox z-scores for one cancer type.

    Args:
        cancer_type: cancer type name; spaces become ``-`` in the z-score
            file name.
        cancer_type_clinical: clinical DataFrame (``Time``, ``Censor``)
            indexed by patient.
        mutation_file: mutation DataFrame with patients as columns.
        name_conversions: DataFrame indexed by gene name with a ``TCGA``
            column giving the name to write instead.
        outdir: base directory; output goes to its ``mutations`` subfolder.

    Genes for which R raises an error are reported and skipped.
    """
    patients_in_both = list(
        set(mutation_file.columns).intersection(
            set(cancer_type_clinical.index)))
    cancer_type_mutations = mutation_file[patients_in_both].T
    print('Patient count for Mutations: %s' % (cancer_type_mutations.shape,))

    merged = cancer_type_mutations.join(cancer_type_clinical)
    num_patients = merged.shape[0]

    formatstring = '{0}, {1}, {2}, {3}, {4}\n'
    outfile = os.path.join(outdir, 'mutations',
                           cancer_type.replace(' ', '-') + '.out.csv')
    print(outfile)

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients\n')
        for gene in merged:
            if gene in ['Time', 'Censor']:
                continue
            if merged[gene].sum() < MUTATION_PERCENT * num_patients:
                continue
            try:
                cox_dict = analysis.do_cox(merged.Time, merged.Censor,
                                           merged[gene])
                # Keep the column name; `gene` may be replaced by its TCGA
                # name for output.
                orig_gene = gene
                if gene in name_conversions.index:
                    tcga_name = name_conversions['TCGA'].loc[gene]
                    print('Converting gene %s to %s' % (gene, tcga_name))
                    gene = tcga_name
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        merged[orig_gene].sum(),
                                        cox_dict['n']))
                merged.to_csv(os.path.join(
                    outdir, 'mutations/',
                    cancer_type + '_' + gene + '_mutations.csv'),
                              columns=['Time', 'Censor', orig_gene])
            except rpy2.rinterface.RRuntimeError:
                print('Skipped  %s due to R error.' % (gene,))
def main():
    """Re-run the Cox regression on every per-cancer-type TP53 data file.

    Looks for ``*_TP53_data.csv`` one directory level below the input
    directory, takes the containing folder name as the cancer type, and
    writes all results to ``tcga_p53_mutation_zscores.csv``.
    """
    indir, outdir = get_options()
    print(os.path.join(indir, '*' + 'TP53' + '*'))
    tp53_files = glob.glob(os.path.join(indir, '*', '*' + '_TP53_data.csv'))

    cox_rows = []
    for path in tp53_files:
        print(path)
        data = pd.read_csv(path, index_col=0)
        cox_dict = analysis.do_cox(data.time, data.censor, data.mutated)
        cox_dict['cancer_type'] = os.path.basename(os.path.dirname(path))
        cox_rows.append(cox_dict)

    results_df = pd.DataFrame(cox_rows)
    print(results_df)
    indexed = results_df.set_index('cancer_type')
    indexed.to_csv(os.path.join(outdir, 'tcga_p53_mutation_zscores.csv'))
def make_zscores(data, clinical, hypermutated_patients, outdir):
    """Write per-gene mutation Cox z-scores after removing hypermutated patients.

    Args:
        data: path to the mutation file; the cancer type is derived from it.
        clinical: path to the clinical file.
        hypermutated_patients: object whose ``'patients'`` entry lists the
            patient ids to exclude.
        outdir: directory receiving
            ``<cancer_type>_non-hypermutated_zscores.csv``.

    Genes mutated in fewer than ``MUTATION_PERCENT`` of the shared patients,
    or for which R raises an error, are skipped.
    """
    clinical_data = util.get_clinical_data(clinical)
    hypermutated = set(clinical_data.index).intersection(
        hypermutated_patients['patients'])
    print('Hypermutated in clinical file: %s' % (len(hypermutated),))
    clinical_data = clinical_data.drop(hypermutated)

    cancer_type = util.get_cancer_type(data)
    df = mb.prep_mutation_data(data, clinical_data)
    print('Remaining hypermutated: %s' %
          (set(df.index).intersection(hypermutated),))

    num_patients = len(set(clinical_data.index) & set(df.index))
    print('Number of patients present in both: %s' % (num_patients,))

    clinical_and_data = df.join(clinical_data, how='inner')
    print('Num patients, other count: %s' % (len(df.index),))

    outfile = os.path.join(outdir,
                           cancer_type + '_non-hypermutated_zscores.csv')
    formatstring = '{0}, {1}, {2}, {3}, {4}\n'

    zscore_count = 0
    zscore_skipped = 0
    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients,num mutations\n')
        for gene in clinical_and_data:
            if gene in ('time', 'censor', 'index'):  # skip metadata
                continue
            num_mutations = clinical_and_data[gene].sum()
            if num_mutations < MUTATION_PERCENT * num_patients:
                zscore_skipped += 1
                continue
            try:
                cox_dict = analysis.do_cox(clinical_and_data.time,
                                           clinical_and_data.censor,
                                           clinical_and_data[gene])
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        cox_dict['n'], num_mutations))
                zscore_count += 1
            except rpy2.rinterface.RRuntimeError:
                print('WARN: skipped  %s  due to R error' % (gene,))
                zscore_skipped += 1
def do_single_cancer_type_mutation(name, clinical, mutations, outdir):
    """Write per-gene mutation Cox z-scores for one cancer type.

    Args:
        name: cancer type name; spaces become ``-`` in the z-score file name.
        clinical: clinical DataFrame with ``Time`` and ``Censor`` columns.
        mutations: mutation DataFrame sharing the clinical index.
        outdir: directory receiving ``<name>.mutations.out.csv``; per-gene
            data files go to its ``raw_mutations`` subfolder.

    Only genes mutated in at least ``MUTATION_PERCENT`` of the joined patients
    are analysed; genes for which R raises an error are reported and skipped.
    """
    mutations_and_clinical = mutations.join(clinical, how='inner')
    num_patients = mutations_and_clinical.shape[0]
    print('Number of patients in both: %s' % (num_patients,))

    formatstring = '{0}, {1}, {2}, {3}, {4}\n'
    outfile = os.path.join(outdir,
                           name.replace(' ', '-') + '.mutations.out.csv')
    print(outfile)

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients\n')
        for gene in mutations_and_clinical:
            if gene in ['Time', 'Censor']:
                continue
            below_cutoff = (mutations_and_clinical[gene].sum() <
                            MUTATION_PERCENT * num_patients)
            if below_cutoff:
                continue
            try:
                cox_dict = analysis.do_cox(mutations_and_clinical.Time,
                                           mutations_and_clinical.Censor,
                                           mutations_and_clinical[gene])
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        mutations_and_clinical[gene].sum(),
                                        cox_dict['n']))
                raw_path = os.path.join(
                    outdir, 'raw_mutations/',
                    name + '_' + gene + '_mutations.csv')
                mutations_and_clinical.to_csv(
                    raw_path, columns=['Time', 'Censor', gene])
            except rpy2.rinterface.RRuntimeError:
                print('Skipped  %s due to R error.' % (gene,))
def make_cnv_zscores(copy_number, clinical, gene_list):
    """Run a copy-number Cox regression for each gene in ``gene_list``.

    Args:
        copy_number: path to a CSV with a ``Symbol`` field; the cancer type
            is derived from this path with ``util.get_cancer_type``.
        clinical: path to the clinical file.
        gene_list: gene symbols to select from the copy-number data.

    Returns:
        DataFrame with one row per gene holding the ``analysis.do_cox``
        result plus ``cancer_type`` and ``gene``.
    """
    cancer_type = util.get_cancer_type(copy_number)

    cna_by_patient = pd.read_csv(copy_number).transpose()
    cna_by_patient.columns = cna_by_patient.loc['Symbol']
    cna_by_patient_gene_list_only = cna_by_patient[gene_list]

    clinical_data = util.get_clinical_data(clinical)
    clinical_and_cnv = cna_by_patient_gene_list_only.join(clinical_data,
                                                          how='inner')

    # Collect rows and build the frame once; DataFrame.append in a loop copies
    # the whole frame each time and no longer exists in pandas 2.
    rows = []
    for gene in clinical_and_cnv:
        if gene in ['time', 'censor']:
            continue
        cox_dict = analysis.do_cox(clinical_and_cnv.time,
                                   clinical_and_cnv.censor,
                                   clinical_and_cnv[gene])
        cox_dict['cancer_type'] = cancer_type
        cox_dict['gene'] = gene
        rows.append(cox_dict)
    return pd.DataFrame(rows)
def calculate_zscores_for_file(mutation_file, clinical_data, gene_list,
                               cancer_type):
    """Run a mutation Cox regression for each listed gene in one file.

    Args:
        mutation_file: mutation input passed to ``prep_data``.
        clinical_data: clinical input passed to ``prep_data``.
        gene_list: genes to keep; matched against the first index level.
        cancer_type: not used by this function.

    Returns:
        List of ``analysis.do_cox`` results, each with ``gene`` and
        ``num_mutations`` added. No mutation-frequency cutoff is applied.
    """
    df, clinical_data_with_sequenced_patients, num_patients = prep_data(
        mutation_file, clinical_data)
    df = df[df.index.get_level_values(0).isin(gene_list)]

    results = []
    # For every gene, combine the clinical data with the mutation data.
    for gene, gene_df in df.groupby(level=u'Hugo_Symbol'):
        mutated_patient_list = gene_df.index.get_level_values(
            'Tumor_Sample_Barcode').unique()
        num_mutations = len(mutated_patient_list)

        # Mutated patients get 1; the right join adds every other sequenced
        # patient, who then gets 0.
        analysis_data = pd.DataFrame({'mutated': np.ones(num_mutations)},
                                     index=mutated_patient_list)
        analysis_data = analysis_data.join(
            clinical_data_with_sequenced_patients, how='right')
        analysis_data['mutated'] = analysis_data['mutated'].fillna(0)

        print('Doing analysis for %s: mutated %d of %d' %
              (gene, num_mutations, num_patients))
        cox_dict = analysis.do_cox(analysis_data['Time'],
                                   analysis_data['Censor'],
                                   analysis_data['mutated'])
        cox_dict['gene'] = gene
        cox_dict['num_mutations'] = num_mutations
        if cox_dict['n'] != len(analysis_data['Time']):
            print('ERROR')

        # NOTE(review): the quoted name is never used afterwards, so the
        # stored 'gene' stays unquoted -- confirm whether that is intended.
        if gene[0] != '\'':
            gene = '\'' + gene
        results.append(cox_dict)
    return results
def calculate_cox(mutation, clinical_data, outdir):
    """Cox regression on the number of mutated driver genes per patient.

    Args:
        mutation: path to the mutation file; the cancer type is the part of
            its basename before the first ``_`` / ``.``.
        clinical_data: clinical DataFrame with ``time`` and ``censor``.
        outdir: directory receiving
            ``<cancer_type>.driver_mutation_count.zscores.out.csv``, which
            holds a one-line Z/P/n summary followed by the per-patient table.

    Returns:
        The ``analysis.do_cox`` result.
    """
    df = prep_data(mutation, clinical_data)
    df = df.join(clinical_data, how='inner')

    # prep output file
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    print(cancer_type)
    outfile = os.path.join(
        outdir, cancer_type + '.driver_mutation_count.zscores.out.csv')

    print('Missing driver genes: %s' %
          (set(COMMONLY_MUTATED) - set(df.columns),))
    present_driver_genes = list(
        set(df.columns).intersection(set(COMMONLY_MUTATED)))
    print(present_driver_genes)

    # Explicit copy: columns are added below, and assigning onto a selection
    # of `df` triggers pandas' SettingWithCopy warning.
    driver_mutations = df[present_driver_genes].copy()
    print(driver_mutations)
    driver_mutations['driver_mutation_count'] = driver_mutations.sum(
        axis=1, skipna=True)
    driver_mutations['time'] = df['time']
    driver_mutations['censor'] = df['censor']

    cox_dict = analysis.do_cox(driver_mutations['time'],
                               driver_mutations['censor'],
                               driver_mutations['driver_mutation_count'])

    with open(outfile, 'w') as out:
        out.write('Z: ' + str(cox_dict['z']) + ', P: ' + str(cox_dict['p']) +
                  ', n: ' + str(cox_dict['n']) + '\n')
        driver_mutations.to_csv(out)
    return cox_dict
def do_single_cancer_type_cna(cancer_type, cancer_type_clinical, cna_file,
                              name_conversions, outdir):
    """Write per-gene copy-number Cox z-scores for one cancer type.

    Args:
        cancer_type: cancer type name; spaces become ``-`` in the file name.
        cancer_type_clinical: clinical DataFrame (``Time``, ``Censor``)
            indexed by patient.
        cna_file: copy-number DataFrame with patients as columns.
        name_conversions: DataFrame indexed by gene name with a ``TCGA``
            column giving the name to write instead.
        outdir: base directory; output goes to ``cnas/<cancer_type>.out.csv``.

    Genes for which R raises an error are reported and skipped.
    """
    print('Duplicate count: %s' %
          (cancer_type_clinical.index.duplicated(keep='first').sum(),))
    cancer_type_cnas = cna_file[cancer_type_clinical.index].T
    print('Patient count for CNAs: %s' % (cancer_type_cnas.shape,))

    cancer_type_cnas_and_clinical = cancer_type_cnas.join(cancer_type_clinical)
    print(cancer_type_cnas_and_clinical.shape)
    num_patients = cancer_type_cnas_and_clinical.shape[0]
    print('num patients: %s' % (num_patients,))

    formatstring = '{0}, {1}, {2}, {3}\n'
    outfile = os.path.join(outdir, 'cnas',
                           cancer_type.replace(' ', '-') + '.out.csv')
    print(outfile)

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients\n')
        for gene in cancer_type_cnas_and_clinical:
            if gene in ['Time', 'Censor']:
                continue
            try:
                cox_dict = analysis.do_cox(
                    cancer_type_cnas_and_clinical.Time,
                    cancer_type_cnas_and_clinical.Censor,
                    cancer_type_cnas_and_clinical[gene])
                if gene in name_conversions.index:
                    tcga_name = name_conversions['TCGA'].loc[gene]
                    print('Converting gene %s to %s' % (gene, tcga_name))
                    gene = tcga_name
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        cox_dict['n']))
            except rpy2.rinterface.RRuntimeError:
                print('Skipped  %s due to R error.' % (gene,))
def make_zscores(copy_number, clinical_data, outdir):
    """Write per-gene Cox z-scores for copy number capped at 10.

    Args:
        copy_number: path to a comma-separated file with a ``Symbol`` field;
            the cancer type is derived from this path.
        clinical_data: clinical DataFrame with ``Time`` and ``Censor``.
        outdir: directory receiving ``<cancer_type>_zscores.csv``.
    """
    df = pd.read_csv(copy_number, sep=',')
    df_by_patient = df.transpose()
    df_by_patient.columns = df_by_patient.loc['Symbol']
    # Cap values at 10.
    df_by_patient = df_by_patient.clip(upper=10)

    clinical_and_cnv = df_by_patient.join(clinical_data, how='inner')

    cancer_type = util.get_cancer_type(copy_number)
    outfile = os.path.join(outdir, cancer_type + '_zscores.csv')
    formatstring = '{0}, {1}, {2}, {3}\n'

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients\n')
        for gene in clinical_and_cnv:
            if gene in ('Time', 'Censor'):  # skip metadata
                continue
            cox_dict = analysis.do_cox(clinical_and_cnv.Time,
                                       clinical_and_cnv.Censor,
                                       clinical_and_cnv[gene])
            out.write(
                formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                    cox_dict['n']))
def calculate_cox(mutation, clinical, outdir, metagene_file=None,
                  make_km=False):
    """Write per-gene mutation Cox z-scores, optionally with a metagene or KM.

    Args:
        mutation: path to the mutation file; the cancer type is the part of
            its basename before the first ``_`` / ``.``.
        clinical: path to the clinical file.
        outdir: directory receiving the z-score CSV (and KM output).
        metagene_file: optional metagene input. When given,
            ``analysis.do_metagene_cox`` is used and metagene z/p values are
            written as well.
        make_km: when true (and no metagene file is given), also draw a KM
            plot per gene and write ``<cancer_type>_<gene>_data.csv`` with
            ``time``, ``censor`` and ``mutated`` columns.

    Only genes mutated in at least ``MUTATION_PERCENT`` of the patients are
    analysed.
    """
    clinical_data = util.get_clinical_data(clinical)
    df = mutation_base.prep_mutation_data(mutation, clinical_data)
    clinical_and_data = df.join(clinical_data, how='inner')
    num_patients = len(clinical_and_data)

    # prep output file
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    out_prefix = cancer_type + '_mutation-fraction-' + str(MUTATION_PERCENT)
    if metagene_file:
        formatstring = '{0}, {1}, {2}, {3}, {4}, {5}\n'
        header = ('gene,zscore,pvalue,metagene-zscore,metagene-pvalue,'
                  'num patients\n')
        outfile = os.path.join(outdir, out_prefix + '_metagene_zscores.csv')
        print("Processing metagene...")
        metagene = metagene_lib.get_metagene_data(metagene_file, cancer_type)
        print("Complete")
    else:
        formatstring = '{0}, {1}, {2}, {3}, {4}\n'
        header = 'gene,zscore,pvalue,num mutations,num patients\n'
        outfile = os.path.join(outdir, out_prefix + '.zscores.out.csv')

    with open(outfile, 'w') as out:
        out.write(header)
        for gene in clinical_and_data:
            if gene in ['time', 'censor']:
                continue
            num_mutations = int(clinical_and_data[gene].sum())
            if num_mutations < MUTATION_PERCENT * num_patients:
                continue

            time = clinical_and_data['time']
            censor = clinical_and_data['censor']
            data = clinical_and_data[gene]

            if metagene_file:
                cox_dict = analysis.do_metagene_cox(time, censor, data,
                                                    metagene)
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        cox_dict['metagene-z'],
                                        cox_dict['metagene-p'],
                                        cox_dict['n']))
                continue

            name = cancer_type + '_' + gene
            if make_km:
                analysis.do_km(name, time, censor, data, outdir)
                # Select with a list (a tuple key raises KeyError) and rename
                # the gene column so the written 'mutated' column exists.
                km_data = clinical_and_data[['time', 'censor', gene]].rename(
                    columns={gene: 'mutated'})
                km_data.to_csv(os.path.join(outdir, name + '_data.csv'),
                               columns=['time', 'censor', 'mutated'])
            cox_dict = analysis.do_cox(time, censor, data)
            out.write(
                formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                    num_mutations, cox_dict['n']))
def calculate_cox(mutation, clinical_data, key, outdir):
    """Per-gene Cox z-scores using only mutations at or above the median VAF.

    Args:
        mutation: path to the mutation file; the cancer type is the part of
            its basename before the first ``_`` / ``.``.
        clinical_data: clinical input passed to ``prep_data``.
        key: passed through to ``prep_data``.
        outdir: directory receiving the z-score CSV, KM output and one
            ``<cancer_type>_<gene>_data.csv`` per analysed gene.

    A gene is analysed when its non-silent mutations hit at least
    ``MUTATION_PERCENT`` of the patients. Within such a gene, only patients
    whose mutation VAF is at or above that gene's median count as mutated.
    """
    df, clinical_data_with_sequenced_patients, num_patients = prep_data(
        mutation, clinical_data, key)

    # prep output file
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    print(cancer_type)
    outfile = os.path.join(
        outdir,
        (cancer_type + '_mutation-fraction-' + str(MUTATION_PERCENT) +
         '_vaf_cutoff-' + str(VARIANT_ALLELE_FREQ_CUTOFF) +
         '.zscores.out.csv'))
    # Each row starts with a single quote in front of the gene name.
    formatstring = '\'{0}, {1}, {2}, {3}, {4}\n'

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients\n')

        for gene, gene_df in df.groupby(level=u'Hugo_Symbol'):
            # Remove silent mutations.
            is_not_silent = gene_df[u'Variant_Classification'] != 'Silent'
            non_silent = gene_df.where(is_not_silent)
            non_silent = non_silent.dropna(subset=[u'Variant_Classification'])

            mutated_patient_list = non_silent.index.get_level_values(
                'identifier').unique()
            num_mutations = len(mutated_patient_list)
            if num_mutations < MUTATION_PERCENT * num_patients:
                continue

            # "Effectively mutated" patients: those whose VAF >= median.
            median_vaf = non_silent['VAF'].median()
            at_or_above_median = non_silent[non_silent['VAF'] >= median_vaf]
            effectively_mutated_patients = (
                at_or_above_median.index.get_level_values(
                    'identifier').unique())
            num_effective_mutations = len(effectively_mutated_patients)

            # Effectively mutated patients get 1; the right join adds every
            # other sequenced patient, who then gets 0.
            analysis_data = pd.DataFrame(
                {'mutated': np.ones(num_effective_mutations)},
                index=effectively_mutated_patients)
            analysis_data = analysis_data.join(
                clinical_data_with_sequenced_patients, how='right')
            analysis_data['mutated'] = analysis_data['mutated'].fillna(0)

            print('Doing analysis for  %s %s' % (gene, num_mutations))
            time = analysis_data['time']
            censor = analysis_data['censor']
            split = analysis_data['mutated']

            name = cancer_type + '_' + gene
            analysis.do_km(name, time, censor, split, outdir)
            cox_dict = analysis.do_cox(time, censor, split)
            if cox_dict['n'] != len(analysis_data['time']):
                print('ERROR')
            out.write(
                formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                    num_mutations, cox_dict['n']))
            analysis_data.to_csv(os.path.join(outdir, name + '_data.csv'),
                                 columns=['time', 'censor', 'mutated'])