def make_zscores(mutation, clinical, breaks, outdir):
    """Fit a per-gene Cox model on mutation status, adjusted for breaks.

    The mutation table produced by ``mutation_base.prep_mutation_data`` is
    inner-joined with the structural-break table read from ``breaks``. For
    every column of the joined frame other than ``time``, ``censor`` and
    ``breaks`` whose column sum reaches ``MUTATION_PERCENT`` (a module-level
    constant defined outside this function) times the number of joined rows,
    ``analysis.do_multivariate_cox`` is called with ``breaks`` as the only
    covariate. The returned dicts are collected, indexed by gene, and
    written to ``<outdir>/<cancer_type>_mut_cox.csv``.

    Args:
        mutation: Path to the mutation file; also used to derive the cancer
            type via ``util.get_cancer_type``.
        clinical: Passed to ``util.get_clinical_data``.
        breaks: Path to a CSV (first column is the index) whose values are
            cast to int; the joined frame must expose a ``breaks`` column.
        outdir: Directory that receives the output CSV.

    If no gene reaches the threshold, a CSV containing only the ``gene``
    index header is written instead of raising ``KeyError`` from
    ``set_index``.
    """
    clinical_data = util.get_clinical_data(clinical)
    mut = mutation_base.prep_mutation_data(mutation, clinical_data)

    cancer_type = util.get_cancer_type(mutation)
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(cancer_type)

    structural_breaks = pd.read_csv(breaks, index_col=0).astype(int)
    mut_and_breaks = mut.join(structural_breaks, how='inner')
    num_patients = len(mut_and_breaks)
    # Loop-invariant threshold, computed once.
    min_mutations = MUTATION_PERCENT * num_patients

    results = []
    for gene in mut_and_breaks:
        if gene in ('time', 'censor', 'breaks'):  # skip metadata
            continue
        num_mutations = mut_and_breaks[gene].sum()
        # Written as "not >=" so a NaN sum is still skipped, as before.
        if not num_mutations >= min_mutations:
            continue
        cox_dict = analysis.do_multivariate_cox(mut_and_breaks.time,
                                                mut_and_breaks.censor,
                                                mut_and_breaks[gene],
                                                mut_and_breaks[['breaks']])
        cox_dict['gene'] = gene
        results.append(cox_dict)

    if results:
        results_df = pd.DataFrame(results)
    else:
        # An empty list yields a frame with no 'gene' column, which would
        # make set_index raise; seed the column so the output is header-only.
        results_df = pd.DataFrame(columns=['gene'])
    results_df = results_df.set_index('gene')
    results_df.to_csv(os.path.join(outdir, cancer_type + '_mut_cox.csv'))
Example #2
0
def make_cn_zscores(copy_number, breaks, interesting_genes=None, outdir='.'):
    """Fit a per-gene Cox model on copy number, adjusted for breaks.

    The copy-number table (read with its first column as index, then
    transposed) is inner-joined with the table read from ``breaks``.

    When ``interesting_genes`` is given, no model is fitted: the rows whose
    ``Cancer Type`` equals the detected cancer type are selected, their
    ``Gene`` values are prefixed with an apostrophe and used (together with
    ``breaks``, ``time`` and ``censor``) as column labels to subset the
    joined frame, which is written to
    ``<outdir>/<cancer_type>_interesting_genes_data.csv``.

    Otherwise every column other than ``time``, ``censor`` and ``breaks``
    that has more than 10 non-null values is passed to
    ``analysis.do_multivariate_cox`` with ``breaks`` as the only covariate,
    and the collected results are written to
    ``<outdir>/<cancer_type>_cn_cox.csv``.

    Args:
        copy_number: Path to the copy-number CSV; also used to derive the
            cancer type via ``util.get_cancer_type``.
        breaks: Path to a CSV (first column is the index) that must provide
            the ``breaks``, ``time`` and ``censor`` columns used below.
        interesting_genes: Optional frame with ``Cancer Type`` and ``Gene``
            columns; switches the function to the export-only mode.
        outdir: Directory that receives the output CSV.

    If no column qualifies, a CSV containing only the ``gene`` index header
    is written instead of raising ``KeyError`` from ``set_index``.
    """
    clinical_data = pd.read_csv(breaks, index_col=0)
    cnv_by_patient = pd.read_csv(copy_number, index_col=0).transpose()
    clinical_and_cnv = cnv_by_patient.join(clinical_data, how='inner')

    cancer_type = util.get_cancer_type(copy_number)
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(cancer_type)

    if interesting_genes is not None:
        of_this_cancer = interesting_genes['Cancer Type'] == cancer_type
        relevant_genes = (
            '\'' + interesting_genes[of_this_cancer]['Gene']).values
        relevant_genes = list(relevant_genes) + ['breaks', 'time', 'censor']
        print(relevant_genes)
        clinical_and_cnv = clinical_and_cnv[relevant_genes]
        clinical_and_cnv.to_csv(
            os.path.join(outdir, cancer_type + '_interesting_genes_data.csv'))
        return

    results = []
    for gene in clinical_and_cnv:
        # 'breaks' is the covariate of every model, so it is metadata too:
        # fitting it against itself would only add a meaningless row.
        if gene in ('time', 'censor', 'breaks'):
            continue
        if clinical_and_cnv[gene].count() > 10:
            cox_dict = analysis.do_multivariate_cox(
                clinical_and_cnv.time, clinical_and_cnv.censor,
                clinical_and_cnv[gene], clinical_and_cnv[['breaks']])
            cox_dict['gene'] = gene
            results.append(cox_dict)

    if results:
        results_df = pd.DataFrame(results)
    else:
        # An empty list yields a frame with no 'gene' column, which would
        # make set_index raise; seed the column so the output is header-only.
        results_df = pd.DataFrame(columns=['gene'])
    results_df = results_df.set_index('gene')
    results_df.to_csv(os.path.join(outdir, cancer_type + '_cn_cox.csv'))
Example #3
0
def do_cox_models(clinical, cn_file, outdir):
    """Fit Cox models for MYC copy number and write them to a CSV.

    The tab-separated copy-number file is read with its first column as
    index, transposed, stripped of the ``Entrez_Gene_Id`` row and reduced to
    the ``MYC`` column, then inner-joined with ``clinical``. For each entry
    of ``analyses`` a model is fitted through
    ``analysis.do_multivariate_cox`` with the listed covariates, and the
    transposed result table is written to ``<outdir>/breast_analysis.csv``.

    Args:
        clinical: Frame joined onto the copy-number data; it must provide
            the ``Time`` and ``Censor`` columns and the covariate columns
            named in ``analyses`` (``age_r`` is a module-level name defined
            outside this function).
        cn_file: Path to the tab-separated copy-number file.
        outdir: Directory that receives ``breast_analysis.csv``.
    """
    cn = pd.read_csv(cn_file, sep='\t', index_col=0)
    cn_by_patient = cn.transpose().drop(['Entrez_Gene_Id'])
    cn = cn_by_patient[['MYC']]

    data = cn.join(clinical, how='inner')

    analyses = {
        'CNA only': [age_r, 'her2_0', 'combined_er_pr', 'stage_0', 'stage_1'],
    }

    # Collect one dict per model and build the frame once at the end:
    # DataFrame.append copied the whole frame on every call and is no
    # longer available in pandas 2.x.
    rows = []
    for g in cn:
        for name, a in analyses.items():
            cox_dict = analysis.do_multivariate_cox(data.Time,
                                                    data.Censor,
                                                    data[g],
                                                    data[a],
                                                    float_vars=True)
            cox_dict['gene'] = name + ' ' + g
            rows.append(cox_dict)

    results = pd.DataFrame(rows).set_index('gene')
    results.T.to_csv(os.path.join(outdir, 'breast_analysis.csv'))
Example #4
0
def do_cox_models(clinical, cn_file, mut_file, outdir):
    """Fit Cox models for MYC copy number and TP53 mutation status.

    The copy-number CSV is transposed, stripped of the ``Chromosome`` and
    ``Location`` rows, relabelled with the values of its ``Symbol`` row and
    reduced to the ``'MYC`` column. The ``'TP53`` column of the frame
    returned by ``mutation_base.prep_mutation_data`` is renamed to ``TP53``.
    Both are inner-joined with ``clinical``.

    One model per entry of ``analyses`` is fitted for the copy-number
    column, followed by one model with ``TP53`` as the variable and the
    'CNA only' covariates. The transposed result table is written to
    ``<outdir>/breast_analysis.csv``.

    Args:
        clinical: Frame joined onto the copy-number data and also handed to
            ``prep_mutation_data``; the joined frame must provide ``time``,
            ``censor`` and the covariate columns named in ``analyses``
            (``age_r`` is a module-level name defined outside this
            function).
        cn_file: Path to the copy-number CSV.
        mut_file: Path handed to ``mutation_base.prep_mutation_data``.
        outdir: Directory that receives ``breast_analysis.csv``.
    """
    cn = pd.read_csv(cn_file)
    cn_by_patient = cn.transpose().drop(['Chromosome', 'Location'])
    cn_by_patient.columns = cn_by_patient.loc['Symbol']
    cn = cn_by_patient[['\'MYC']]

    mut = mutation_base.prep_mutation_data(mut_file, clinical)
    p53_mut = mut[['\'TP53']]
    p53_mut.columns = ['TP53']

    data = cn.join(clinical, how='inner')
    data = data.join(p53_mut, how='inner')

    analyses = {
        'CNA only': [age_r, 'her2_0', 'combined_er_pr', 'stage_0', 'stage_1'],
        'CNA + P53':
        ['TP53', age_r, 'her2_0', 'combined_er_pr', 'stage_0', 'stage_1']
    }

    # Collect one dict per model and build the frame once at the end:
    # DataFrame.append copied the whole frame on every call and is no
    # longer available in pandas 2.x.
    rows = []
    for g in cn:
        for name, a in analyses.items():
            cox_dict = analysis.do_multivariate_cox(data.time,
                                                    data.censor,
                                                    data[g],
                                                    data[a],
                                                    float_vars=True)
            cox_dict['gene'] = name + ' ' + g
            rows.append(cox_dict)

    # TP53 mutation as the variable of interest, same clinical covariates.
    cox_dict = analysis.do_multivariate_cox(data.time,
                                            data.censor,
                                            data['TP53'],
                                            data[analyses['CNA only']],
                                            float_vars=True)
    cox_dict['gene'] = 'TP53 mut'
    rows.append(cox_dict)

    results = pd.DataFrame(rows).set_index('gene')
    results.T.to_csv(os.path.join(outdir, 'breast_analysis.csv'))
def calculate_cox(data, gene):
    """Fit a Cox model for ``gene`` with TP53 mutation as the covariate.

    Calls ``analysis.do_multivariate_cox`` with ``data.time``,
    ``data.censor``, the ``gene`` column and the ``TP53_mutation`` column
    (``float_vars=True``) and returns its result. If R raises a runtime
    error, a warning is printed and an empty dict is returned instead.
    """
    try:
        fitted = analysis.do_multivariate_cox(data.time,
                                              data.censor,
                                              data[gene],
                                              data[['TP53_mutation']],
                                              float_vars=True)
    except rpy2.rinterface.RRuntimeError:
        # Space-joined, matching what the three-item print statement emitted.
        print('%s %s %s' % ('WARN: skipped', gene, 'due to R error'))
        return {}
    return fitted
def calculate_cox(mutation, clinical_data, tumor_stage_file, outdir):
    """Fit per-gene Cox models on mutation status with tumor-stage covariates.

    The frame from ``prep_data`` is inner-joined with ``clinical_data`` and
    with the columns of ``tumor_stage_file`` whose name contains ``group``.
    Every remaining column whose sum reaches ``MUTATION_PERCENT`` times the
    number of joined rows is modelled through
    ``analysis.do_multivariate_cox``; one line per gene is written to
    ``<outdir>/<cancer_type>_mutation-fraction-<MUTATION_PERCENT>.zscores.out.csv``
    and the per-gene input data to ``<outdir>/<gene minus first char>_data.csv``.

    Args:
        mutation: Path to the mutation file; its basename up to the first
            ``_`` or ``.`` is used as the cancer type.
        clinical_data: Frame handed to ``prep_data`` and joined onto its
            result.
        tumor_stage_file: CSV path, read with the first column as index.
        outdir: Directory that receives all output files.
    """
    df = prep_data(mutation, clinical_data).join(clinical_data, how='inner')

    stage_table = pd.read_csv(tumor_stage_file, index_col=0)
    tumor_stage_cols = [col for col in stage_table if 'group' in col]
    df = df.join(stage_table[tumor_stage_cols], how='inner')
    num_patients = len(df.index)

    # Output file name is derived from the mutation file name.
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    outfile_name = (cancer_type + '_mutation-fraction-' +
                    str(MUTATION_PERCENT) + '.zscores.out.csv')
    outfile = os.path.join(outdir, outfile_name)
    header, formatstring = tumor_stage.tumor_stage_output_header_and_format(
        5, tumor_stage_cols)

    # Loop invariants: columns that are not genes, and the cut-off.
    non_gene_cols = ['time', 'censor'] + tumor_stage_cols
    min_mutations = MUTATION_PERCENT * num_patients
    exported_cols = ['time', 'censor', 'mutated'] + tumor_stage_cols

    with open(outfile, 'w') as out:
        out.writelines(['gene,zscore,pvalue,num mutations,num patients',
                        header,
                        '\n'])

        for gene in df:
            if gene in non_gene_cols:
                continue
            num_mutations = df[gene].sum()
            # "not >=" rather than "<" so a NaN sum is skipped as well.
            if not num_mutations >= min_mutations:
                continue

            analysis_data = pd.DataFrame()
            analysis_data['time'] = df['time']
            analysis_data['censor'] = df['censor']
            analysis_data['mutated'] = df[gene].fillna(0)
            analysis_data[tumor_stage_cols] = df[tumor_stage_cols]

            cox_dict = analysis.do_multivariate_cox(
                analysis_data['time'],
                analysis_data['censor'],
                analysis_data['mutated'],
                analysis_data[tumor_stage_cols])
            tumor_stage_zscores = tumor_stage.zscores_for_tumor_stage_cols(
                cox_dict, tumor_stage_cols)

            line = formatstring.format(gene,
                                       cox_dict['var-z'],
                                       cox_dict['var-p'],
                                       num_mutations,
                                       cox_dict['var-n'],
                                       *tumor_stage_zscores)
            out.write(line)

            # gene[1:] drops the first character of the column label.
            data_path = os.path.join(outdir, gene[1:] + '_data.csv')
            analysis_data.to_csv(data_path, columns=exported_cols)
def make_zscores(copy_number, clinical_data, tumor_stage_data_dir, outdir):
    """Fit per-gene Cox models on copy number with tumor-stage covariates.

    The copy-number CSV is transposed, relabelled with the values of its
    ``Symbol`` row, and inner-joined with ``clinical_data`` and with the
    tumor-stage columns from ``tumor_stage.prep_tumor_stage_data``. Every
    other column with more than 10 non-null values is modelled through
    ``analysis.do_multivariate_cox``; one line per gene is written to
    ``<outdir>/<cancer_type>_extra_clinical_zscores.csv``.

    Returns without writing anything when ``prep_tumor_stage_data`` yields
    ``None`` as the tumor-stage data.

    Args:
        copy_number: Path to the copy-number CSV; also used to derive the
            cancer type via ``util.get_cancer_type``.
        clinical_data: Frame joined onto the copy-number data; the joined
            frame must provide ``time`` and ``censor``.
        tumor_stage_data_dir: Passed to ``tumor_stage.prep_tumor_stage_data``.
        outdir: Directory that receives the output CSV.
    """
    cancer_type = util.get_cancer_type(copy_number)

    by_patient = pd.read_csv(copy_number).transpose()
    by_patient.columns = by_patient.loc['Symbol']
    clinical_and_cnv = by_patient.join(clinical_data, how='inner')

    tumor_stage_data, tumor_stage_cols = tumor_stage.prep_tumor_stage_data(
        tumor_stage_data_dir, cancer_type)
    if tumor_stage_data is None:
        return

    merged = clinical_and_cnv.join(tumor_stage_data[tumor_stage_cols],
                                   how='inner')

    outfile = os.path.join(outdir, cancer_type + '_extra_clinical_zscores.csv')
    header, formatstring = tumor_stage.tumor_stage_output_header_and_format(
        4, tumor_stage_cols)

    # Columns that are not genes; built once instead of on every iteration.
    non_gene_cols = ['time', 'censor'] + tumor_stage_cols

    with open(outfile, 'w') as out:
        out.writelines(['gene,zscore,pvalue,num patients', header, '\n'])

        for gene in merged:
            if gene in non_gene_cols:
                continue
            if not merged[gene].count() > 10:
                continue

            cox_dict = analysis.do_multivariate_cox(merged.time,
                                                    merged.censor,
                                                    merged[gene],
                                                    merged[tumor_stage_cols])
            group_zscores = tumor_stage.zscores_for_tumor_stage_cols(
                cox_dict, tumor_stage_cols)

            line = formatstring.format(gene,
                                       cox_dict['var-z'],
                                       cox_dict['var-p'],
                                       cox_dict['var-n'],
                                       *group_zscores)
            out.write(line)