예제 #1
0
def main():
    """Run purity-vs-survival Cox and Kaplan-Meier analyses per clinical file.

    The clinical directory, output directory and header file come from
    ``get_options()``. For each clinical file the purity column named in the
    header file is passed to ``analysis.do_cox``; the patients are then split
    at the mean purity (0 = at or below the mean, 1 = above) and passed to
    ``analysis.do_km``. The collected Cox results are written, one row per
    cancer type, to ``simple_purity_zscores.csv`` in the output directory.
    """
    clinical_dir, output_dir, header_file = get_options()
    # No header row; the first column becomes the index, so the remaining
    # columns are labelled with the integers 1, 2, ...
    headers = pd.read_csv(header_file, index_col=0, header=None)
    clinical_files = util.remove_extraneous_files(os.listdir(clinical_dir))

    zscore_data = {}
    for f in clinical_files:
        clinical_path = os.path.join(clinical_dir, f)
        cancer_type = util.get_cancer_type(f)
        # `.at` is the label-based scalar accessor; it replaces
        # DataFrame.get_value, which is deprecated and removed in pandas 1.0.
        purity_header = headers.at[cancer_type, 1]

        clinical = util.get_clinical_data(clinical_path,
                                          extra_rows=[purity_header])
        purity = clinical[purity_header]

        zscore_data[cancer_type] = analysis.do_cox(
            clinical.time, clinical.censor, purity)

        purity_mean = purity.mean()
        print(purity_mean)
        # NOTE(review): a NaN purity compares False against the mean and would
        # land in group 1 -- confirm util.get_clinical_data drops such rows.
        clinical['km_split'] = np.where(purity <= purity_mean, 0, 1)
        analysis.do_km(cancer_type, clinical.time, clinical.censor,
                       clinical.km_split, output_dir)

    out_df = pd.DataFrame(zscore_data).transpose()
    out_df.to_csv(os.path.join(output_dir, 'simple_purity_zscores.csv'))
def main():
    """Run purity-vs-survival analyses using purity from an extra data set.

    The clinical directory, output directory and extra-data directory come
    from ``get_options()``. For each clinical file, the matching extra data
    (from ``prep_extra_data``) is joined onto the clinical table and the
    ``Purity_InfiniumPurify`` column is analysed with ``analysis.do_cox``.
    Patients are then split at the mean purity (0 = at or below the mean,
    1 = above) for ``analysis.do_km``. The collected Cox results are written,
    one row per cancer type, to ``add_data_simple_purity_zscores.csv`` in the
    output directory.
    """
    clinical_dir, output_dir, extra_data_dir = get_options()
    clinical_files = util.remove_extraneous_files(os.listdir(clinical_dir))
    purity_header = 'Purity_InfiniumPurify'

    zscore_data = {}
    for f in clinical_files:
        clinical_path = os.path.join(clinical_dir, f)
        cancer_type = util.get_cancer_type(f)
        # COADREAD clinical files are paired with the 'COAD' extra data.
        extra_key = 'COAD' if cancer_type == 'COADREAD' else cancer_type
        extra_data = prep_extra_data(extra_data_dir, extra_key)

        clinical = util.get_clinical_data(clinical_path).join(extra_data)
        # The join above is a left join, so patients absent from the extra
        # data have NaN purity. NaN <= mean is False, which would silently put
        # them in the high-purity group below; drop them before any analysis.
        clinical = clinical.dropna(subset=[purity_header])
        purity = clinical[purity_header]

        zscore_data[cancer_type] = analysis.do_cox(
            clinical.time, clinical.censor, purity)

        purity_mean = purity.mean()
        print(purity_mean)
        clinical['km_split'] = np.where(purity <= purity_mean, 0, 1)
        analysis.do_km(cancer_type, clinical.time, clinical.censor,
                       clinical.km_split, output_dir)

    out_df = pd.DataFrame(zscore_data).transpose()
    out_df.to_csv(
        os.path.join(output_dir, 'add_data_simple_purity_zscores.csv'))
def calculate_cox(mutation, clinical_data, key, outdir):
    """Per-gene Cox and Kaplan-Meier analysis of high-VAF mutated patients.

    For every gene whose count of non-silently mutated patients is at least
    ``MUTATION_PERCENT * num_patients``, patients whose VAF is at or above the
    gene's median VAF are flagged ``mutated = 1`` and all other sequenced
    patients ``mutated = 0``. The flag is analysed with ``analysis.do_km`` and
    ``analysis.do_cox``.

    Outputs written to ``outdir``:
      * one ``<cancer_type>_mutation-fraction-..._vaf_cutoff-....zscores.out.csv``
        summary with a row per analysed gene, and
      * one ``<cancer_type>_<gene>_data.csv`` per analysed gene holding the
        ``time``, ``censor`` and ``mutated`` columns.

    Args:
        mutation: Path of the mutation file. The part of its basename before
            the first ``_`` or ``.`` is used as the cancer type.
        clinical_data: Forwarded to ``prep_data``.
        key: Forwarded to ``prep_data``.
        outdir: Directory that receives all output files.
    """
    df, clinical_data_with_sequenced_patients, num_patients = prep_data(
        mutation, clinical_data, key)

    # Prepare the output file.
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    print(cancer_type)
    outfile = os.path.join(
        outdir, (cancer_type + '_mutation-fraction-' + str(MUTATION_PERCENT) +
                 '_vaf_cutoff-' + str(VARIANT_ALLELE_FREQ_CUTOFF) +
                 '.zscores.out.csv'))
    # NOTE(review): the leading apostrophe is kept as-is; presumably it stops
    # spreadsheet tools from reinterpreting gene symbols -- confirm.
    formatstring = '\'{0}, {1}, {2}, {3}, {4}\n'
    min_mutated_patients = MUTATION_PERCENT * num_patients

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num mutations,num patients\n')

        # For every gene, combine the clinical data with the mutation data,
        # considering non-silent mutations only.
        for gene, gene_df in df.groupby(level=u'Hugo_Symbol'):
            # Blank out silent-mutation rows, then drop them.
            non_silent = gene_df.where(
                gene_df[u'Variant_Classification'] != 'Silent')
            non_silent = non_silent.dropna(subset=[u'Variant_Classification'])
            mutated_patient_list = non_silent.index.get_level_values(
                'identifier').unique()

            num_mutations = len(mutated_patient_list)
            if num_mutations < min_mutated_patients:
                continue

            # "Effectively mutated" patients: those whose VAF >= median.
            median_vaf = non_silent['VAF'].median()
            greater_than_median = non_silent[non_silent['VAF'] >= median_vaf]
            effectively_mutated_patients = (
                greater_than_median.index.get_level_values(
                    'identifier').unique())

            # Build the analysis frame: 1 for effectively mutated patients,
            # right-joined so every sequenced patient is present.
            analysis_data = pd.DataFrame(
                {'mutated': np.ones(len(effectively_mutated_patients))},
                index=effectively_mutated_patients)
            analysis_data = analysis_data.join(
                clinical_data_with_sequenced_patients, how='right')
            # Assign the result back instead of calling fillna(inplace=True) on
            # the column selection: that chained in-place form may operate on a
            # temporary copy and leave the frame unchanged.
            analysis_data['mutated'] = analysis_data['mutated'].fillna(0)

            # Two spaces after "for" reproduce the original print output.
            print('Doing analysis for  %s %s' % (gene, num_mutations))
            time = analysis_data['time']
            censor = analysis_data['censor']
            split = analysis_data['mutated']

            name = cancer_type + '_' + gene
            analysis.do_km(name, time, censor, split, outdir)
            cox_dict = analysis.do_cox(time, censor, split)
            # Flag the case where the Cox result's 'n' differs from the number
            # of rows handed to it.
            if cox_dict['n'] != len(time):
                print('ERROR')
            out.write(
                formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                    num_mutations, cox_dict['n']))
            analysis_data.to_csv(os.path.join(outdir, name + '_data.csv'),
                                 columns=['time', 'censor', 'mutated'])
예제 #4
0
def calculate_cox(mutation,
                  clinical,
                  outdir,
                  metagene_file=None,
                  make_km=False):
    """Per-gene Cox analysis of mutation data, optionally with a metagene.

    Clinical data (``util.get_clinical_data``) is inner-joined with the
    prepared mutation data (``mutation_base.prep_mutation_data``). Every
    column other than ``time`` and ``censor`` is treated as a gene; a gene is
    analysed when its column sum is at least
    ``MUTATION_PERCENT * num_patients``. One summary CSV with a row per
    analysed gene is written to ``outdir``.

    Args:
        mutation: Path of the mutation file. The part of its basename before
            the first ``_`` or ``.`` is used as the cancer type.
        clinical: Forwarded to ``util.get_clinical_data``.
        outdir: Directory that receives all output files.
        metagene_file: If given, metagene data is loaded from it and
            ``analysis.do_metagene_cox`` is used instead of
            ``analysis.do_cox``; the summary then carries the metagene
            z-score and p-value columns.
        make_km: If true (and no metagene file is given), also run
            ``analysis.do_km`` for each analysed gene and write a
            ``<cancer_type>_<gene>_data.csv`` file with the ``time``,
            ``censor`` and ``mutated`` columns.
    """
    clinical_data = util.get_clinical_data(clinical)
    df = mutation_base.prep_mutation_data(mutation, clinical_data)
    clinical_and_data = df.join(clinical_data, how='inner')
    num_patients = len(clinical_and_data)

    # Prepare the output file, row format and header line.
    cancer_type = os.path.basename(mutation).split('_')[0].split('.')[0]
    file_prefix = cancer_type + '_mutation-fraction-' + str(MUTATION_PERCENT)
    if metagene_file:
        formatstring = '{0}, {1}, {2}, {3}, {4}, {5}\n'
        header_line = ('gene,zscore,pvalue,metagene-zscore,metagene-pvalue,'
                       'num patients\n')
        outfile = os.path.join(outdir, file_prefix + '_metagene_zscores.csv')

        print("Processing metagene...")
        metagene = metagene_lib.get_metagene_data(metagene_file, cancer_type)
        print("Complete")
    else:
        formatstring = '{0}, {1}, {2}, {3}, {4}\n'
        header_line = 'gene,zscore,pvalue,num mutations,num patients\n'
        outfile = os.path.join(outdir, file_prefix + '.zscores.out.csv')

    min_mutated_patients = MUTATION_PERCENT * num_patients

    with open(outfile, 'w') as out:
        out.write(header_line)

        for gene in clinical_and_data:
            if gene in ('time', 'censor'):
                continue

            data = clinical_and_data[gene]
            num_mutations = int(data.sum())
            if num_mutations < min_mutated_patients:
                continue

            time = clinical_and_data['time']
            censor = clinical_and_data['censor']

            if metagene_file:
                cox_dict = analysis.do_metagene_cox(
                    time, censor, data, metagene)
                out.write(
                    formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                        cox_dict['metagene-z'],
                                        cox_dict['metagene-p'],
                                        cox_dict['n']))
                continue

            name = cancer_type + '_' + gene
            if make_km:
                analysis.do_km(name, time, censor, data, outdir)
                # Select the three columns with a list (a bare
                # 'time', 'censor', gene subscript is a single tuple key and
                # raises KeyError) and rename the gene column to 'mutated' so
                # the requested output columns actually exist.
                km_data = clinical_and_data[['time', 'censor', gene]].rename(
                    columns={gene: 'mutated'})
                km_data.to_csv(os.path.join(outdir, name + '_data.csv'),
                               columns=['time', 'censor', 'mutated'])

            cox_dict = analysis.do_cox(time, censor, data)
            out.write(
                formatstring.format(gene, cox_dict['z'], cox_dict['p'],
                                    num_mutations, cox_dict['n']))