def main():
    """Run a purity Cox regression and a mean-split KM analysis per cancer type.

    Every clinical file in the clinical directory is joined with the extra
    data returned by ``prep_extra_data``; the ``Purity_InfiniumPurify`` column
    is handed to ``analysis.do_cox`` and a 0/1 split around its mean is handed
    to ``analysis.do_km``. The Cox results of all cancer types are written to
    ``add_data_simple_purity_zscores.csv`` in the output directory.
    """
    clinical_dir, output_dir, extra_data_dir = get_options()
    purity_header = 'Purity_InfiniumPurify'
    file_names = util.remove_extraneous_files(os.listdir(clinical_dir))

    cox_by_cancer_type = {}
    for file_name in file_names:
        cancer_type = util.get_cancer_type(file_name)
        # COADREAD clinical data is paired with the COAD extra data.
        extra_type = 'COAD' if cancer_type == 'COADREAD' else cancer_type
        extra_data = prep_extra_data(extra_data_dir, extra_type)

        clinical = util.get_clinical_data(
            os.path.join(clinical_dir, file_name))
        clinical = clinical.join(extra_data)

        cox_by_cancer_type[cancer_type] = analysis.do_cox(
            clinical.time, clinical.censor, clinical[purity_header])

        purity_mean = clinical[purity_header].mean()
        print(purity_mean)
        # 0 = purity at or below the cohort mean, 1 = above it.
        at_or_below_mean = clinical[purity_header] <= purity_mean
        clinical['km_split'] = np.where(at_or_below_mean, 0, 1)
        analysis.do_km(cancer_type, clinical.time, clinical.censor,
                       clinical.km_split, output_dir)

    out_path = os.path.join(output_dir, 'add_data_simple_purity_zscores.csv')
    pd.DataFrame(cox_by_cancer_type).transpose().to_csv(out_path)
def make_zscores(mutation, clinical, breaks, outdir):
    """Write per-gene multivariate Cox results with ``breaks`` as a covariate.

    Args:
        mutation: Path to the mutation file; also used to derive the cancer
            type.
        clinical: Path to the clinical file read by
            ``util.get_clinical_data``.
        breaks: Path to a CSV whose first column is the index and whose
            values are cast to ``int``; it must provide a ``breaks`` column.
        outdir: Directory that receives ``<cancer_type>_mut_cox.csv``.
    """
    clinical_data = util.get_clinical_data(clinical)
    mut = mutation_base.prep_mutation_data(mutation, clinical_data)

    cancer_type = util.get_cancer_type(mutation)
    print(cancer_type)

    structural_breaks = pd.read_csv(breaks, index_col=0).astype(int)
    mut_and_breaks = mut.join(structural_breaks, how='inner')
    num_patients = len(mut_and_breaks)

    results = []
    for gene in mut_and_breaks:
        if gene in ('time', 'censor', 'breaks'):  # skip metadata
            continue
        num_mutations = mut_and_breaks[gene].sum()
        # Only genes mutated in at least MUTATION_PERCENT of patients are run.
        if num_mutations >= MUTATION_PERCENT * num_patients:
            cox_dict = analysis.do_multivariate_cox(mut_and_breaks.time,
                                                    mut_and_breaks.censor,
                                                    mut_and_breaks[gene],
                                                    mut_and_breaks[['breaks']])
            cox_dict['gene'] = gene
            results.append(cox_dict)

    results_df = pd.DataFrame(results)
    if results_df.empty:
        # No gene passed the threshold: give the frame a ``gene`` column so
        # set_index() below does not raise KeyError and an empty file with
        # the expected index header is still written.
        results_df = pd.DataFrame(columns=['gene'])
    results_df = results_df.set_index('gene')
    results_df.to_csv(os.path.join(outdir, cancer_type + '_mut_cox.csv'))
def main(argv=None):
    """Fan out ``multiprocess_zscores`` over cancer types that have stage data.

    For every clinical file, the first mutation file whose name contains the
    cancer type is paired with ``<cancer_type>_clinical.csv`` from the tumor
    stage directory. Cancer types without that stage file are skipped. A
    per-cancer-type output directory is created when missing.

    Args:
        argv: Unused; options are read by ``get_options()``.
    """
    mutation_dir, clinical_dir, outdir, tumor_stage_dir = get_options()

    clinical_files = [
        os.path.join(clinical_dir, name)
        for name in util.remove_extraneous_files(os.listdir(clinical_dir))
    ]
    pool = Pool(16)

    jobs = []
    for clinical in clinical_files:
        cancer_type = util.get_cancer_type(clinical)
        print(cancer_type)
        mutation_pattern = os.path.join(mutation_dir,
                                        '*' + cancer_type + '*')
        mutation = glob.glob(mutation_pattern)[0]

        tumor_stage = os.path.join(tumor_stage_dir,
                                   cancer_type + '_clinical.csv')
        if os.path.isfile(tumor_stage):
            clinical_data = util.get_clinical_data(clinical)
            cancer_type_outdir = os.path.join(outdir, cancer_type)
            if not os.path.isdir(cancer_type_outdir):
                os.makedirs(cancer_type_outdir)
            jobs.append(
                (mutation, clinical_data, tumor_stage, cancer_type_outdir))

    pool.map(multiprocess_zscores, jobs)
def main(argv=None):
    """Run ``calculate_cox`` for every cancer type and write ``pancan.csv``.

    For each clinical file, the first mutation file whose name contains the
    cancer type is passed to ``calculate_cox`` together with the loaded
    clinical data. The per-cancer-type results are collected into one table
    (one row per cancer type) and written to ``pancan.csv`` in ``outdir``.

    Args:
        argv: Unused; options are read by ``get_options()``.
    """
    mutation_dir, clinical_dir, outdir = get_options()

    clinical_files = [
        os.path.join(clinical_dir, name)
        for name in util.remove_extraneous_files(os.listdir(clinical_dir))
    ]

    # The work runs serially, so no worker pool is created here; the
    # previous version spawned a Pool(1) that was never used or closed.
    pancan = {}
    for clinical in clinical_files:
        cancer_type = util.get_cancer_type(clinical)
        print(cancer_type)
        mutation = glob.glob(
            os.path.join(mutation_dir, '*' + cancer_type + '*'))[0]

        clinical_data = util.get_clinical_data(clinical)
        pancan[cancer_type] = calculate_cox(mutation, clinical_data, outdir)

    pancan_df = pd.DataFrame(pancan).transpose()
    pancan_df.to_csv(os.path.join(outdir, 'pancan.csv'))
def make_cn_zscores(copy_number, clinical, interesting_genes=None, outdir='.'):
    """Trichotomize copy-number values per gene and Cox-test each split.

    Args:
        copy_number: Path to a copy-number CSV whose first column is the
            index; the table is transposed before use. Also used to derive
            the cancer type.
        clinical: Path to the clinical file read by
            ``util.get_clinical_data``.
        interesting_genes: Optional object whose ``.index`` holds gene names
            without the leading ``'`` used by the copy-number columns. Only
            those genes are analysed. When ``None`` every column of the
            copy-number table is analysed (previously ``None`` crashed).
        outdir: Directory that receives ``<cancer_type>_trichotomized.csv``.

    Returns:
        A list with one ``analysis.do_cox`` result per analysed gene, each
        extended with ``gene`` and ``cancer_type`` entries.
    """
    clinical_data = util.get_clinical_data(clinical)
    cnv_by_patient = pd.read_csv(copy_number, index_col=0).transpose()

    cancer_type = util.get_cancer_type(copy_number)

    if interesting_genes is not None:
        # Copy-number column names carry a leading single quote.
        relevant_genes = list('\'' + interesting_genes.index)
        cnv_by_patient = cnv_by_patient[relevant_genes]

    cnv = cnv_by_patient.join(clinical_data, how='inner')

    results = []
    # Snapshot the columns: ``*_split`` columns are added inside the loop.
    for gene in list(cnv.columns):
        if gene in ('time', 'censor'):  # skip metadata
            continue
        if cnv[gene].count() > 10:
            split_col = gene + '_split'
            cnv[split_col] = np.nan
            # Series.between() is inclusive on both ends, so the neutral band
            # is assigned first and the tails afterwards. This keeps the
            # boundaries symmetric: exactly -0.3 -> -1 and exactly 0.3 -> 1.
            cnv.loc[cnv[gene].between(-0.3, 0.3), split_col] = 0
            cnv.loc[cnv[gene] <= -0.3, split_col] = -1
            cnv.loc[cnv[gene] >= 0.3, split_col] = 1

            cox_dict = analysis.do_cox(cnv.time, cnv.censor, cnv[split_col])
            cox_dict['gene'] = gene
            cox_dict['cancer_type'] = cancer_type
            results.append(cox_dict)
    cnv.to_csv(os.path.join(outdir, cancer_type + '_trichotomized.csv'))
    return results
Esempio n. 6
0
def make_clinical_data(clinical_file, histologic_subtype_col, outdir):
    """Load clinical data with a subtype column and save per-subtype files.

    Args:
        clinical_file: Path to the clinical file; also used to derive the
            cancer type.
        histologic_subtype_col: Name of the extra, non-numeric clinical row
            that holds the histologic subtype.
        outdir: Output directory forwarded to ``save_subtype_files``.

    Returns:
        Whatever ``save_subtype_files`` returns.
    """
    cancer_type = util.get_cancer_type(clinical_file)
    extra_rows = [histologic_subtype_col]
    clinical_with_subtype = util.get_clinical_data(clinical_file,
                                                   extra_rows=extra_rows,
                                                   extra_rows_numeric=False)
    return save_subtype_files(clinical_with_subtype, histologic_subtype_col,
                              cancer_type, outdir)
def make_zscores(copy_number, mutation, clinical, outdir, genes):
    """Run ``calculate_cox`` per gene on copy number plus TP53 mutation status.

    Args:
        copy_number: Path to a copy-number CSV (first column is the index);
            also used to derive the cancer type.
        mutation: Path to the mutation file passed to
            ``mutation_base.prep_mutation_data``.
        clinical: Path to the clinical file.
        outdir: Directory that receives one
            ``<cancer_type>_<gene>_p53_and_cna_data.csv`` per gene.
        genes: Table with a ``Gene`` column; names carry one leading
            character that is stripped for file names and result keys.

    Returns:
        dict mapping the stripped gene name to its ``calculate_cox`` result,
        extended with a ``mutation_count`` entry.
    """
    cancer_type = util.get_cancer_type(copy_number)

    clinical_data = util.get_clinical_data(clinical)
    cnv = pd.read_csv(copy_number, index_col=0)

    mutation = mutation_base.prep_mutation_data(mutation, clinical_data)
    p53_mutation = mutation['\'TP53'].rename('TP53_mutation')

    # Patients x genes, restricted to patients that also have clinical data
    # and a TP53 mutation entry.
    merged = (cnv.transpose()
              .join(clinical_data, how='inner')
              .join(p53_mutation, how='inner'))

    cox_dicts = {}
    for gene in genes['Gene']:
        per_gene = merged[[gene, 'TP53_mutation', 'time', 'censor']]
        cox_dict = calculate_cox(per_gene, gene)
        cox_dict['mutation_count'] = per_gene['TP53_mutation'].sum()

        short_name = gene[1:]
        file_name = cancer_type + '_' + short_name + '_p53_and_cna_data.csv'
        per_gene.to_csv(os.path.join(outdir, file_name))
        cox_dicts[short_name] = cox_dict
    return cox_dicts
Esempio n. 8
0
def make_clinical_data(clinical_file, tumor_group_file, outdir, grade):
    """Write a clinical CSV with cumulative tumor-group indicator columns.

    Args:
        clinical_file: Path to the clinical file; also used to derive the
            cancer type.
        tumor_group_file: CSV in which each non-empty row lists the labels
            that make up one group.
        outdir: Directory that receives ``<cancer_type>_clinical.csv``.
        grade: When truthy, group on ``tumor_stage_util.TUMOR_GRADE`` and do
            nothing for cancer types outside ``TUMOR_GRADE_TYPES``; otherwise
            group on the cancer type's ``tumor_stage_util.TUMOR_STAGE`` entry.

    Returns:
        None. Nothing is written when the selected row name is falsy.
    """
    cancer_type = util.get_cancer_type(clinical_file)
    if grade:
        row = tumor_stage_util.TUMOR_GRADE
        if cancer_type not in tumor_stage_util.TUMOR_GRADE_TYPES:
            return
    else:
        row = tumor_stage_util.TUMOR_STAGE[cancer_type]
    if not row:
        return

    tumor_groups = pd.read_csv(tumor_group_file).dropna(how='all')

    clinical = util.get_clinical_data(clinical_file,
                                      extra_rows=[row],
                                      extra_rows_numeric=False)
    clinical[row] = clinical[row].str.strip()

    included_stages = []
    for i, group in tumor_groups.iterrows():
        labels = group.dropna().values
        if len(labels) == 0:
            continue
        in_this_group = clinical[row].isin(labels)
        print('%s %s' % (', '.join(labels) + ': ',
                         clinical[in_this_group][row].count()))
        included_stages.extend(labels)
        # 0 for patients in this or any earlier group, 1 for everyone else.
        clinical['group_' + str(i)] = np.where(
            clinical[row].isin(included_stages), 0, 1)

    # The indicator of the last group is dropped, then only patients whose
    # label appears in some group are kept.
    last_group_col = 'group_' + str(i)
    clinical = clinical.drop(last_group_col, axis=1)
    clinical = clinical[clinical[row].isin(included_stages)]
    out_path = os.path.join(outdir, cancer_type + '_clinical.csv')
    clinical.to_csv(out_path, index_label='patient_id')
def make_clinical_data(clinical_file, clinical_variables, outdir):
    """Build the SKCM clinical table with grouped Clark level and gender.

    The grouped table is written to ``<cancer_type>_clinical.csv`` before age
    and Breslow depth are converted to numbers, so the saved file does not
    contain the numeric conversion or the ``breslow_0`` column.

    Args:
        clinical_file: Path to the clinical file.
        clinical_variables: Directory holding the ``SKCM_*.csv`` group files.
        outdir: Directory that receives the clinical CSV.

    Returns:
        The clinical table restricted to rows with a numeric Breslow value,
        with ``breslow_0`` set to 0 where the value is <= 1 and 1 otherwise.
    """
    # NOTE(review): ``cancer_type`` and the ``*_r`` row names are not defined
    # in this function; presumably module-level names - confirm.
    extra_rows = [age_r, breslow_r, gender_r, ulceration_r, stage_r, clark_r]
    clinical = util.get_clinical_data(clinical_file,
                                      extra_rows=extra_rows,
                                      extra_rows_numeric=False)

    def read_groups(file_name):
        return pd.read_csv(os.path.join(clinical_variables, file_name),
                           dtype=str)

    gender_groups = read_groups('SKCM_gender.csv')
    # The stage and ulceration group files are read but not used below.
    stage_groups = read_groups('SKCM_stage.csv')
    clark_groups = read_groups('SKCM_clark.csv')
    ulceration_groups = read_groups('SKCM_ulceration.csv')

    for row_name, prefix, groups in ((clark_r, 'clark', clark_groups),
                                     (gender_r, 'gender', gender_groups)):
        clinical = tumor_stage_util.group_discontinuous_vars(
            row_name, prefix, groups, clinical)

    out_path = os.path.join(outdir, cancer_type + '_clinical.csv')
    clinical.to_csv(out_path, index_label='patient_id')

    for numeric_row in (age_r, breslow_r):
        clinical[numeric_row] = pd.to_numeric(clinical[numeric_row],
                                              errors='coerce')
    clinical = clinical.dropna(subset=[breslow_r])
    clinical['breslow_0'] = np.where(clinical[breslow_r] <= 1, 0, 1)
    return clinical
def make_corrs(copy_number, rnaseq, mutation, clinical, outdir, genes):
  """Correlate copy number of selected genes with log2 RNA-seq expression.

  Args:
    copy_number: Path to a copy-number CSV (first column is the index); also
      used to derive the cancer type.
    rnaseq: Path to a tab-separated RNA-seq file with a
      ``Hybridization REF`` column; its first data row is discarded.
    mutation: Path to the mutation file; only the patient index of the
      prepared mutation data is used, to restrict the patient set.
    clinical: Path to the clinical file.
    outdir: Directory that receives ``<cancer_type>_cnv_rnaseq_data.csv``.
    genes: Table with a ``Gene`` column naming the copy-number columns.

  Returns:
    DataFrame with one ``<cancer_type>_<gene>`` column per gene, holding the
    correlation of every RNA-seq column with that gene's copy number.
  """
  cancer_type = util.get_cancer_type(copy_number)

  clinical_data = util.get_clinical_data(clinical)
  cnv_by_patient = pd.read_csv(copy_number, index_col=0).transpose()

  rnaseq = pd.read_csv(rnaseq, low_memory=False, sep='\t')
  rnaseq = rnaseq.drop([0])
  # ``float`` replaces the removed ``np.float`` alias; they were the same type.
  rnaseq = rnaseq.set_index('Hybridization REF').astype(float)
  rnaseq = rnaseq.transpose().reset_index()
  rnaseq = util.maybe_clear_non_01s(rnaseq, 'index', cancer_type)
  rnaseq = util.add_identifier_column(rnaseq, 'index')
  # ``axis`` is passed by keyword: newer pandas rejects the positional form.
  rnaseq_clean = rnaseq.set_index('identifier').drop('index', axis=1)
  rnaseq_log2 = rnaseq_clean.astype(float).apply(np.log2)
  # log2(0) is -inf; clip so that expression values never go below 0.
  rnaseq_clipped_log2 = np.clip(rnaseq_log2, 0, np.inf)
  rna_cnv = cnv_by_patient[genes['Gene']].join(rnaseq_clipped_log2,
                                               how='inner')

  mutation = mutation_base.prep_mutation_data(mutation, clinical_data)
  print(mutation.index)

  # A sorted list gives a deterministic row order; newer pandas rejects a
  # set as a .loc indexer.
  # NOTE(review): assumes patient identifiers are mutually orderable
  # (e.g. all strings) - confirm.
  included_patients = sorted(set(mutation.index) & set(rna_cnv.index))
  rna_cnv = rna_cnv.loc[included_patients]

  rna_cnv.T.to_csv(os.path.join(outdir, cancer_type + '_cnv_rnaseq_data.csv'))

  corr_dict = {}
  for gene in genes['Gene']:
    # Drop the copy-number columns so only RNA-seq correlations remain.
    corr = rna_cnv.corrwith(rna_cnv[gene]).drop(genes['Gene'])
    corr_dict[cancer_type + '_' + gene] = corr

  return pd.DataFrame(corr_dict)
Esempio n. 11
0
def main():
    """Cox-test per-patient copy-number break counts for every cancer type.

    For each copy-number file, ``count_breaks`` output is keyed by patient
    identifier, joined with the matching clinical data and written to
    ``<cancer_type>_breaks.csv``. The ``breaks`` column is then passed to
    ``analysis.do_cox`` and all results are written to ``cox_results.csv``.
    """
    copy_number_loc, clinical, outdir = get_options()
    cnas = util.remove_extraneous_files(os.listdir(copy_number_loc))

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append copied the frame on every call and no longer exists
    # in pandas 2.x.
    results = []
    for c in cnas:
        cancer_type = util.get_cancer_type(c)
        print(cancer_type)

        clinical_file = glob.glob(
            os.path.join(clinical, '*' + cancer_type + '*.txt'))[0]
        clin = util.get_clinical_data(clinical_file)

        patient_breaks = count_breaks(os.path.join(copy_number_loc, c))
        patient_breaks = patient_breaks.reset_index()
        patient_breaks = util.maybe_clear_non_01s(patient_breaks, 'Sample',
                                                  cancer_type)
        patient_breaks = util.add_identifier_column(patient_breaks, 'Sample')
        patient_breaks = patient_breaks.set_index('identifier')
        patient_breaks = patient_breaks.drop('Sample', axis=1)

        breaks_and_clin = patient_breaks.join(clin, how='inner')
        breaks_and_clin.to_csv(
            os.path.join(outdir, cancer_type + '_breaks.csv'))
        cox = analysis.do_cox(breaks_and_clin.time, breaks_and_clin.censor,
                              breaks_and_clin.breaks)
        cox['cancer_type'] = cancer_type
        results.append(cox)

    pd.DataFrame(results).to_csv(os.path.join(outdir, 'cox_results.csv'))
Esempio n. 12
0
def make_clinical_data(clinical_file, clinical_variables, outdir):
    """Build the BRCA clinical table with grouped stage, ER, PR and HER2.

    The grouped table, including ``combined_er_pr``, is written to
    ``<cancer_type>_clinical.csv`` before the age row is converted to
    numbers, so the saved file holds the unconverted age values.

    Args:
        clinical_file: Path to the clinical file.
        clinical_variables: Directory holding the ``BRCA_*.csv`` group files.
        outdir: Directory that receives the clinical CSV.

    Returns:
        The grouped clinical table with a numeric age row.
    """
    # NOTE(review): ``cancer_type`` and the ``*_r`` row names are not defined
    # in this function; presumably module-level names - confirm.
    clinical = util.get_clinical_data(
        clinical_file,
        extra_rows=[age_r, er_r, pr_r, her2_r, stage_r],
        extra_rows_numeric=False)

    group_specs = [
        (stage_r, 'stage', 'BRCA_stage.csv'),
        (er_r, 'er', 'BRCA_er.csv'),
        (pr_r, 'pr', 'BRCA_pr.csv'),
        (her2_r, 'her2', 'BRCA_her2.csv'),
    ]
    # All group files are read before any grouping is applied.
    group_tables = [
        pd.read_csv(os.path.join(clinical_variables, file_name), dtype=str)
        for _, _, file_name in group_specs
    ]
    for (row_name, prefix, _), groups in zip(group_specs, group_tables):
        clinical = tumor_stage_util.group_discontinuous_vars(
            row_name, prefix, groups, clinical)

    both_set = clinical['er_0'] & clinical['pr_0']
    clinical['combined_er_pr'] = np.where(both_set, 1, 0)

    out_path = os.path.join(outdir, cancer_type + '_clinical.csv')
    clinical.to_csv(out_path, index_label='patient_id')
    clinical[age_r] = pd.to_numeric(clinical[age_r], errors='coerce')
    return clinical
def main(argv=None):
  """Fan out ``multiprocess_zscores`` over every cancer type.

  Each job pairs the first mutation file whose name contains the cancer type
  with the loaded clinical data and a per-cancer-type output directory
  (created when missing). When a univariate output directory is configured,
  the matching ``<cancer_type>.zscores.out.csv`` is added to the job;
  otherwise ``None`` is passed.

  Args:
    argv: Unused; options are read by ``get_options()``.
  """
  mutation_dir, clinical_dir, outdir, univariate_output = get_options()

  clinical_files = [
      os.path.join(clinical_dir, name)
      for name in util.remove_extraneous_files(os.listdir(clinical_dir))
  ]
  pool = Pool(16)

  jobs = []
  for clinical in clinical_files:
    cancer_type = util.get_cancer_type(clinical)
    print(cancer_type)
    mutation_pattern = os.path.join(mutation_dir, '*' + cancer_type + '*')
    mutation = glob.glob(mutation_pattern)[0]

    univariate_file = None
    if univariate_output:
      univariate_pattern = os.path.join(univariate_output, cancer_type,
                                        cancer_type + '.zscores.out.csv')
      univariate_file = glob.glob(univariate_pattern)[0]
      print(univariate_file)

    clinical_data = util.get_clinical_data(clinical)
    cancer_type_outdir = os.path.join(outdir, cancer_type)
    if not os.path.isdir(cancer_type_outdir):
      os.makedirs(cancer_type_outdir)
    jobs.append((mutation, clinical_data, cancer_type_outdir, univariate_file))

  print(jobs)
  pool.map(multiprocess_zscores, jobs)
Esempio n. 14
0
def main():
    """Run a purity Cox regression and a mean-split KM analysis per cancer type.

    The header file is a header-less CSV indexed by its first column; the
    value in its second column (label ``1``) names the purity row to load for
    that cancer type. The purity row is passed to ``analysis.do_cox`` and a
    0/1 split around its mean is passed to ``analysis.do_km``. All Cox results
    are written to ``simple_purity_zscores.csv`` in the output directory.
    """
    clinical_dir, output_dir, header_file = get_options()
    headers = pd.read_csv(header_file, index_col=0, header=None)
    clinical_files = util.remove_extraneous_files(os.listdir(clinical_dir))

    zscore_data = {}
    for f in clinical_files:
        clinical_path = os.path.join(clinical_dir, f)
        cancer_type = util.get_cancer_type(f)
        # ``.at`` is the label-based scalar accessor that replaces the
        # removed DataFrame.get_value().
        purity_header = headers.at[cancer_type, 1]

        clinical = util.get_clinical_data(clinical_path,
                                          extra_rows=[purity_header])

        cox_dict = analysis.do_cox(clinical.time, clinical.censor,
                                   clinical[purity_header])
        zscore_data[cancer_type] = cox_dict

        purity_mean = clinical[purity_header].mean()
        print(purity_mean)
        # 0 = purity at or below the cohort mean, 1 = above it.
        clinical['km_split'] = np.where(clinical[purity_header] <= purity_mean,
                                        0, 1)
        analysis.do_km(cancer_type, clinical.time, clinical.censor,
                       clinical.km_split, output_dir)

    out_df = pd.DataFrame(zscore_data).transpose()
    out_df.to_csv(os.path.join(output_dir, 'simple_purity_zscores.csv'))
Esempio n. 15
0
def calculate_cox_for_cancer_type(requested_data, mutation_data, outdir):
    """Cox-test patients mutated at requested positions, per gene.

    Args:
        requested_data: Object exposing ``iteritems()``. Each key is a gene
            name with one leading character that is stripped before lookup;
            each value is a collection of start positions (strings, since
            they are joined with ``':'`` for output).
        mutation_data: Path to the mutation file; also used to derive the
            cancer type. For COADREAD and OV the sibling
            ``HG36_HG37/<cancer_type>_hg36_hg37.txt`` file is used instead.
        outdir: Unused in this function.

    Returns:
        DataFrame indexed by ``(gene, positions)`` with the
        ``<cancer_type> p/z/mutants/n`` columns, or an empty DataFrame when
        no request reached the mutation threshold.
    """
    cancer_type = util.get_cancer_type(mutation_data)
    clinical = os.path.join('.', 'clinical', cancer_type + '.clin.merged.txt')
    clinical_data = util.get_clinical_data(clinical)

    start_pos = None
    if cancer_type in ['COADREAD', 'OV']:
        # These cancer types take positions from the ``hg37_start`` column
        # of a separate file.
        folder = os.path.dirname(mutation_data)
        mutation_data = os.path.join(folder, 'HG36_HG37',
                                     cancer_type + '_hg36_hg37.txt')
        start_pos = u'hg37_start'

    df, clinical_with_sequenced_patients, _ = zscores_for_mutants.prep_data(
        mutation_data, clinical_data)
    if not start_pos:
        # Find the start-position column regardless of its capitalisation.
        upper_columns = [c.upper() for c in df.columns]
        start_pos = df.columns[upper_columns.index('START_POSITION')]

    patients_with_gene = df.groupby(level=u'Hugo_Symbol')
    # Looked up once; ``in`` on the mapping avoids building a key list for
    # every request.
    known_genes = patients_with_gene.groups
    # Minimum number of mutated rows a request needs; constant for the loop.
    min_mutants = MUTATION_PERCENT * clinical_with_sequenced_patients.shape[0]

    output_data = []
    for i, request in requested_data.iteritems():
        gene = i[1:]
        if gene not in known_genes:
            continue
        gene_rows = patients_with_gene.get_group(gene)
        rows_at_positions = gene_rows[gene_rows[start_pos].isin(request)]
        mutant_ids = rows_at_positions.index.get_level_values('identifier')
        if len(mutant_ids) < min_mutants:
            continue

        analysis_data = pd.DataFrame({'mutated': np.ones(len(mutant_ids))},
                                     index=mutant_ids)
        # Right join keeps every sequenced patient; those without the
        # mutation get 0. Assigning the filled column back is used instead
        # of an in-place fillna on a column selection, which is not
        # guaranteed to modify the frame.
        analysis_data = analysis_data.join(clinical_with_sequenced_patients,
                                           how='right')
        analysis_data['mutated'] = analysis_data['mutated'].fillna(0)
        cox_dict = analysis.do_cox(analysis_data['time'],
                                   analysis_data['censor'],
                                   analysis_data['mutated'])

        outdict = {cancer_type + ' p': cox_dict['p']}
        outdict[cancer_type + ' z'] = cox_dict['z']
        outdict[cancer_type + ' mutants'] = len(mutant_ids)
        outdict[cancer_type + ' n'] = cox_dict['n']
        outdict['gene'] = i
        outdict['positions'] = ':'.join(request)
        output_data.append(outdict)

    outdata = pd.DataFrame(output_data)
    print(outdata)
    if len(outdata):
        outdata = outdata.set_index(['gene', 'positions'])
    return outdata
Esempio n. 16
0
def make_mutation_zscores(mutation, clinical, gene_list):
    """Cox-test mutation status for each listed gene that is mutated enough.

    Args:
        mutation: Path to the mutation file; also used to derive the cancer
            type.
        clinical: Path to the clinical file.
        gene_list: Object whose ``.values`` holds the gene column names of
            interest; names absent from the mutation data are ignored.

    Returns:
        DataFrame with one row per analysed gene: the ``analysis.do_cox``
        result plus ``cancer_type``, ``gene`` and ``num_mutations``. Empty
        when no gene reaches the ``MUTATION_PERCENT`` threshold.
    """
    cancer_type = util.get_cancer_type(mutation)

    # get mutation patients
    clinical_data = util.get_clinical_data(clinical)
    mutation = mutation_base.prep_mutation_data(mutation, clinical_data)

    present_gene_list = list(
        set(gene_list.values) & set(mutation.columns.values))
    mutation_and_clinical = mutation[present_gene_list].join(clinical_data,
                                                             how='inner')
    num_patients = len(mutation_and_clinical.index)

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append copied the frame on every call and no longer exists
    # in pandas 2.x.
    rows = []
    for gene in mutation_and_clinical:
        if gene in ['time', 'censor']:
            continue
        num_mutations = mutation_and_clinical[gene].sum()
        if num_mutations >= MUTATION_PERCENT * num_patients:
            cox_dict = analysis.do_cox(mutation_and_clinical.time,
                                       mutation_and_clinical.censor,
                                       mutation_and_clinical[gene])
            cox_dict['cancer_type'] = cancer_type
            cox_dict['gene'] = gene
            cox_dict['num_mutations'] = num_mutations
            rows.append(cox_dict)
    results = pd.DataFrame(rows)
    print(results)
    return results
def main(argv=None):
    """Run ``calculate_cox`` for one mutation file if its cancer type is keyed.

    The key file is read as a CSV indexed by its first column, with ``-``
    treated as missing and all-empty rows dropped. Nothing is done when the
    cancer type of the mutation file is not in the key's index.

    Args:
        argv: Argument list forwarded to ``get_options``; defaults to
            ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    # The work below runs for any argv. It used to be nested under the
    # ``argv is None`` branch, so an explicit argv silently did nothing.
    mutation, clinical, outdir, key_file = get_options(argv)
    key = pd.read_csv(key_file, index_col=0, na_values=['-'])
    key = key.dropna(how='all')

    cancer_type = util.get_cancer_type(mutation)
    if cancer_type in key.index:
        clinical_data = util.get_clinical_data(clinical)
        if not os.path.isdir(outdir):
            os.makedirs(outdir)
        calculate_cox(mutation, clinical_data, key, outdir)
Esempio n. 18
0
def make_zscores(copy_number, mutation, clinical, outdir):
    """Write per-gene copy-number Cox results split by mutation status.

    For every copy-number gene, ``calculate_cox`` is run once on the patients
    joined with that gene's non-missing mutation entries and once on the
    patients joined with entries that are not equal to 1. Genes without
    mutation data get an empty "mutated" table and the full table as
    "non-mutated". One line per gene is written to
    ``<cancer_type>.cnv_with_mutation_zscores.csv``.

    Args:
        copy_number: Path to a copy-number CSV (first column is the index);
            also used to derive the cancer type.
        mutation: Path to the mutation file passed to ``prep_mutation_data``.
        clinical: Path to the clinical file.
        outdir: Directory that receives the output file.
    """
    clinical_data = util.get_clinical_data(clinical)
    cnv = pd.read_csv(copy_number, index_col=0)
    mutation = prep_mutation_data(mutation, clinical_data)

    clinical_and_cnv = cnv.transpose().join(clinical_data, how='inner')

    # Keep only patients present in both the mutation and copy-number data.
    shared_patients = list(
        set(mutation.index).intersection(set(clinical_and_cnv.index)))
    analysed = clinical_and_cnv.loc[shared_patients]

    cancer_type = util.get_cancer_type(copy_number)
    outfile = os.path.join(outdir,
                           cancer_type + '.cnv_with_mutation_zscores.csv')
    formatstring = '{0}, {1}, {2}, {3}, {4}, {5}, {6}\n'
    header = (
        'gene,mutated zscore,mutated pvalue,mutated patients,non-mutated zscore, non-mutated pvalue, non-mutated patients\n'
    )

    with open(outfile, 'w') as out:
        out.write(header)
        for gene in analysed:
            if gene in ('time', 'censor'):  # skip metadata
                continue
            clinical_gene = analysed[[gene, 'time', 'censor']]

            if gene in mutation:
                mutations_for_gene = mutation[gene].rename('mutation')
                with_mutation = clinical_gene.join(
                    mutations_for_gene.dropna(), how='inner')
                not_mutated = mutations_for_gene[mutations_for_gene != 1]
                without_mutation = clinical_gene.join(not_mutated,
                                                      how='inner')
            else:
                with_mutation = pd.DataFrame({gene: []})
                without_mutation = clinical_gene

            without_cox = calculate_cox(without_mutation, gene)
            with_cox = calculate_cox(with_mutation, gene)
            fields = (gene,
                      with_cox['z'], with_cox['p'], with_cox['n'],
                      without_cox['z'], without_cox['p'], without_cox['n'])
            out.write(formatstring.format(*fields))
Esempio n. 19
0
def make_zscores(copy_number, clinical, outdir, metagene_file=None):
    """Write per-gene copy-number Cox results, optionally with a metagene.

    Args:
        copy_number: Path to a copy-number CSV with ``Symbol``,
            ``Chromosome`` and ``Location`` columns; also used to derive the
            cancer type.
        clinical: Path to the clinical file.
        outdir: Directory that receives ``<cancer_type>_zscores.csv`` or,
            with a metagene, ``<cancer_type>_metagene_zscores.csv``.
        metagene_file: Optional metagene file. When given, each gene is run
            through ``analysis.do_metagene_cox`` and the metagene z-score
            and p-value are written as well.
    """
    clinical_data = util.get_clinical_data(clinical)

    df = pd.read_csv(copy_number).drop(['Chromosome', 'Location'], axis=1)
    df_by_patient = df.transpose()
    # After the transpose the ``Symbol`` row holds the gene names.
    df_by_patient.columns = df_by_patient.loc['Symbol']
    clinical_and_cnv = df_by_patient.join(clinical_data, how='inner')

    cancer_type = util.get_cancer_type(copy_number)
    if metagene_file:
        formatstring = '{0}, {1}, {2}, {3}, {4}, {5}\n'
        header = (
            'gene,zscore,pvalue,metagene-zscore,metagene-pvalue,num patients\n'
        )
        outfile = os.path.join(outdir, cancer_type + '_metagene_zscores.csv')

        print("Processing metagene...")
        metagene = metagene_lib.get_metagene_data(metagene_file, cancer_type)
        print("Complete")
    else:
        outfile = os.path.join(outdir, cancer_type + '_zscores.csv')
        formatstring = '{0}, {1}, {2}, {3}\n'
        header = 'gene,zscore,pvalue,num patients\n'

    with open(outfile, 'w') as out:
        out.write(header)
        for gene in clinical_and_cnv:
            if gene in ('time', 'censor'):  # skip metadata
                continue
            # Genes with 10 or fewer non-missing values are not analysed.
            if clinical_and_cnv[gene].count() <= 10:
                continue
            if metagene_file:
                cox_dict = analysis.do_metagene_cox(
                    clinical_and_cnv.time, clinical_and_cnv.censor,
                    clinical_and_cnv[gene], metagene)
                fields = (gene, cox_dict['z'], cox_dict['p'],
                          cox_dict['metagene-z'], cox_dict['metagene-p'],
                          cox_dict['n'])
            else:
                cox_dict = analysis.do_cox(clinical_and_cnv.time,
                                           clinical_and_cnv.censor,
                                           clinical_and_cnv[gene])
                fields = (gene, cox_dict['z'], cox_dict['p'], cox_dict['n'])
            out.write(formatstring.format(*fields))
def copy_number_changes(cnv, clinical,  outdir, cancer_type_genes):
  """Measure the altered region around each listed gene, per patient.

  For every row of ``cancer_type_genes`` (columns ``Gene`` and ``Type``),
  patients whose copy number for that gene is above 0.3 (``Type`` equal to
  ``'Amplification'``) or below -0.3 (any other ``Type``) are passed to
  ``find_continuous_region`` together with the copy numbers of all genes on
  the same chromosome, ordered by ``Location``. One
  ``<cancer_type>_<gene name without its first character>.cn_changes.csv``
  is written per gene.

  Args:
    cnv: Path to a copy-number CSV (first column is the index) with
      ``Chromosome`` and ``Location`` columns next to the patient columns;
      also used to derive the cancer type.
    clinical: Path to the clinical file; only patients in its index are
      reported.
    outdir: Directory that receives the per-gene CSV files.
    cancer_type_genes: Table iterated row by row, with ``Gene`` and ``Type``.
  """
  cancer_type = util.get_cancer_type(cnv)
  print cancer_type
  clinical = util.get_clinical_data(clinical)
  copy_numbers = pd.read_csv(cnv, index_col=0)


  for i, gene in cancer_type_genes.iterrows():
    # A fresh frame per gene; patients are added as columns and the frame is
    # transposed on output.
    results = pd.DataFrame()

    gene_name = gene['Gene']
    print gene_name
    gene_cnas = copy_numbers.loc[gene_name]
    chrom = gene_cnas['Chromosome']
    gene_location = copy_numbers.loc[gene_name]['Location']


    # Boolean per column: does this patient's value pass the threshold?
    if gene['Type'] == 'Amplification':
      threshold_passed = gene_cnas > 0.3
    else:
      threshold_passed = gene_cnas < -0.3
    threshold_passed = threshold_passed.drop(['Chromosome', 'Location'])
    # Keep only the patients for which the comparison was True.
    threshold_passed = threshold_passed[threshold_passed]

    copy_numbers_on_same_chrom = copy_numbers[copy_numbers['Chromosome'] == chrom]
    # Iterating a DataFrame yields its column labels, i.e. patients plus the
    # two metadata columns that are skipped below.
    for patient in copy_numbers_on_same_chrom:
      if patient not in clinical.index:
        continue
      if patient in ['Chromosome', 'Location']:
        continue
      if patient in threshold_passed.index:
        patient_data = copy_numbers_on_same_chrom[['Location', patient]]
        # NOTE(review): drop('Symbol') removes a row label of the new
        # Location index, not the column created by reset_index() - confirm
        # this matches the input layout.
        patient_data = patient_data.reset_index().sort_values(by='Location') \
                                    .set_index('Location').drop('Symbol')
        continuous, total = find_continuous_region(patient_data[patient],
                                                  starting_at=gene_location,
                                                  alteration_type=gene['Type'])
      else:
        # Patients that do not pass the threshold are still reported, with
        # empty region lengths.
        continuous, total = (None, None)
      # NOTE(review): ``continuous/total`` is integer division on Python 2 if
      # both are ints, and a ``continuous`` of 0 yields None - confirm both
      # are intended.
      results[patient] = pd.Series({'continuous_len': continuous,
                          'chr_len': total,
                          'fraction': continuous/total if continuous else None,
                          'copy number': gene_cnas[patient],
                          'time': clinical.loc[patient].time,
                          'censor': clinical.loc[patient].censor})
    # gene_name[1:] strips the first character of the gene name for the file
    # name.
    results.transpose().to_csv(os.path.join(outdir, cancer_type + '_' + gene_name[1:] + '.cn_changes.csv'),
                            columns=['time', 'censor', 'copy number', 'fraction', 'continuous_len', 'chr_len'])
Esempio n. 21
0
def main():
    """Print the value counts of the tumor-stage row for every cancer type.

    The stage row name is looked up in ``tumor_stage_util.TUMOR_STAGE``;
    cancer types with a falsy entry are skipped. Stage labels are stripped of
    surrounding whitespace before counting.
    """
    indir, outdir = get_options()
    clinical_files = util.remove_extraneous_files(os.listdir(indir))

    for clinical_f in clinical_files:
        clinical_path = os.path.join(indir, clinical_f)
        cancer_type = util.get_cancer_type(clinical_f)
        stage_row = tumor_stage_util.TUMOR_STAGE[cancer_type]
        if not stage_row:
            continue
        clinical = util.get_clinical_data(clinical_path,
                                          extra_rows=[stage_row],
                                          extra_rows_numeric=False)
        clinical[stage_row] = clinical[stage_row].str.strip()
        print(cancer_type)
        print(clinical[stage_row].value_counts())
def count_tumor_groups(clinical_file, tumor_group_file):
  """Print patient counts per tumor group and the stages left out.

  Nothing is printed when the cancer type's entry in
  ``tumor_stage_util.TUMOR_STAGE`` is falsy.

  Args:
    clinical_file: Path to the clinical file; also used to derive the cancer
      type.
    tumor_group_file: CSV in which each row lists the stage labels that make
      up one group; rows without labels are ignored.
  """
  cancer_type = util.get_cancer_type(clinical_file)
  stage_row = tumor_stage_util.TUMOR_STAGE[cancer_type]
  if not stage_row:
    return

  tumor_groups = pd.read_csv(tumor_group_file)
  clinical = util.get_clinical_data(clinical_file, extra_rows=[stage_row],
                                    extra_rows_numeric=False)
  clinical[stage_row] = clinical[stage_row].str.strip()

  included_stages = []
  for i, group in tumor_groups.iterrows():
    tg = group.dropna().values
    if len(tg) > 0:
      print('%s %s' % (', '.join(tg) + ': ',
                       clinical[clinical[stage_row].isin(tg)][stage_row].count()))
      included_stages.extend(tg)

  # Computed once, after all groups are known. It used to be recomputed on
  # every iteration and was undefined when the group file had no rows.
  excluded_patients = clinical[~clinical[stage_row].isin(included_stages)]
  print('Excluded:')
  print(excluded_patients[stage_row].value_counts())
Esempio n. 23
0
def main():
    """Join copy number of interesting genes with subtype clinical data.

    For every row of the row-names file, subtype clinical data is built
    either from the clinical file's histological subtype row or, when that
    row is ``EXTERNAL``, from an external annotation file via
    ``prep_BRCA_data``. The copy-number columns of the cancer type's
    interesting genes are outer-joined with it and written to
    ``<cancer_type>.csv`` (``BRCA_HER2.csv`` for the external case).
    """
    clinical_dir, row_names_file, basedir, interesting_genes, outdir = get_options(
    )
    files = util.remove_extraneous_files(os.listdir(clinical_dir))
    clinical_by_cancer_type = {util.get_cancer_type(f): f for f in files}

    row_names = pd.read_csv(row_names_file, header=0)

    interesting_genes = pd.read_csv(interesting_genes, header=0, index_col=1)
    # Copy-number columns carry a leading single quote; build the quoted
    # names once instead of on every iteration.
    quoted_genes = '\'' + interesting_genes['Gene']

    for i, row in row_names.iterrows():
        cancer_type = row['cancer_type']
        cancer_type_fname = cancer_type
        print(cancer_type)
        clinical_file = os.path.join(clinical_dir,
                                     clinical_by_cancer_type[cancer_type])

        if row['histological_subtype_row'] != 'EXTERNAL':
            clinical_data = make_clinical_data(clinical_file,
                                               row['histological_subtype_row'],
                                               outdir)
        else:
            subtype_data = prep_BRCA_data(row['external_file'], cancer_type)
            clinical = util.get_clinical_data(clinical_file)
            subtype_clinical = clinical.join(subtype_data['subtype'],
                                             how='outer')
            clinical_data = save_subtype_files(subtype_clinical, 'subtype',
                                               cancer_type, outdir)
            cancer_type_fname = 'BRCA_HER2'

        cna_file = glob.glob(os.path.join(basedir, cancer_type + '*.csv'))[0]
        cna = pd.read_csv(cna_file, header=0, index_col=0).T
        genes = quoted_genes.loc[cancer_type]
        print(genes)
        # A single matching gene comes back as a scalar string; wrap it in a
        # list so the selection is a DataFrame that supports join().
        # isinstance also accepts str subclasses, which the exact type
        # comparison used before would have sent down the wrong branch.
        if isinstance(genes, str):
            print(cna[[genes]])
            joined = cna[[genes]].join(clinical_data, how='outer')
        else:
            joined = cna[genes].join(clinical_data, how='outer')
        joined.to_csv(os.path.join(outdir, cancer_type_fname + '.csv'))
def main(argv=None):
    if argv is None:
        argv = sys.argv
        input_directory, clinical, outdir, extra_data_dir = get_options()
        clinical_files = os.listdir(clinical)
        clinical_files = util.remove_extraneous_files(clinical_files)

        args = []
        for c in clinical_files:
            cancer_type = util.get_cancer_type(c)
            print cancer_type

            clinical_data = util.get_clinical_data(os.path.join(clinical, c))
            copy_number = glob.glob(
                os.path.join(input_directory, cancer_type + '*.csv'))[0]

            args.append((copy_number, clinical_data, extra_data_dir, outdir))
            # make_zscores(copy_number, clinical_data, extra_data_dir, outdir)
        p = Pool(4)
        p.map(multiprocess_zscores, args)
def make_zscores(copy_number, mutation, clinical, outdir, genes):
    cancer_type = util.get_cancer_type(copy_number)

    clinical_data = util.get_clinical_data(clinical)
    cnv = pd.read_csv(copy_number, index_col=0)

    mutation = mutation_base.prep_mutation_data(mutation, clinical_data)

    for g in genes['Gene']:
        if g not in mutation.columns:
            mutation[g] = 0
            print mutation[g]

    mutations = mutation[genes['Gene']]

    # cox multivariate won't work if there's a quote in the multivar name, so remove it
    gene_names = [x[1:] + '_mutations' for x in genes['Gene']]
    mutations.columns = gene_names

    cnv_by_patient = cnv.transpose()
    clinical_and_cnv = cnv_by_patient.join(clinical_data, how='inner')

    clinical_mutations_and_cnv = clinical_and_cnv.join(mutations, how='inner')

    cox_dicts = {}
    for gene in gene_names:
        plain_gene_name = gene.split('_')[0]
        # little shenanigans to make the names work. CNAs still have a quote, and
        # mutations have a suffix
        clinical_gene = clinical_mutations_and_cnv[[
            '\'' + plain_gene_name, gene, 'time', 'censor'
        ]]
        cox_dict = calculate_cox(clinical_gene, gene)
        cox_dict['mutation_count'] = clinical_gene[gene].sum()

        clinical_gene.to_csv(
            os.path.join(
                outdir, cancer_type + '_' + plain_gene_name +
                '_mutation_and_cna_data.csv'))
        cox_dicts[plain_gene_name] = cox_dict
    return cox_dicts
def make_zscores(data, clinical, hypermutated_patients, outdir):
  clinical_data = util.get_clinical_data(clinical)
  hypermutated = set(clinical_data.index).intersection(hypermutated_patients['patients'])
  print 'Hypermutated in clinical file:', len(hypermutated)
  clinical_data = clinical_data.drop(hypermutated)

  cancer_type = util.get_cancer_type(data)
  df = mb.prep_mutation_data(data, clinical_data)

  print 'Remaining hypermutated:', set(df.index).intersection(hypermutated)
  num_patients = len(set(clinical_data.index) & set(df.index))
  print 'Number of patients present in both:', num_patients

  clinical_and_data = df.join(clinical_data, how='inner')
  print 'Num patients, other count:', len(df.index)

  outfile = os.path.join(outdir, cancer_type + '_non-hypermutated_zscores.csv')
  formatstring = '{0}, {1}, {2}, {3}, {4}\n'

  zscore_count = 0
  zscore_skipped = 0
  with open(outfile, 'w') as out:
    out.write('gene,zscore,pvalue,num patients,num mutations\n')
    for gene in clinical_and_data:
      if gene not in ('time', 'censor', 'index'): # skip metadata
        num_mutations = clinical_and_data[gene].sum()
        # print gene, num_mutations
        if num_mutations >= MUTATION_PERCENT * num_patients:
          try:
            cox_dict = analysis.do_cox(clinical_and_data.time,
                                       clinical_and_data.censor,
                                       clinical_and_data[gene])
            out.write(formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['n'], num_mutations))
            zscore_count += 1
          except rpy2.rinterface.RRuntimeError as e:
            print 'WARN: skipped ', gene, ' due to R error'
            zscore_skipped += 1
            continue
        else:
          zscore_skipped += 1
          continue
Esempio n. 27
0
def main():
  clinical_dir, row_names_file, outdir = get_options()
  files = os.listdir(clinical_dir)
  files = util.remove_extraneous_files(files)
  clinical_by_cancer_type = {util.get_cancer_type(f): f for f in files}

  row_names = pd.read_csv(row_names_file, header=0)

  for i, row  in row_names.iterrows():
    cancer_type = row['cancer_type']
    print cancer_type
    clinical_file = clinical_by_cancer_type[cancer_type]
    clinical_file = os.path.join(clinical_dir, clinical_file)
    if row['histological_subtype_row'] != 'EXTERNAL':
      make_clinical_data(clinical_file, row['histological_subtype_row'], outdir)
    else:
      subtype_data = prep_BRCA_data(row['external_file'], cancer_type)
      subtype_data.to_csv(os.path.join(outdir, 'BRCA_annotation_subtype_data.csv'))
      clinical = util.get_clinical_data(clinical_file)
      subtype_clinical = clinical.join(subtype_data['subtype'], how='outer')
      save_subtype_files(subtype_clinical, 'subtype', cancer_type, outdir)
Esempio n. 28
0
def main(argv=None):
  if argv is None:
    argv = sys.argv
    input_directory, clinical, outdir, extra_clinical_rows_file = get_options()
    clinical_files = os.listdir(clinical)
    clinical_files = util.remove_extraneous_files(clinical_files)

    all_extra_clinical_rows = pd.read_csv(extra_clinical_rows_file, index_col=0, header=None)

    for c in clinical_files:
      cancer_type = util.get_cancer_type(c)
      extra_rows = [all_extra_clinical_rows.loc[cancer_type][1]]
      print cancer_type
      clinical_data = util.get_clinical_data(os.path.join(clinical, c),
                                             extra_rows=extra_rows)
      print clinical_data

      copy_number = glob.glob(os.path.join(input_directory, cancer_type + '*.csv'))[0]
      print copy_number

      make_zscores(copy_number, clinical_data, outdir, extra_rows)
def main(argv=None):
    if argv is None:
        argv = sys.argv
        input_directory, clinical, outdir, extra_data_dir = get_options()
        clinical_files = os.listdir(clinical)
        clinical_files = util.remove_extraneous_files(clinical_files)
        extra_data_col = 'Purity_InfiniumPurify'

        for c in clinical_files[3:]:
            cancer_type = util.get_cancer_type(c)
            print cancer_type

            if cancer_type == 'COADREAD':
                extra_data = prep_extra_data(extra_data_dir, 'COAD')
            else:
                extra_data = prep_extra_data(extra_data_dir, cancer_type)
            clinical_data = util.get_clinical_data(os.path.join(clinical, c))

            copy_number = glob.glob(
                os.path.join(input_directory, cancer_type + '*.csv'))[0]

            make_zscores(copy_number, clinical_data, outdir, extra_data,
                         extra_data_col)
Esempio n. 30
0
def make_cnv_zscores(copy_number, clinical, gene_list):
    """Run ``analysis.do_cox`` on the copy-number values of selected genes.

    The copy-number CSV is transposed and its columns are relabelled from the
    ``'Symbol'`` row, then restricted to ``gene_list`` and inner-joined with
    the clinical data.  Every resulting column other than ``time`` and
    ``censor`` is fitted separately.

    Args:
        copy_number: path of the copy-number CSV; must contain a ``Symbol``
            column.
        clinical: handed to ``util.get_clinical_data``.
        gene_list: column labels (gene symbols) to keep.

    Returns:
        pandas.DataFrame with one row per fitted column, holding the entries
        returned by ``analysis.do_cox`` plus ``cancer_type`` and ``gene``.
        Empty when nothing was fitted.
    """
    cancer_type = util.get_cancer_type(copy_number)

    cna = pd.read_csv(copy_number)
    cna_by_patient = cna.transpose()
    cna_by_patient.columns = cna_by_patient.loc['Symbol']
    cna_by_patient_gene_list_only = cna_by_patient[gene_list]

    clinical_data = util.get_clinical_data(clinical)
    clinical_and_cnv = cna_by_patient_gene_list_only.join(clinical_data,
                                                          how='inner')

    # Collect the rows first and build the frame once: appending to a
    # DataFrame inside the loop copies the whole frame on every iteration.
    rows = []
    for gene in clinical_and_cnv:
        if gene in ('time', 'censor'):
            continue
        cox_dict = analysis.do_cox(clinical_and_cnv.time,
                                   clinical_and_cnv.censor,
                                   clinical_and_cnv[gene])
        cox_dict['cancer_type'] = cancer_type
        cox_dict['gene'] = gene
        rows.append(cox_dict)
    return pd.DataFrame(rows)