def copy_number_changes(cnv, clinical,  outdir, cancer_type_genes):
  cancer_type = util.get_cancer_type(cnv)
  print cancer_type
  clinical = util.get_clinical_data(clinical)
  copy_numbers = pd.read_csv(cnv, index_col=0)


  for i, gene in cancer_type_genes.iterrows():
    results = pd.DataFrame()

    gene_name = gene['Gene']
    print gene_name
    gene_cnas = copy_numbers.loc[gene_name]
    chrom = gene_cnas['Chromosome']
    gene_location = copy_numbers.loc[gene_name]['Location']


    if gene['Type'] == 'Amplification':
      threshold_passed = gene_cnas > 0.3
    else:
      threshold_passed = gene_cnas < -0.3
    threshold_passed = threshold_passed.drop(['Chromosome', 'Location'])
    threshold_passed = threshold_passed[threshold_passed]

    copy_numbers_on_same_chrom = copy_numbers[copy_numbers['Chromosome'] == chrom]
    for patient in copy_numbers_on_same_chrom:
      if patient not in clinical.index:
        continue
      if patient in ['Chromosome', 'Location']:
        continue
      if patient in threshold_passed.index:
        patient_data = copy_numbers_on_same_chrom[['Location', patient]]
        patient_data = patient_data.reset_index().sort_values(by='Location') \
                                    .set_index('Location').drop('Symbol')
        continuous, total = find_continuous_region(patient_data[patient],
                                                  starting_at=gene_location,
                                                  alteration_type=gene['Type'])
      else:
        continuous, total = (None, None)
      results[patient] = pd.Series({'continuous_len': continuous,
                          'chr_len': total,
                          'fraction': continuous/total if continuous else None,
                          'copy number': gene_cnas[patient],
                          'time': clinical.loc[patient].time,
                          'censor': clinical.loc[patient].censor})
    results.transpose().to_csv(os.path.join(outdir, cancer_type + '_' + gene_name[1:] + '.cn_changes.csv'),
                            columns=['time', 'censor', 'copy number', 'fraction', 'continuous_len', 'chr_len'])
def main(argv=None):
    cnv_dir, mutation_dir, clinical_dir, outdir, input_file = get_options()
    cnv_files = os.listdir(cnv_dir)
    cnv_files = util.remove_extraneous_files(cnv_files)
    cnv_files = [os.path.join(cnv_dir, i) for i in cnv_files]

    interesting_genes = pd.read_csv(input_file, comment='#')
    print interesting_genes
    interesting_genes['Gene'] = '\'' + interesting_genes['Gene']

    zscore_inputs = []
    for cnv in cnv_files:
        cancer_type = util.get_cancer_type(cnv)
        cancer_type_genes = interesting_genes[interesting_genes['Cancer Type']
                                              == cancer_type]
        if len(cancer_type_genes) == 0:
            continue
        print cancer_type
        print cancer_type_genes
        mutation = glob.glob(os.path.join(mutation_dir, cancer_type + '*'))[0]
        clinical = glob.glob(
            os.path.join(clinical_dir, '*' + cancer_type + '*'))[0]

        zscore_inputs.append(
            [cnv, mutation, clinical, outdir, cancer_type_genes])
        #multiprocess_zscores([cnv, mutation, clinical, outdir, cancer_type_genes])

    p = Pool(4)
    results = p.map(multiprocess_zscores, zscore_inputs)
    print results
    with open(os.path.join(outdir, 'cox_results.csv'), 'w') as out:
        formatstr = '{},{},{},{},{},{},{},{}\n'
        out.write(
            'Cancer Type,Gene,CNA Z Score, CNA P value, Mutation Z score, Mutation P Value, Mutation Count, n\n'
        )
        for coxs in results:
            cancer_type = coxs.keys()[0]
            print cancer_type
            for gene, cox_dict in coxs[cancer_type].iteritems():
                print gene, cox_dict
                out.write(
                    formatstr.format(cancer_type, gene, cox_dict['var-z'],
                                     cox_dict['var-p'],
                                     cox_dict[gene + '_mutations-z'],
                                     cox_dict[gene + '_mutations-p'],
                                     cox_dict['mutation_count'],
                                     cox_dict['var-n']))
Ejemplo n.º 3
0
def pancan_fdr(directory, files, outname):
    pancan_fdr = pd.DataFrame()

    for f in files:
        cancer_type = util.get_cancer_type(f).split('_')[0]
        print cancer_type
        cancer_type_df = single_zscore_file_fdr(f)
        cancer_type_df = cancer_type_df.add_prefix(cancer_type + ' ')
        pancan_fdr = pd.concat((pancan_fdr, cancer_type_df), axis=1)

    stouffer_fdr_df = stouffer_fdr(os.path.join(directory, 'pancan.csv'))
    stouffer_fdr_df = stouffer_fdr_df.add_prefix('pancan ')
    pancan_fdr = pd.concat((pancan_fdr, stouffer_fdr_df),
                           axis=1,
                           verify_integrity=True)

    pancan_fdr.to_csv(os.path.join(directory, outname))
Ejemplo n.º 4
0
def main(argv=None):
    mutation_dir, key_file, outdir = get_options()
    mutation_files = glob.glob(mutation_dir + '*txt')
    key = pd.read_csv(key_file, na_values=['-'], index_col=0)
    key = key.dropna(how='all')
    print key

    p = Pool(1)

    args = []
    pancan = {}
    for mutation in mutation_files:
        cancer_type = util.get_cancer_type(mutation)
        if cancer_type in key.index:
            print cancer_type
            pancan[cancer_type] = calculate_variant_allele_distribution(
                cancer_type, mutation, key, outdir)
Ejemplo n.º 5
0
def main():
    indir, outdir = get_options()
    clinical_files = os.listdir(indir)
    clinical_files = util.remove_extraneous_files(clinical_files)
    stage_row = 'patient.stage_event.pathologic_stage'

    for clinical_f in clinical_files:
        f = os.path.join(indir, clinical_f)
        cancer_type = util.get_cancer_type(clinical_f)
        stage_row = tumor_stage_util.TUMOR_STAGE[cancer_type]
        if stage_row:
            clinical = util.get_clinical_data(f,
                                              extra_rows=[stage_row],
                                              extra_rows_numeric=False)
            clinical[stage_row] = clinical[stage_row].str.strip()
            print cancer_type
            print clinical[stage_row].value_counts()
def count_tumor_groups(clinical_file, tumor_group_file):
  cancer_type = util.get_cancer_type(clinical_file)
  stage_row = tumor_stage_util.TUMOR_STAGE[cancer_type]
  if stage_row:
    tumor_groups = pd.read_csv(tumor_group_file)
    clinical = util.get_clinical_data(clinical_file, extra_rows=[stage_row], extra_rows_numeric=False)
    clinical[stage_row] = clinical[stage_row].str.strip()

    included_stages = []
    for i, group in tumor_groups.iterrows():
      tg = group.dropna().values
      if len(tg) > 0:
        print ', '.join(tg) +  ': ', \
              clinical[clinical[stage_row].isin(tg)][stage_row].count()
        included_stages.extend(tg)
      excluded_patients = clinical[~clinical[stage_row].isin(included_stages)]
    print 'Excluded:'
    print excluded_patients[stage_row].value_counts()
def count_codons(data, outdir):
    """Aggregate per-file codon counts into `codon_counts.csv`.

    Every file in `data` (after `util.remove_extraneous_files`, and ignoring
    the `HG36_HG37` entry) is passed to `count_codons_in_file`. The
    resulting tables are concatenated column-wise, a `sum` column is added,
    and the table is written to `codon_counts.csv` in the current working
    directory.

    Args:
        data: directory containing the input files.
        outdir: currently unused.
            NOTE(review): the output path ignores `outdir` -- confirm
            whether the CSV should be written there instead.
    """
    files = os.listdir(data)
    files = util.remove_extraneous_files(files)
    # Filter rather than list.remove() so a directory without the
    # HG36_HG37 entry does not raise ValueError.
    files = [f for f in files if f != 'HG36_HG37']

    outdata = [count_codons_in_file(os.path.join(data, f)) for f in files]
    df = pd.concat(outdata, axis=1, verify_integrity=True)
    df['sum'] = df.sum(axis=1)
    df.to_csv('codon_counts.csv',
              index_label=[
                  'Gene', 'Chromosome', 'Start Position', 'Wild Type Allele'
              ])
def main(argv=None):
    if argv is None:
        argv = sys.argv
        mutation, clinical, outdir, metagene_file, key_file = get_options(argv)
        key = pd.read_csv(key_file, index_col=0, na_values=['-'])
        key = key.dropna(how='all')
        print key

        cancer_type = util.get_cancer_type(mutation)
        if cancer_type in key.index:
            clinical_data = util.get_clinical_data(clinical)
            if not os.path.isdir(outdir):
                os.makedirs(outdir)
            calculate_cox(mutation,
                          clinical_data,
                          key,
                          outdir,
                          metagene_file=metagene_file)
def main(argv=None):
  """Run the copy number change analysis for each cancer type of interest.

  Copy number files whose cancer type has no row in the genes-of-interest
  file are skipped. For the rest, the first clinical file whose name starts
  with the cancer type is paired with the copy number file and handed to
  `multiprocess_copy_number_changes`.

  Args:
    argv: unused; options come from `get_options()`.
  """
  cnv_dir, clinical_dir, outdir, input_file = get_options()
  cnv_names = util.remove_extraneous_files(os.listdir(cnv_dir))
  cnv_paths = [os.path.join(cnv_dir, name) for name in cnv_names]

  interesting_genes = pd.read_csv(input_file, comment='#')
  # Prefix every gene name with a single quote.
  interesting_genes['Gene'] = '\'' + interesting_genes['Gene']

  for cnv in cnv_paths:
    cancer_type = util.get_cancer_type(cnv)
    is_this_type = interesting_genes['Cancer Type'] == cancer_type
    cancer_type_genes = interesting_genes[is_this_type]
    if not len(cancer_type_genes):
      continue

    clinical_pattern = os.path.join(clinical_dir, cancer_type + '*')
    clinical = glob.glob(clinical_pattern)[0]
    multiprocess_copy_number_changes([cnv, clinical, outdir, cancer_type_genes])
def make_zscores(data, clinical, outdir):
  subtype = clinical.split('.')[1]
  print clinical
  clinical_data = pd.read_csv(clinical, index_col=0, header=0)
  print clinical_data
  clinical_data = clinical_data.dropna(subset=['time', 'censor'], how='any')
  subtype_col = clinical_data.columns[-1]
  print subtype_col

  cancer_type = util.get_cancer_type(data)
  df = prep_data(data)
  print df

  print cancer_type
  print 'Number of patients present in both:', len(set(clinical_data.index) & set(df.index))

  clinical_and_data = df.join(clinical_data, how='inner')

  outfile = os.path.join(outdir, cancer_type + '_' + subtype + '_zscores.csv')
  formatstring = '{0}, {1}, {2}, {3}\n'

  zscore_count = 0
  zscore_skipped = 0
  with open(outfile, 'w') as out:
    out.write('gene,zscore,pvalue,num patients\n')
    for gene in clinical_and_data:
      if gene not in ('time', 'censor', 'index', subtype_col): # skip metadata
        if clinical_and_data[gene].count() <= 10:
          zscore_skipped += 1
          continue
        try:
          cox_dict = analysis.do_cox(clinical_and_data.time,
                                     clinical_and_data.censor,
                                     clinical_and_data[gene])
          out.write(formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['n']))
          zscore_count += 1
        except rpy2.rinterface.RRuntimeError as e:
          print 'WARN: skipped ', gene, ' due to R error'
          zscore_skipped += 1
          continue

    print 'Total:', clinical_and_data.shape[1] - 3 # minus time, censor, index
    print 'Output length:', zscore_count
    print 'Skipped:', zscore_skipped
def count_codons_in_file(f):
    cancer_type = util.get_cancer_type(f)
    print cancer_type

    df = pd.read_csv(f, sep='\t', low_memory=False)
    # Some of the columns are named Start_position. Others are Start_Position. some are start_position. :|
    upper_columns = [i.upper() for i in df.columns]
    start_pos_index = upper_columns.index('START_POSITION')
    start_pos = df.columns[start_pos_index]
    chromosome = u'Chromosome'

    ncbi_builds = df[u'NCBI_Build'].value_counts()
    if '36' in ncbi_builds.index:
        print 'Using translated NCBI build', cancer_type
        folder = os.path.dirname(f)
        new_path = os.path.join(folder, 'HG36_HG37',
                                cancer_type + '_hg36_hg37.txt')
        print new_path
        df = pd.read_csv(new_path, sep='\t', dtype=str)
        start_pos = u'hg37_start'
        chromsome = u'hg37_chr'
    wild_type_allele_col = u'Reference_Allele'

    df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip()
    df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')]
    df = df[df[u'Variant_Classification'].str.contains(
        'Missense')]  # only include missense
    df[u'Hugo_Symbol'] = '\'' + df[u'Hugo_Symbol'].astype(str)
    df = util.add_identifier_column(df, u'Tumor_Sample_Barcode')

    # Some files have the same mutation from different samples listed under one patient,
    # we only care about the number of patients with a given mutation, so drop duplicates
    df = df.drop_duplicates(
        subset=[u'Hugo_Symbol', chromosome, start_pos, u'identifier'],
        keep='last')
    counts = df.groupby(
        [u'Hugo_Symbol', chromosome, start_pos, wild_type_allele_col]).size()
    count_df = pd.DataFrame(counts)
    count_df.columns = [cancer_type]
    count_df.index.rename(
        ['Gene', 'Chromosome', 'Start Position', 'Wild Type Allele'],
        inplace=True)
    return count_df
Ejemplo n.º 12
0
def main():
    clinical_dir, row_names_file, basedir, interesting_genes, outdir = get_options(
    )
    files = os.listdir(clinical_dir)
    files = util.remove_extraneous_files(files)
    clinical_by_cancer_type = {util.get_cancer_type(f): f for f in files}

    row_names = pd.read_csv(row_names_file, header=0)

    interesting_genes = pd.read_csv(interesting_genes, header=0, index_col=1)

    for i, row in row_names.iterrows():
        cancer_type = row['cancer_type']
        cancer_type_fname = cancer_type
        print cancer_type
        clinical_file = clinical_by_cancer_type[cancer_type]
        clinical_file = os.path.join(clinical_dir, clinical_file)

        if row['histological_subtype_row'] != 'EXTERNAL':
            clinical_data = make_clinical_data(clinical_file,
                                               row['histological_subtype_row'],
                                               outdir)
        else:
            subtype_data = prep_BRCA_data(row['external_file'], cancer_type)
            # subtype_data.to_csv(os.path.join(outdir, 'BRCA_annotation_subtype_data.csv'))
            clinical = util.get_clinical_data(clinical_file)
            subtype_clinical = clinical.join(subtype_data['subtype'],
                                             how='outer')
            clinical_data = save_subtype_files(subtype_clinical, 'subtype',
                                               cancer_type, outdir)
            cancer_type_fname = 'BRCA_HER2'

        cna_file = glob.glob(os.path.join(basedir, cancer_type + '*.csv'))[0]
        cna = pd.read_csv(cna_file, header=0, index_col=0).T
        genes = '\'' + interesting_genes['Gene']
        genes = genes.loc[cancer_type]
        print genes
        if type(genes) == str:
            print cna[[genes]]
            joined = cna[[genes]].join(clinical_data, how='outer')
        else:
            joined = cna[genes].join(clinical_data, how='outer')
        joined.to_csv(os.path.join(outdir, cancer_type_fname + '.csv'))
def main(argv=None):
    if argv is None:
        argv = sys.argv
        input_directory, clinical, outdir, extra_data_dir = get_options()
        clinical_files = os.listdir(clinical)
        clinical_files = util.remove_extraneous_files(clinical_files)

        args = []
        for c in clinical_files:
            cancer_type = util.get_cancer_type(c)
            print cancer_type

            clinical_data = util.get_clinical_data(os.path.join(clinical, c))
            copy_number = glob.glob(
                os.path.join(input_directory, cancer_type + '*.csv'))[0]

            args.append((copy_number, clinical_data, extra_data_dir, outdir))
            # make_zscores(copy_number, clinical_data, extra_data_dir, outdir)
        p = Pool(4)
        p.map(multiprocess_zscores, args)
Ejemplo n.º 14
0
def main(argv=None):
    """Run the copy number z-score job for every copy number file.

    Each copy number file is paired with the first structural-breaks file
    whose name contains its cancer type, and the jobs are executed on a
    pool of 4 workers through `multiprocess_cn_zscores`. The optional
    genes-of-interest file is read once and passed to every job.

    Args:
        argv: unused; options come from `get_options()`.
    """
    cnv_dir, structural_breaks, interesting_genes_file, outdir = get_options()
    cnv_files = os.listdir(cnv_dir)
    cnv_files = util.remove_extraneous_files(cnv_files)
    cnv_files = [os.path.join(cnv_dir, i) for i in cnv_files]

    interesting_genes = None
    if interesting_genes_file:
        interesting_genes = pd.read_csv(interesting_genes_file)

    zscore_inputs = []
    for cnv in cnv_files:
        cancer_type = util.get_cancer_type(cnv)
        breaks = glob.glob(
            os.path.join(structural_breaks, '*' + cancer_type + '*'))[0]
        # Only queue the job: the former direct make_cn_zscores() call here
        # processed every file a second time on top of the pool run
        # (assuming multiprocess_cn_zscores wraps make_cn_zscores -- confirm).
        zscore_inputs.append([cnv, breaks, interesting_genes, outdir])

    p = Pool(4)
    try:
        p.map(multiprocess_cn_zscores, zscore_inputs)
    finally:
        # Always release the worker processes, even if a task failed.
        p.close()
        p.join()
Ejemplo n.º 15
0
def main(argv=None):
    if argv is None:
        argv = sys.argv
        infile, indir, outdir = get_options()

        requested_data = read_requested_data(infile)

        files = os.listdir(indir)
        files.remove('.DS_Store')
        files.remove('HG36_HG37')
        output_data = []
        for f in files:
            cancer_type = util.get_cancer_type(f)
            print cancer_type
            zscores = calculate_cox_for_cancer_type(requested_data,
                                                    os.path.join(indir, f),
                                                    outdir)
            output_data.append(zscores)
        df = pd.concat(output_data, axis=1)
        df.to_csv('scratch/zscores_by_codon_2_percent.csv')
def make_zscores(data, clinical, hypermutated_patients, outdir):
  clinical_data = util.get_clinical_data(clinical)
  hypermutated = set(clinical_data.index).intersection(hypermutated_patients['patients'])
  print 'Hypermutated in clinical file:', len(hypermutated)
  clinical_data = clinical_data.drop(hypermutated)

  cancer_type = util.get_cancer_type(data)
  df = mb.prep_mutation_data(data, clinical_data)

  print 'Remaining hypermutated:', set(df.index).intersection(hypermutated)
  num_patients = len(set(clinical_data.index) & set(df.index))
  print 'Number of patients present in both:', num_patients

  clinical_and_data = df.join(clinical_data, how='inner')
  print 'Num patients, other count:', len(df.index)

  outfile = os.path.join(outdir, cancer_type + '_non-hypermutated_zscores.csv')
  formatstring = '{0}, {1}, {2}, {3}, {4}\n'

  zscore_count = 0
  zscore_skipped = 0
  with open(outfile, 'w') as out:
    out.write('gene,zscore,pvalue,num patients,num mutations\n')
    for gene in clinical_and_data:
      if gene not in ('time', 'censor', 'index'): # skip metadata
        num_mutations = clinical_and_data[gene].sum()
        # print gene, num_mutations
        if num_mutations >= MUTATION_PERCENT * num_patients:
          try:
            cox_dict = analysis.do_cox(clinical_and_data.time,
                                       clinical_and_data.censor,
                                       clinical_and_data[gene])
            out.write(formatstring.format(gene, cox_dict['z'], cox_dict['p'], cox_dict['n'], num_mutations))
            zscore_count += 1
          except rpy2.rinterface.RRuntimeError as e:
            print 'WARN: skipped ', gene, ' due to R error'
            zscore_skipped += 1
            continue
        else:
          zscore_skipped += 1
          continue
def make_zscores(copy_number, mutation, clinical, outdir, genes):
    cancer_type = util.get_cancer_type(copy_number)

    clinical_data = util.get_clinical_data(clinical)
    cnv = pd.read_csv(copy_number, index_col=0)

    mutation = mutation_base.prep_mutation_data(mutation, clinical_data)

    for g in genes['Gene']:
        if g not in mutation.columns:
            mutation[g] = 0
            print mutation[g]

    mutations = mutation[genes['Gene']]

    # cox multivariate won't work if there's a quote in the multivar name, so remove it
    gene_names = [x[1:] + '_mutations' for x in genes['Gene']]
    mutations.columns = gene_names

    cnv_by_patient = cnv.transpose()
    clinical_and_cnv = cnv_by_patient.join(clinical_data, how='inner')

    clinical_mutations_and_cnv = clinical_and_cnv.join(mutations, how='inner')

    cox_dicts = {}
    for gene in gene_names:
        plain_gene_name = gene.split('_')[0]
        # little shenanigans to make the names work. CNAs still have a quote, and
        # mutations have a suffix
        clinical_gene = clinical_mutations_and_cnv[[
            '\'' + plain_gene_name, gene, 'time', 'censor'
        ]]
        cox_dict = calculate_cox(clinical_gene, gene)
        cox_dict['mutation_count'] = clinical_gene[gene].sum()

        clinical_gene.to_csv(
            os.path.join(
                outdir, cancer_type + '_' + plain_gene_name +
                '_mutation_and_cna_data.csv'))
        cox_dicts[plain_gene_name] = cox_dict
    return cox_dicts
Ejemplo n.º 18
0
def main(argv=None):
  if argv is None:
    argv = sys.argv
    input_directory, clinical, outdir, extra_clinical_rows_file = get_options()
    clinical_files = os.listdir(clinical)
    clinical_files = util.remove_extraneous_files(clinical_files)

    all_extra_clinical_rows = pd.read_csv(extra_clinical_rows_file, index_col=0, header=None)

    for c in clinical_files:
      cancer_type = util.get_cancer_type(c)
      extra_rows = [all_extra_clinical_rows.loc[cancer_type][1]]
      print cancer_type
      clinical_data = util.get_clinical_data(os.path.join(clinical, c),
                                             extra_rows=extra_rows)
      print clinical_data

      copy_number = glob.glob(os.path.join(input_directory, cancer_type + '*.csv'))[0]
      print copy_number

      make_zscores(copy_number, clinical_data, outdir, extra_rows)
Ejemplo n.º 19
0
def all_cancer_types(copy_number_dir,
                     clinical_dir,
                     outdir,
                     parallel_workers=0):
    """Run `make_zscores` for every copy number file, serially or in a pool.

    Each copy number file is paired with the first clinical file whose name
    contains its cancer type.

    Args:
        copy_number_dir: directory with the copy number files.
        clinical_dir: directory searched for the matching clinical file.
        outdir: output directory passed through to the job.
        parallel_workers: 0 runs every job inline; any other value collects
            the jobs and runs them through `multiprocess` on a pool of that
            many workers.
    """
    copy_number_files = os.listdir(copy_number_dir)
    copy_number_files = util.remove_extraneous_files(copy_number_files)

    args = []
    for c in copy_number_files:
        infile = os.path.join(copy_number_dir, c)
        cancer_type = util.get_cancer_type(infile)
        clinical_file = glob.glob(
            os.path.join(clinical_dir, '*' + cancer_type + '*'))[0]

        if parallel_workers == 0:
            make_zscores(infile, clinical_file, outdir)
        else:
            args.append((infile, clinical_file, outdir))

    # Pool(0) raises ValueError, so the default serial mode used to crash
    # here after finishing all the work.
    if parallel_workers != 0:
        p = multiprocessing.Pool(parallel_workers)
        try:
            p.map(multiprocess, args)
        finally:
            p.close()
            p.join()
def main(argv=None):
    mutation_dir, clinical_dir, structural_breaks, outdir = get_options()
    mut_files = os.listdir(mutation_dir)
    mut_files = util.remove_extraneous_files(mut_files)
    mut_files = [os.path.join(mutation_dir, i) for i in mut_files]

    zscore_inputs = []
    for mut in mut_files:
        if '_' in mut:
            continue
        cancer_type = util.get_cancer_type(mut)
        print cancer_type
        clinical = glob.glob(
            os.path.join(clinical_dir, '*' + cancer_type + '*'))[0]
        breaks = glob.glob(
            os.path.join(structural_breaks, '*' + cancer_type + '*'))[0]
        zscore_inputs.append([mut, clinical, breaks, outdir])
        # make_zscores(mut, clinical, breaks, outdir)

    p = Pool(4)
    p.map(multiprocess_zscores, zscore_inputs)
Ejemplo n.º 21
0
def main():
  clinical_dir, row_names_file, outdir = get_options()
  files = os.listdir(clinical_dir)
  files = util.remove_extraneous_files(files)
  clinical_by_cancer_type = {util.get_cancer_type(f): f for f in files}

  row_names = pd.read_csv(row_names_file, header=0)

  for i, row  in row_names.iterrows():
    cancer_type = row['cancer_type']
    print cancer_type
    clinical_file = clinical_by_cancer_type[cancer_type]
    clinical_file = os.path.join(clinical_dir, clinical_file)
    if row['histological_subtype_row'] != 'EXTERNAL':
      make_clinical_data(clinical_file, row['histological_subtype_row'], outdir)
    else:
      subtype_data = prep_BRCA_data(row['external_file'], cancer_type)
      subtype_data.to_csv(os.path.join(outdir, 'BRCA_annotation_subtype_data.csv'))
      clinical = util.get_clinical_data(clinical_file)
      subtype_clinical = clinical.join(subtype_data['subtype'], how='outer')
      save_subtype_files(subtype_clinical, 'subtype', cancer_type, outdir)
def main():
  """Run the non-hypermutated z-score analysis for every clinical file.

  The hypermutated patient list is read as a single headerless column
  named 'patients'. Each clinical file is matched to a data file through
  the text before the first '.' in its name, and the pair is passed to
  `make_zscores` together with the patient list.
  """
  basedir, clinical_dir, hypermutated_patients, outdir = get_options()

  hypermutated = pd.read_csv(hypermutated_patients, header=None, names=['patients'])

  data_files = util.remove_extraneous_files(os.listdir(basedir))
  data_files_by_cancer_type = dict(
      (util.get_cancer_type(name), name) for name in data_files)

  clinical_files = util.remove_extraneous_files(os.listdir(clinical_dir))
  for clinical in clinical_files:
    # The cancer type is the part of the clinical file name before the
    # first dot.
    cancer_type = clinical.split('.')[0]
    data_path = os.path.join(basedir, data_files_by_cancer_type[cancer_type])
    clinical_path = os.path.join(clinical_dir, clinical)

    make_zscores(data_path, clinical_path, hypermutated, outdir)
Ejemplo n.º 23
0
def all_cancer_types(mutation_dir,
                     clinical_dir,
                     outdir,
                     metagene=None,
                     parallel_workers=0):
    """Run `calculate_cox` for every mutation file, serially or in a pool.

    Each mutation file is paired with the first clinical file whose name
    contains its cancer type. With `parallel_workers == 0` every job runs
    inline; otherwise the jobs are collected and, when `parallel_workers`
    is positive, executed through `multiprocess_zscores` on a pool of that
    size.
    """
    names = util.remove_extraneous_files(os.listdir(mutation_dir))
    mutation_paths = [os.path.join(mutation_dir, name) for name in names]

    run_inline = parallel_workers == 0
    queued = []
    for path in mutation_paths:
        cancer_type = util.get_cancer_type(path)
        clinical_pattern = os.path.join(clinical_dir, '*' + cancer_type + '*')
        clinical = glob.glob(clinical_pattern)[0]
        if run_inline:
            calculate_cox(path, clinical, outdir, metagene_file=metagene)
        else:
            queued.append([path, clinical, outdir, metagene])

    if parallel_workers > 0:
        pool = multiprocessing.Pool(parallel_workers)
        pool.map(multiprocess_zscores, queued)
Ejemplo n.º 24
0
def main():
    mutation_dir, clinical_dir, outdir = get_options()
    mutation_files = os.listdir(mutation_dir)
    mutation_files = util.remove_extraneous_files(mutation_files)

    results = pd.DataFrame()
    for mut in mutation_files:
        if '_' in mut:
            continue
        cancer_type = util.get_cancer_type(mut)
        print cancer_type
        clinical = glob.glob(
            os.path.join(clinical_dir, '*' + cancer_type + '*'))[0]

        clinical_data = pd.read_csv(clinical, index_col=0)
        mutation = mutation_base.prep_mutation_data(
            os.path.join(mutation_dir, mut), clinical_data)
        data = mutation[['\'TP53']].join(clinical_data, how='inner')
        print data

        wt_as = data[data['\'TP53'] == 0]['breaks']
        mut_as = data[data['\'TP53'] != 0]['breaks']

        wt_q = wt_as.quantile([0.10, 0.25, 0.50, 0.75, 0.90])
        mut_q = mut_as.quantile([0.10, 0.25, 0.50, 0.75, 0.90])

        statistic, p = stats.mannwhitneyu(wt_as, mut_as)

        wt_q['cancer_type'] = cancer_type
        wt_q['mut?'] = 'wt'
        mut_q['cancer_type'] = cancer_type
        mut_q['mut?'] = 'mut'
        wt_q['mann-whitney-p'] = p

        results = results.append(wt_q)
        results = results.append(mut_q)

    results = results.set_index(['cancer_type', 'mut?'])
    results.to_csv(os.path.join(outdir, 'breaks_and_p53_quantiles.csv'))
def make_zscores(copy_number, clinical_data, tumor_stage_data_dir, outdir):
    """Write multivariate Cox z-scores per gene, adjusted for tumor stage.

    The copy number CSV is transposed, its columns are named after the
    'Symbol' row, and it is inner-joined with `clinical_data` and with the
    tumor stage columns from `tumor_stage.prep_tumor_stage_data`. Every
    non-metadata column with more than 10 non-null values is fitted with
    `analysis.do_multivariate_cox`; results go to
    `<outdir>/<cancer_type>_extra_clinical_zscores.csv`.

    Returns early, writing nothing, when no tumor stage data is available.
    """
    cancer_type = util.get_cancer_type(copy_number)

    by_patient = pd.read_csv(copy_number).transpose()
    by_patient.columns = by_patient.loc['Symbol']
    clinical_and_cnv = by_patient.join(clinical_data, how='inner')

    tumor_stage_data, tumor_stage_cols = tumor_stage.prep_tumor_stage_data(
        tumor_stage_data_dir, cancer_type)
    if tumor_stage_data is None:
        return

    merged = clinical_and_cnv.join(tumor_stage_data[tumor_stage_cols],
                                   how='inner')

    outfile = os.path.join(outdir, cancer_type + '_extra_clinical_zscores.csv')
    header, formatstring = tumor_stage.tumor_stage_output_header_and_format(
        4, tumor_stage_cols)

    with open(outfile, 'w') as out:
        out.write('gene,zscore,pvalue,num patients')
        out.write(header)
        out.write('\n')
        for gene in merged:
            # Skip the survival and tumor stage columns themselves.
            if gene in ['time', 'censor'] + tumor_stage_cols:
                continue
            if merged[gene].count() <= 10:
                continue
            cox_dict = analysis.do_multivariate_cox(merged.time,
                                                    merged.censor,
                                                    merged[gene],
                                                    merged[tumor_stage_cols])
            group_zscores = tumor_stage.zscores_for_tumor_stage_cols(
                cox_dict, tumor_stage_cols)
            line = formatstring.format(gene, cox_dict['var-z'],
                                       cox_dict['var-p'], cox_dict['var-n'],
                                       *group_zscores)
            out.write(line)
Ejemplo n.º 26
0
def main(argv=None):
    if argv is None:
        argv = sys.argv
        infile, indir, outdir = get_options()

        requested_data = read_requested_data(infile)
        requested_data = requested_data.groupby('gene')['positions'].apply(
            lambda l: [item for sublist in l for item in sublist])

        files = os.listdir(indir)
        files.remove('.DS_Store')
        files.remove('HG36_HG37')
        output_data = []
        for f in files:
            cancer_type = util.get_cancer_type(f)
            print cancer_type
            zscores = calculate_cox_for_cancer_type(requested_data,
                                                    os.path.join(indir, f),
                                                    outdir)
            output_data.append(zscores)
        df = pd.concat(output_data, axis=1)
        df.to_csv('scratch/zscores_by_gene_hotspot.csv')
def main(argv=None):
    """Collect trichotomized copy number z-scores across all cancer types.

    Each copy number file is paired with the first clinical file whose name
    contains its cancer type and passed to `make_cn_zscores`. The records
    it returns are gathered into one table indexed by cancer type and gene
    and saved as `<outdir>/trichotomized_copy_number_zscores.csv`.

    Args:
        argv: unused; options come from `get_options()`.
    """
    cnv_dir, clinical, interesting_genes_file, outdir = get_options()
    cnv_names = util.remove_extraneous_files(os.listdir(cnv_dir))
    cnv_paths = [os.path.join(cnv_dir, name) for name in cnv_names]

    interesting_genes = pd.read_csv(interesting_genes_file,
                                    index_col=0,
                                    header=None)

    records = []
    for cnv in cnv_paths:
        cancer_type = util.get_cancer_type(cnv)
        clinical_pattern = os.path.join(clinical, '*' + cancer_type + '*')
        clinical_file = glob.glob(clinical_pattern)[0]
        records.extend(
            make_cn_zscores(cnv, clinical_file, interesting_genes, outdir))

    results_df = pd.DataFrame(records).set_index(['cancer_type', 'gene'])
    out_path = os.path.join(outdir, 'trichotomized_copy_number_zscores.csv')
    results_df.to_csv(out_path)
def main(argv=None):
    if argv is None:
        argv = sys.argv
        input_directory, clinical, outdir, extra_data_dir = get_options()
        clinical_files = os.listdir(clinical)
        clinical_files = util.remove_extraneous_files(clinical_files)
        extra_data_col = 'Purity_InfiniumPurify'

        for c in clinical_files[3:]:
            cancer_type = util.get_cancer_type(c)
            print cancer_type

            if cancer_type == 'COADREAD':
                extra_data = prep_extra_data(extra_data_dir, 'COAD')
            else:
                extra_data = prep_extra_data(extra_data_dir, cancer_type)
            clinical_data = util.get_clinical_data(os.path.join(clinical, c))

            copy_number = glob.glob(
                os.path.join(input_directory, cancer_type + '*.csv'))[0]

            make_zscores(copy_number, clinical_data, outdir, extra_data,
                         extra_data_col)
Ejemplo n.º 29
0
def make_zscores(copy_number, clinical_data, outdir, extra_clinical_rows=None):
  """Write Cox z-scores per gene, adjusted for one extra clinical column.

  The copy number CSV is transposed, its columns are named after the
  'Symbol' row, and it is inner-joined with `clinical_data`. Each gene
  column with more than 10 non-null values is fitted with
  `analysis.do_metagene_cox`, using the first entry of
  `extra_clinical_rows` as the 'metagene' covariate. Results go to
  `<outdir>/<cancer_type>_extra_clinical_zscores.csv`.

  Args:
    copy_number: path to the copy number CSV.
    clinical_data: DataFrame with `time`, `censor` and the extra column.
    outdir: output directory.
    extra_clinical_rows: sequence whose first element names the covariate
      column in `clinical_data`.

  Raises:
    ValueError: if `extra_clinical_rows` is None or empty.
  """
  # Fail with a clear message instead of a TypeError halfway through the run.
  if extra_clinical_rows is None or len(extra_clinical_rows) == 0:
    raise ValueError('extra_clinical_rows must name at least one clinical column')
  extra_col = extra_clinical_rows[0]

  df = pd.read_csv(copy_number)
  df_by_patient = df.transpose()
  df_by_patient.columns = df_by_patient.loc['Symbol']
  clinical_and_cnv = df_by_patient.join(clinical_data, how='inner')

  cancer_type = util.get_cancer_type(copy_number)
  formatstring = '{0}, {1}, {2}, {3}, {4}, {5}\n'
  outfile = os.path.join(outdir, cancer_type + '_extra_clinical_zscores.csv')

  with open(outfile, 'w') as out:
    out.write('gene,zscore,pvalue,clinical-row-zscore,clinical-row-pvalue,num patients\n')
    for gene in clinical_and_cnv:
      # Skip metadata, including the covariate column itself: it used to be
      # regressed against its own copy as if it were a gene.
      if gene in ('time', 'censor', extra_col):
        continue
      if clinical_and_cnv[gene].count() > 10:
        cox_dict = analysis.do_metagene_cox(clinical_and_cnv.time,
                                            clinical_and_cnv.censor,
                                            clinical_and_cnv[gene],
                                            clinical_and_cnv[extra_col].rename('metagene'))
        out.write(formatstring.format(
                      gene, cox_dict['z'], cox_dict['p'],
                      cox_dict['metagene-z'], cox_dict['metagene-p'],
                      cox_dict['n']))
Ejemplo n.º 30
0
def make_cnv_zscores(copy_number, clinical, gene_list):
    """Return univariate Cox results for the selected copy number genes.

    The copy number CSV is transposed, its columns are named after the
    'Symbol' row and restricted to `gene_list`, then inner-joined with the
    clinical data loaded from `clinical`. Each gene column is fitted with
    `analysis.do_cox`.

    Returns:
        DataFrame with one row per gene holding the Cox result fields plus
        'cancer_type' and 'gene'.
    """
    cancer_type = util.get_cancer_type(copy_number)

    by_patient = pd.read_csv(copy_number).transpose()
    by_patient.columns = by_patient.loc['Symbol']
    selected = by_patient[gene_list]

    clinical_data = util.get_clinical_data(clinical)
    clinical_and_cnv = selected.join(clinical_data, how='inner')

    results = pd.DataFrame()
    for gene in clinical_and_cnv:
        if gene not in ['time', 'censor']:
            cox_dict = analysis.do_cox(clinical_and_cnv.time,
                                       clinical_and_cnv.censor,
                                       clinical_and_cnv[gene])
            cox_dict['cancer_type'] = cancer_type
            cox_dict['gene'] = gene
            results = results.append(cox_dict, ignore_index=True)
    return results