def main():
    """Queue one z-score job per clinical file and run them in a worker pool.

    ``get_options()`` supplies the data directory, the clinical directory and
    the output directory. Each clinical file is paired with the data file of
    the same cancer type, and the resulting
    ``(data_path, clinical_path, outdir)`` tuples are mapped over
    ``multiprocess_zscores`` using 10 worker processes.

    Raises:
        KeyError: if a clinical file's cancer type has no matching data file.
    """
    basedir, clinical_dir, outdir = get_options()

    # If two data files share a cancer type, the later listing entry wins.
    data_file_for = {}
    for data_name in util.remove_extraneous_files(os.listdir(basedir)):
        data_file_for[util.get_cancer_type(data_name)] = data_name

    clinical_names = util.remove_extraneous_files(os.listdir(clinical_dir))
    # The cancer type of a clinical file is the text before its first '.'.
    jobs = [
        (os.path.join(basedir, data_file_for[clinical_name.split('.')[0]]),
         os.path.join(clinical_dir, clinical_name),
         outdir)
        for clinical_name in clinical_names
    ]

    workers = Pool(10)
    workers.map(multiprocess_zscores, jobs)
# Example no. 2
def main():
    """Compute per-cancer-type Cox results and KM splits for tumor purity.

    ``get_options()`` supplies the clinical directory, the output directory
    and a header file. The header file is read without a header row and is
    indexed by its first column; column ``1`` of a cancer type's row names the
    purity column requested from that cancer type's clinical data.

    For every clinical file the purity column is handed to
    ``analysis.do_cox``, and patients are split at the mean purity
    (``<= mean`` -> 0, otherwise 1) for ``analysis.do_km``. The collected Cox
    results are written, one row per cancer type, to
    ``simple_purity_zscores.csv`` in the output directory.
    """
    clinical_dir, output_dir, header_file = get_options()
    headers = pd.read_csv(header_file, index_col=0, header=None)
    clinical_files = util.remove_extraneous_files(os.listdir(clinical_dir))

    zscore_data = {}
    for f in clinical_files:
        clinical_path = os.path.join(clinical_dir, f)
        cancer_type = util.get_cancer_type(f)
        # .at is the supported scalar accessor; DataFrame.get_value was
        # deprecated in pandas 0.21 and removed in pandas 1.0.
        purity_header = headers.at[cancer_type, 1]

        clinical = util.get_clinical_data(clinical_path,
                                          extra_rows=[purity_header])

        zscore_data[cancer_type] = analysis.do_cox(clinical.time,
                                                   clinical.censor,
                                                   clinical[purity_header])

        purity = clinical[purity_header]
        purity_mean = purity.mean()
        print(purity_mean)
        # 0 = at or below the mean purity, 1 = above it.
        clinical['km_split'] = np.where(purity <= purity_mean, 0, 1)
        analysis.do_km(cancer_type, clinical.time, clinical.censor,
                       clinical.km_split, output_dir)

    # zscore_data is keyed by cancer type; transpose to get one row per type.
    out_df = pd.DataFrame(zscore_data).transpose()
    out_df.to_csv(os.path.join(output_dir, 'simple_purity_zscores.csv'))
# Example no. 3
def main():
    """Run the single-cancer-type CNA and mutation analyses per subdirectory.

    ``get_options()`` supplies the input and output directories. Each
    processed subdirectory of the input directory must contain a file
    matching ``*.cnv.*``, one matching ``*clinical.*`` and one matching
    ``*mutations*``; the first glob hit of each is read with the separator
    chosen by ``util.get_sep_from_filename``.

    Raises:
        ValueError: if the filtered listing has no ``'output'`` entry.
        IndexError: if one of the globs matches nothing.
    """
    indir, outdir = get_options()
    entries = os.listdir(indir)
    print('%s %s' % (indir, outdir))
    entries = util.remove_extraneous_files(entries)
    entries.remove('output')

    # NOTE(review): the first two remaining entries are skipped on purpose or
    # as a leftover from a partial re-run -- confirm before relying on it.
    for cancer_dir in entries[2:]:
        print(cancer_dir)
        cna_pattern = os.path.join(indir, cancer_dir, '*.cnv.*')
        print(cna_pattern)
        cna_path = glob.glob(cna_pattern)[0]
        cna = pd.read_csv(cna_path,
                          sep=util.get_sep_from_filename(cna_path),
                          index_col=0)

        clinical_path = glob.glob(
            os.path.join(indir, cancer_dir, '*clinical.*'))[0]
        clinical = pd.read_csv(clinical_path,
                               index_col=0,
                               sep=util.get_sep_from_filename(clinical_path))
        # Only the survival columns are passed on.
        clinical = clinical[['Time', 'Censor']]

        mut_path = glob.glob(os.path.join(indir, cancer_dir, '*mutations*'))[0]
        raw_mutations = pd.read_csv(mut_path,
                                    low_memory=False,
                                    sep=util.get_sep_from_filename(mut_path))
        mutations = prep_mutations(cancer_dir, raw_mutations, clinical)

        do_single_cancer_type_cna(cancer_dir, clinical, cna, outdir)
        do_single_cancer_type_mutation(cancer_dir, clinical, mutations, outdir)
def main(argv=None):
    """Run ``calculate_cox`` per cancer type and write a pan-cancer table.

    ``get_options()`` supplies the mutation directory, the clinical directory
    and the output directory. For every clinical file the first mutation file
    whose name contains the cancer type is selected and passed, together with
    the loaded clinical data, to ``calculate_cox``. The per-cancer-type
    results are written, one row per cancer type, to ``pancan.csv`` in the
    output directory.

    Args:
        argv: accepted for call compatibility; not consulted here.

    Raises:
        IndexError: if no mutation file matches a cancer type.
    """
    mutation_dir, clinical_dir, outdir = get_options()

    clinical_names = util.remove_extraneous_files(os.listdir(clinical_dir))

    # The work is done serially; the previously created (and never used)
    # one-worker Pool only spawned an idle process and has been removed.
    pancan = {}
    for name in clinical_names:
        clinical = os.path.join(clinical_dir, name)
        cancer_type = util.get_cancer_type(clinical)
        print(cancer_type)
        mutation = glob.glob(
            os.path.join(mutation_dir, '*' + cancer_type + '*'))[0]

        clinical_data = util.get_clinical_data(clinical)
        pancan[cancer_type] = calculate_cox(mutation, clinical_data, outdir)

    # pancan is keyed by cancer type; transpose to get one row per type.
    pancan_df = pd.DataFrame(pancan).transpose()
    pancan_df.to_csv(os.path.join(outdir, 'pancan.csv'))
def main(argv=None):
    """Run ``multiprocess_zscores`` per input file and write a summary CSV.

    ``get_options()`` supplies the copy-number change-size directory, the
    clinical directory (unused here) and the output directory. Input file
    names are parsed as ``<cancer type>_<gene>.<ext>``. Each call's result is
    treated as a mapping whose first key is ``<cancer type>_<gene>`` and
    whose value holds ``'z'`` and ``'any_change_count'``; these are written
    to ``cox_any_change_results.csv`` in the output directory.

    Args:
        argv: accepted for call compatibility; not consulted here.

    Raises:
        IndexError: if a file name (or result key) contains no ``'_'``.
    """
    cn_change_size_dir, clinical_dir, outdir = get_options()
    input_names = util.remove_extraneous_files(os.listdir(cn_change_size_dir))

    results = []
    for name in input_names:
        input_file = os.path.join(cn_change_size_dir, name)
        name_parts = os.path.basename(input_file).split('_')
        cancer_type = name_parts[0]
        gene = name_parts[1].split('.')[0]
        print('%s %s' % (cancer_type, gene))

        results.append(multiprocess_zscores([input_file, cancer_type, gene]))

    with open(os.path.join(outdir, 'cox_any_change_results.csv'), 'w') as out:
        formatstr = '{},{},{},{}\n'
        out.write('Cancer Type,Gene,Z Score,Count\n')
        for cox_dict in results:
            # First key of the mapping; next(iter(...)) works on Python 2
            # and 3, unlike indexing dict.keys().
            cancer_type_gene = next(iter(cox_dict))
            d = cox_dict[cancer_type_gene]
            print(cancer_type_gene)
            print(d)
            key_parts = cancer_type_gene.split('_')
            out.write(
                formatstr.format(key_parts[0], key_parts[1], d['z'],
                                 d['any_change_count']))
def main():
    """Compute Cox results and KM splits for ``Purity_InfiniumPurify``.

    ``get_options()`` supplies the clinical directory, the output directory
    and the extra-data directory. For each clinical file the result of
    ``prep_extra_data`` is joined onto the clinical data, the purity column
    is passed to ``analysis.do_cox``, and patients are split at the mean
    purity (``<= mean`` -> 0, otherwise 1) for ``analysis.do_km``. Results
    are written, one row per cancer type, to
    ``add_data_simple_purity_zscores.csv`` in the output directory.
    """
    clinical_dir, output_dir, extra_data_dir = get_options()
    clinical_names = util.remove_extraneous_files(os.listdir(clinical_dir))

    zscore_data = {}
    for fname in clinical_names:
        clinical_path = os.path.join(clinical_dir, fname)
        cancer_type = util.get_cancer_type(fname)
        # COADREAD uses the extra data prepared under the name 'COAD'.
        extra_key = 'COAD' if cancer_type == 'COADREAD' else cancer_type
        extra_data = prep_extra_data(extra_data_dir, extra_key)

        clinical = util.get_clinical_data(clinical_path).join(extra_data)
        purity_header = 'Purity_InfiniumPurify'

        zscore_data[cancer_type] = analysis.do_cox(clinical.time,
                                                   clinical.censor,
                                                   clinical[purity_header])

        purity_mean = clinical[purity_header].mean()
        print(purity_mean)
        above_mean = np.where(clinical[purity_header] <= purity_mean, 0, 1)
        clinical['km_split'] = above_mean
        analysis.do_km(cancer_type, clinical.time, clinical.censor,
                       clinical.km_split, output_dir)

    # One row per cancer type after the transpose.
    out_path = os.path.join(output_dir, 'add_data_simple_purity_zscores.csv')
    pd.DataFrame(zscore_data).transpose().to_csv(out_path)
def main(argv=None):
    """Build per-cancer-type Cox jobs and run them in a 16-worker pool.

    ``get_options()`` supplies the mutation directory, the clinical
    directory, the output directory and an optional univariate-output
    directory. Each job is the tuple
    ``(mutation_file, clinical_data, cancer_type_outdir, univariate_file)``;
    ``univariate_file`` is ``None`` when no univariate directory was given.
    A per-cancer-type output directory is created when missing.

    Args:
        argv: accepted for call compatibility; not consulted here.

    Raises:
        IndexError: if a mutation or univariate glob matches nothing.
    """
    mutation_dir, clinical_dir, outdir, univariate_output = get_options()

    clinical_paths = [
        os.path.join(clinical_dir, name)
        for name in util.remove_extraneous_files(os.listdir(clinical_dir))
    ]
    pool = Pool(16)

    jobs = []
    for clinical_path in clinical_paths:
        cancer_type = util.get_cancer_type(clinical_path)
        print(cancer_type)
        mutation_pattern = os.path.join(mutation_dir, '*' + cancer_type + '*')
        mutation = glob.glob(mutation_pattern)[0]

        if univariate_output:
            univariate_pattern = os.path.join(
                univariate_output, cancer_type,
                cancer_type + '.zscores.out.csv')
            univariate_file = glob.glob(univariate_pattern)[0]
            print(univariate_file)
        else:
            univariate_file = None

        clinical_data = util.get_clinical_data(clinical_path)
        cancer_type_outdir = os.path.join(outdir, cancer_type)
        if not os.path.isdir(cancer_type_outdir):
            os.makedirs(cancer_type_outdir)
        jobs.append(
            (mutation, clinical_data, cancer_type_outdir, univariate_file))

    print(jobs)
    pool.map(multiprocess_zscores, jobs)
def main(argv=None):
    """Build tumor-stage-aware Cox jobs and run them in a 16-worker pool.

    ``get_options()`` supplies the mutation directory, the clinical
    directory, the output directory and the tumor-stage directory. A cancer
    type is skipped when ``<cancer type>_clinical.csv`` is missing from the
    tumor-stage directory. Each remaining job is the tuple
    ``(mutation_file, clinical_data, tumor_stage_file, cancer_type_outdir)``.

    Args:
        argv: accepted for call compatibility; not consulted here.

    Raises:
        IndexError: if no mutation file matches a cancer type (checked before
            the tumor-stage file is looked up).
    """
    mutation_dir, clinical_dir, outdir, tumor_stage_dir = get_options()

    clinical_paths = [
        os.path.join(clinical_dir, name)
        for name in util.remove_extraneous_files(os.listdir(clinical_dir))
    ]
    pool = Pool(16)

    jobs = []
    for clinical_path in clinical_paths:
        cancer_type = util.get_cancer_type(clinical_path)
        print(cancer_type)
        mutation_pattern = os.path.join(mutation_dir, '*' + cancer_type + '*')
        mutation = glob.glob(mutation_pattern)[0]

        tumor_stage = os.path.join(tumor_stage_dir,
                                   cancer_type + '_clinical.csv')
        if os.path.isfile(tumor_stage):
            clinical_data = util.get_clinical_data(clinical_path)
            cancer_type_outdir = os.path.join(outdir, cancer_type)
            if not os.path.isdir(cancer_type_outdir):
                os.makedirs(cancer_type_outdir)
            jobs.append(
                (mutation, clinical_data, tumor_stage, cancer_type_outdir))

    pool.map(multiprocess_zscores, jobs)
# Example no. 9
def main():
    """Apply each criteria function to the CNA and mutation files.

    ``get_options()`` supplies the input directory, the output directory and
    a ``split_files`` flag. File names are listed from ``<indir>/cnas``; the
    same name is read from ``<indir>/mutations``. The cancer type is the text
    before the first ``'_'`` in the file name.

    When ``split_files`` is false, all results for a criteria function are
    written to one shared, open file named ``<criteria name>.out.csv``.
    When it is true, each cancer type gets its own
    ``<cancer type>_CNA_.criteria_met.out.csv`` and
    ``<cancer type>_MUT_.criteria_met.out.csv`` paths.
    """
    indir, outdir, split_files = get_options()
    files = util.remove_extraneous_files(
        os.listdir(os.path.join(indir, 'cnas')))

    criteria_list = [stouffer_sig_and_two_zscore_sig]

    for criteria in criteria_list:
        shared_out = None
        if not split_files:
            shared_out = open(
                os.path.join(outdir, criteria.__name__ + '.out.csv'), 'w')
        try:
            for f in files:
                cancer_type = f.split('_')[0]
                if split_files:
                    outfile_c = os.path.join(
                        outdir, cancer_type + '_CNA_.criteria_met.out.csv')
                    outfile_m = os.path.join(
                        outdir, cancer_type + '_MUT_.criteria_met.out.csv')
                else:
                    outfile_c = outfile_m = shared_out

                cna_met = apply_criteria(os.path.join(indir, 'cnas', f),
                                         criteria, 'cna')
                cna_met.index = 'cna_' + cna_met.index
                cna_met.to_csv(outfile_c, index_label='CNA_' + cancer_type)

                mut_met = apply_criteria(os.path.join(indir, 'mutations', f),
                                         criteria, 'mut')
                print('%s %s' % (f, mut_met.index))
                mut_met.index = 'mut_' + mut_met.index
                mut_met.to_csv(outfile_m, index_label='MUT_' + cancer_type)
        finally:
            # The original closed an undefined name (`outfile`) once after
            # all criteria, raising NameError; close the real handle here,
            # once per criteria function.
            if shared_out is not None:
                shared_out.close()
# Example no. 10
def main():
    """Relate per-patient break counts to survival for each cancer type.

    ``get_options()`` supplies the copy-number directory, the clinical
    directory and the output directory. For every copy-number file the break
    counts from ``count_breaks`` are re-indexed by patient identifier,
    inner-joined with the clinical data, saved as
    ``<cancer type>_breaks.csv``, and passed to ``analysis.do_cox``. All Cox
    results are collected into ``cox_results.csv``.

    Raises:
        IndexError: if no ``*<cancer type>*.txt`` clinical file exists.
    """
    copy_number_loc, clinical, outdir = get_options()
    cna_names = util.remove_extraneous_files(os.listdir(copy_number_loc))

    # NOTE(review): DataFrame.append was removed in pandas 2.0; this code
    # relies on an older pandas.
    results = pd.DataFrame()
    for cna_name in cna_names:
        cancer_type = util.get_cancer_type(cna_name)
        print(cancer_type)

        clinical_pattern = os.path.join(clinical, '*' + cancer_type + '*.txt')
        clin = util.get_clinical_data(glob.glob(clinical_pattern)[0])

        breaks = count_breaks(os.path.join(copy_number_loc, cna_name))
        breaks = util.maybe_clear_non_01s(breaks.reset_index(), 'Sample',
                                          cancer_type)
        breaks = util.add_identifier_column(breaks, 'Sample')
        # Index by identifier and discard the raw sample column.
        breaks = breaks.set_index('identifier').drop('Sample', axis=1)

        breaks_and_clin = breaks.join(clin, how='inner')
        breaks_and_clin.to_csv(
            os.path.join(outdir, cancer_type + '_breaks.csv'))

        cox = analysis.do_cox(breaks_and_clin.time, breaks_and_clin.censor,
                              breaks_and_clin.breaks)
        cox['cancer_type'] = cancer_type
        results = results.append(cox, ignore_index=True)

    results.to_csv(os.path.join(outdir, 'cox_results.csv'))
# Example no. 11
def main(argv=None):
    """Collect CNV and mutation z-scores for a gene list across cancer types.

    ``get_options()`` supplies the CNV directory, the mutation directory, the
    clinical directory, a gene-list file and the output directory. The first
    column of the gene-list file (read without a header) is prefixed with a
    single quote before use. Results from ``make_cnv_zscores`` and
    ``make_mutation_zscores`` are appended across cancer types and written to
    ``cnv_zscores_w_hazards_fig1.csv`` and
    ``mutation_zscores_w_hazards_fig1.csv`` without the index.

    Args:
        argv: accepted for call compatibility; not consulted here.

    Raises:
        IndexError: if a mutation or clinical glob matches nothing.
    """
    cnv_dir, mutation_dir, clinical_dir, gene_list, outdir = get_options()
    gene_column = pd.read_csv(gene_list, header=None)[0]
    gene_list = '\'' + gene_column

    cnv_paths = [
        os.path.join(cnv_dir, name)
        for name in util.remove_extraneous_files(os.listdir(cnv_dir))
    ]

    # NOTE(review): DataFrame.append was removed in pandas 2.0; this code
    # relies on an older pandas.
    mutation_results = pd.DataFrame()
    cnv_results = pd.DataFrame()
    for cnv_path in cnv_paths:
        cancer_type = util.get_cancer_type(cnv_path)
        print(cancer_type)
        mutation = glob.glob(os.path.join(mutation_dir, cancer_type + '*'))[0]
        clinical_pattern = os.path.join(clinical_dir, '*' + cancer_type + '*')
        clinical = glob.glob(clinical_pattern)[0]

        cnv_scores = make_cnv_zscores(cnv_path, clinical, gene_list)
        cnv_results = cnv_results.append(cnv_scores)
        mutation_scores = make_mutation_zscores(mutation, clinical, gene_list)
        mutation_results = mutation_results.append(mutation_scores)

    mutation_out = os.path.join(outdir, 'mutation_zscores_w_hazards_fig1.csv')
    mutation_results.to_csv(mutation_out, index=False)
    cnv_out = os.path.join(outdir, 'cnv_zscores_w_hazards_fig1.csv')
    cnv_results.to_csv(cnv_out, index=False)
def main(argv=None):
    """Run ``multiprocess_data`` for cancer types that have genes of interest.

    ``get_options()`` supplies the CNV directory, the RNA directory, the
    mutation directory, the clinical directory, the output directory and an
    input file of interesting genes (``#`` lines are comments). The ``Gene``
    column is prefixed with a single quote. Cancer types with no rows in the
    gene table are skipped. The per-cancer-type results are concatenated
    column-wise (duplicate column labels raise) and written to
    ``corr_results.csv`` in the output directory.

    Args:
        argv: accepted for call compatibility; not consulted here.

    Raises:
        IndexError: if an RNA, mutation or clinical glob matches nothing.
    """
    cnv_dir, rna, mutation_dir, clinical_dir, outdir, input_file = get_options()
    cnv_files = [
        os.path.join(cnv_dir, name)
        for name in util.remove_extraneous_files(os.listdir(cnv_dir))
    ]

    interesting_genes = pd.read_csv(input_file, comment='#')
    interesting_genes['Gene'] = '\'' + interesting_genes['Gene']

    zscore_inputs = []
    for cnv in cnv_files:
        cancer_type = util.get_cancer_type(cnv)
        cancer_type_genes = interesting_genes[
            interesting_genes['Cancer Type'] == cancer_type]
        if len(cancer_type_genes) == 0:
            continue
        print(cancer_type)
        print(cancer_type_genes)
        rnaseq = glob.glob(os.path.join(rna, cancer_type + '*'))[0]
        mutation = glob.glob(os.path.join(mutation_dir, cancer_type + '*'))[0]
        clinical = glob.glob(
            os.path.join(clinical_dir, '*' + cancer_type + '*'))[0]

        zscore_inputs.append(
            [cnv, rnaseq, mutation, clinical, outdir, cancer_type_genes])

    p = Pool(4)
    corr_results = p.map(multiprocess_data, zscore_inputs)
    df = pd.concat(corr_results, verify_integrity=True, axis=1)
    print(df)
    df.to_csv(os.path.join(outdir, 'corr_results.csv'))
# Example no. 13
def all_cancer_types(copy_number_dir,
                     annotation_file,
                     outdir,
                     parallel_workers=0):
    """Convert every copy-number file in a directory to ``<type>.cnv.csv``.

    Args:
        copy_number_dir: directory holding the copy-number input files.
        annotation_file: path handed to ``process_annotation_file``.
        outdir: directory that receives ``<type name>.cnv.csv``, where the
            type name is the input file name up to its first ``'.'``.
        parallel_workers: ``0`` processes each file in this process as it is
            encountered; a positive value defers all files and maps them
            over ``multiprocess`` in a pool of that many workers. A negative
            value neither processes nor dispatches anything.
    """
    copy_number_files = util.remove_extraneous_files(
        os.listdir(copy_number_dir))

    # returns a dataframe indexed by gene name, with chr number and txstart
    annotation_data = process_annotation_file(annotation_file)

    deferred = []
    for file_name in copy_number_files:
        infile = os.path.join(copy_number_dir, file_name)
        type_name = os.path.basename(infile).split('.')[0]
        outfile = os.path.join(outdir, type_name + '.cnv.csv')

        if parallel_workers != 0:
            deferred.append((infile, outfile, annotation_data))
            continue

        # returns a dict of patient_ids => lists of interval trees containing
        # range data for each chromosome
        patient_data = process_input_file(infile)
        process_and_write_data(outfile, annotation_data, patient_data)

    if parallel_workers > 0:
        pool = multiprocessing.Pool(parallel_workers)
        pool.map(multiprocess, deferred)
def main():
    """Call ``count_tumor_groups`` for every clinical file.

    ``get_options()`` supplies the clinical directory and the tumor-groups
    directory. Each clinical file is paired with ``<cancer type>.csv`` from
    the tumor-groups directory.
    """
    clinical_dir, tumor_groups = get_options()

    for fname in util.remove_extraneous_files(os.listdir(clinical_dir)):
        cancer_type = util.get_cancer_type(fname)
        count_tumor_groups(
            os.path.join(clinical_dir, fname),
            os.path.join(tumor_groups, cancer_type + '.csv'))
def main():
    """Run ``make_zscores`` per clinical file with the hypermutated list.

    ``get_options()`` supplies the data directory, the clinical directory, a
    hypermutated-patients file and the output directory. The
    hypermutated-patients file is read without a header into a single
    ``patients`` column. Each clinical file is paired with the data file of
    the same cancer type; the clinical cancer type is the text before the
    first ``'.'`` in the file name.

    Raises:
        KeyError: if a clinical file's cancer type has no matching data file.
    """
    basedir, clinical_dir, hypermutated_patients, outdir = get_options()

    hypermutated = pd.read_csv(hypermutated_patients,
                               header=None,
                               names=['patients'])

    data_files = util.remove_extraneous_files(os.listdir(basedir))
    data_files_by_cancer_type = {
        util.get_cancer_type(f): f
        for f in data_files
    }

    clinical_files = util.remove_extraneous_files(os.listdir(clinical_dir))
    for clinical in clinical_files:
        cancer_type = clinical.split('.')[0]
        data_file = data_files_by_cancer_type[cancer_type]

        make_zscores(os.path.join(basedir, data_file),
                     os.path.join(clinical_dir, clinical),
                     hypermutated,
                     outdir)
# Example no. 16
def main(argv=None):
    """Pair each CNV file with mutation and clinical files and map the jobs.

    ``get_options()`` supplies the CNV directory, the mutation directory, the
    clinical directory and the output directory. Each job is the list
    ``[cnv, mutation, clinical, outdir]`` and is mapped over
    ``multiprocess_zscores`` with 4 worker processes.

    Args:
        argv: accepted for call compatibility; not consulted here.

    Raises:
        IndexError: if a mutation or clinical glob matches nothing.
    """
    cnv_dir, mutation_dir, clinical_dir, outdir = get_options()
    cnv_paths = [
        os.path.join(cnv_dir, name)
        for name in util.remove_extraneous_files(os.listdir(cnv_dir))
    ]

    jobs = []
    for cnv_path in cnv_paths:
        cancer_type = util.get_cancer_type(cnv_path)
        mutation_pattern = os.path.join(mutation_dir, cancer_type + '*')
        mutation = glob.glob(mutation_pattern)[0]
        clinical_pattern = os.path.join(clinical_dir, '*' + cancer_type + '*')
        clinical = glob.glob(clinical_pattern)[0]
        jobs.append([cnv_path, mutation, clinical, outdir])

    pool = Pool(4)
    pool.map(multiprocess_zscores, jobs)
def main():
    """Compute z-scores for cBioPortal files not yet given an output dir.

    ``get_options()`` supplies the input directory, the clinical directory
    and the output directory. A file is processed only when its
    ``<outdir>/<cancer type>`` directory does not exist yet; the directory is
    created and ``calculate_zscores_for_file`` is run. Cancer types whose
    directory already exists are skipped.
    """
    indir, clinical_dir, outdir = get_options()

    for fname in util.remove_extraneous_files(os.listdir(indir)):
        cancer_type = get_cbioportal_cancer_type(fname)
        print(cancer_type)
        clinical_file = os.path.join(clinical_dir,
                                     cancer_type + '_clinical.csv')
        cancer_type_outdir = os.path.join(outdir, cancer_type)

        # An existing output directory means this cancer type is skipped.
        if os.path.isdir(cancer_type_outdir):
            continue

        os.makedirs(cancer_type_outdir)
        clinical = get_cbioportal_clinical(clinical_file)
        calculate_zscores_for_file(os.path.join(indir, fname), clinical,
                                   cancer_type_outdir, cancer_type)
def main(argv=None):
    """Run ``make_zscores`` for cBioPortal CNV files without existing output.

    ``get_options()`` supplies the input directory, the clinical directory
    and the output directory. A CNV file is processed only when nothing in
    the output directory matches ``<cancer type>*``.

    Previously the whole body was nested under ``if argv is None``, so
    calling ``main(some_argv)`` silently did nothing. The work now runs
    regardless of ``argv``.

    Args:
        argv: accepted for call compatibility; options come from
            ``get_options()`` and ``argv`` is not consulted.

    Raises:
        IndexError: if no clinical file matches a cancer type.
    """
    input_directory, clinical_directory, outdir = get_options()

    cnv_files = util.remove_extraneous_files(os.listdir(input_directory))
    for cnv in cnv_files:
        cancer_type = get_cbioportal_cancer_type(cnv)
        print(cancer_type)
        clinical_file = glob.glob(
            os.path.join(clinical_directory, '*' + cancer_type + '*'))[0]

        # Skip cancer types that already have output.
        if glob.glob(os.path.join(outdir, cancer_type + '*')):
            continue

        print(cancer_type)
        make_zscores(os.path.join(input_directory, cnv), clinical_file,
                     outdir)
def main(argv=None):
    """Run ``make_zscores`` for each ICGC copy-number file.

    ``get_options()`` supplies the input directory, the clinical directory
    and the output directory. The clinical file ``<cancer type>.csv`` is read
    with its first column as the index, and only the ``Time`` and ``Censor``
    columns, cast to float, are passed on.

    Args:
        argv: accepted for call compatibility; not consulted here.
    """
    indir, clinical_dir, outdir = get_options()

    for copy_number in util.remove_extraneous_files(os.listdir(indir)):
        cancer_type = get_icgc_cancer_type(copy_number)
        print(cancer_type)

        clinical_file = os.path.join(clinical_dir, cancer_type + '.csv')
        clinical = pd.read_csv(clinical_file, index_col=0, low_memory=False)
        relevant_clinical = clinical[['Time', 'Censor']].astype(float)

        make_zscores(os.path.join(indir, copy_number), relevant_clinical,
                     outdir)
def main(argv=None):
    """Run per-gene Cox jobs in a pool and write ``cox_results.csv``.

    ``get_options()`` supplies the CNV directory, the mutation directory, the
    clinical directory, the output directory and an input file of
    interesting genes (``#`` lines are comments). The ``Gene`` column is
    prefixed with a single quote. Cancer types with no rows in the gene
    table are skipped. Each pool result is treated as a mapping whose first
    key is the cancer type and whose value maps gene -> result dict.

    Args:
        argv: accepted for call compatibility; not consulted here.

    Raises:
        IndexError: if a mutation or clinical glob matches nothing.
    """
    cnv_dir, mutation_dir, clinical_dir, outdir, input_file = get_options()
    cnv_paths = [
        os.path.join(cnv_dir, name)
        for name in util.remove_extraneous_files(os.listdir(cnv_dir))
    ]

    interesting_genes = pd.read_csv(input_file, comment='#')
    print(interesting_genes)
    interesting_genes['Gene'] = '\'' + interesting_genes['Gene']

    jobs = []
    for cnv_path in cnv_paths:
        cancer_type = util.get_cancer_type(cnv_path)
        matches_type = interesting_genes['Cancer Type'] == cancer_type
        cancer_type_genes = interesting_genes[matches_type]
        if len(cancer_type_genes) == 0:
            continue
        print(cancer_type)
        print(cancer_type_genes)
        mutation = glob.glob(os.path.join(mutation_dir, cancer_type + '*'))[0]
        clinical_pattern = os.path.join(clinical_dir, '*' + cancer_type + '*')
        clinical = glob.glob(clinical_pattern)[0]

        jobs.append([cnv_path, mutation, clinical, outdir, cancer_type_genes])

    pool = Pool(4)
    results = pool.map(multiprocess_zscores, jobs)
    print(results)

    header = 'Cancer Type,Gene,CNA Z Score, CNA P value, Mutation Z score, Mutation P Value, Mutation Count, n\n'
    row_format = '{},{},{},{},{},{},{},{}\n'
    with open(os.path.join(outdir, 'cox_results.csv'), 'w') as out:
        out.write(header)
        for coxs in results:
            # Only the first key of each result mapping is used.
            cancer_type = next(iter(coxs))
            print(cancer_type)
            for gene, cox_dict in coxs[cancer_type].items():
                print('%s %s' % (gene, cox_dict))
                row = row_format.format(cancer_type, gene,
                                        cox_dict['var-z'],
                                        cox_dict['var-p'],
                                        cox_dict[gene + '_mutations-z'],
                                        cox_dict[gene + '_mutations-p'],
                                        cox_dict['mutation_count'],
                                        cox_dict['var-n'])
                out.write(row)
# Example no. 21
def main():
    """Print the value counts of the tumor-stage column per cancer type.

    ``get_options()`` supplies the input directory and the output directory
    (unused here). The stage column name is looked up in
    ``tumor_stage_util.TUMOR_STAGE`` by cancer type; cancer types with a
    falsy entry are skipped. Stage values are stripped of surrounding
    whitespace before counting.

    Raises:
        KeyError: if a cancer type is missing from ``TUMOR_STAGE``.
    """
    indir, outdir = get_options()
    clinical_files = util.remove_extraneous_files(os.listdir(indir))

    for clinical_f in clinical_files:
        f = os.path.join(indir, clinical_f)
        cancer_type = util.get_cancer_type(clinical_f)
        # The column name always comes from the lookup table; the former
        # hard-coded default was overwritten before any use.
        stage_row = tumor_stage_util.TUMOR_STAGE[cancer_type]
        if not stage_row:
            continue

        clinical = util.get_clinical_data(f,
                                          extra_rows=[stage_row],
                                          extra_rows_numeric=False)
        clinical[stage_row] = clinical[stage_row].str.strip()
        print(cancer_type)
        print(clinical[stage_row].value_counts())
def main(argv=None):
    """Run ``multiprocess_copy_number_changes`` per cancer type of interest.

    ``get_options()`` supplies the CNV directory, the clinical directory, the
    output directory and an input file of interesting genes (``#`` lines are
    comments). The ``Gene`` column is prefixed with a single quote. Cancer
    types with no rows in the gene table are skipped; the rest are processed
    serially, one call per CNV file.

    Args:
        argv: accepted for call compatibility; not consulted here.

    Raises:
        IndexError: if no clinical file matches ``<cancer type>*``.
    """
    cnv_dir, clinical_dir, outdir, input_file = get_options()
    cnv_files = [
        os.path.join(cnv_dir, name)
        for name in util.remove_extraneous_files(os.listdir(cnv_dir))
    ]

    interesting_genes = pd.read_csv(input_file, comment='#')
    interesting_genes['Gene'] = '\'' + interesting_genes['Gene']

    for cnv in cnv_files:
        cancer_type = util.get_cancer_type(cnv)
        cancer_type_genes = interesting_genes[
            interesting_genes['Cancer Type'] == cancer_type]
        if len(cancer_type_genes) == 0:
            continue

        clinical = glob.glob(os.path.join(clinical_dir, cancer_type + '*'))[0]
        multiprocess_copy_number_changes(
            [cnv, clinical, outdir, cancer_type_genes])
def count_codons(data, outdir):
    """Combine per-file codon counts into ``codon_counts.csv``.

    Every file in ``data`` except the ``HG36_HG37`` entry is passed to
    ``count_codons_in_file``. The results are concatenated column-wise
    (duplicate column labels raise), a ``sum`` column holding the row totals
    is added, and the table is written to ``codon_counts.csv``.

    Args:
        data: directory containing the input files.
        outdir: currently unused.
            NOTE(review): the CSV is written relative to the current working
            directory, not to ``outdir`` -- confirm whether that is intended.

    Raises:
        ValueError: if the listing has no ``'HG36_HG37'`` entry.
    """
    files = util.remove_extraneous_files(os.listdir(data))
    files.remove('HG36_HG37')

    outdata = [count_codons_in_file(os.path.join(data, f)) for f in files]

    df = pd.concat(outdata, axis=1, verify_integrity=True)
    df['sum'] = df.sum(axis=1)
    df.to_csv('codon_counts.csv',
              index_label=[
                  'Gene', 'Chromosome', 'Start Position', 'Wild Type Allele'
              ])
# Example no. 24
def main():
    """Join copy-number columns for genes of interest with subtype data.

    ``get_options()`` supplies the clinical directory, a row-names file, the
    copy-number directory, an interesting-genes file and the output
    directory. The row-names file has one row per cancer type with
    ``cancer_type``, ``histological_subtype_row`` and (for ``EXTERNAL``
    rows) ``external_file`` columns. The interesting-genes file is indexed
    by its second column, and its ``Gene`` values are prefixed with a single
    quote.

    For each row the subtype clinical data is built, the first
    ``<cancer type>*.csv`` copy-number file is read and transposed, the
    gene columns are outer-joined with the clinical data, and the result is
    written to ``<name>.csv``. ``<name>`` is the cancer type, or
    ``BRCA_HER2`` for ``EXTERNAL`` rows.

    Raises:
        KeyError: if a cancer type has no clinical file or no gene entry.
        IndexError: if no copy-number file matches a cancer type.
    """
    clinical_dir, row_names_file, basedir, interesting_genes, outdir = get_options(
    )
    files = util.remove_extraneous_files(os.listdir(clinical_dir))
    clinical_by_cancer_type = {util.get_cancer_type(f): f for f in files}

    row_names = pd.read_csv(row_names_file, header=0)

    interesting_genes = pd.read_csv(interesting_genes, header=0, index_col=1)
    # Loop-invariant: quote-prefixed gene names, indexed like the gene table.
    quoted_genes = '\'' + interesting_genes['Gene']

    for _, row in row_names.iterrows():
        cancer_type = row['cancer_type']
        cancer_type_fname = cancer_type
        print(cancer_type)
        clinical_file = os.path.join(clinical_dir,
                                     clinical_by_cancer_type[cancer_type])

        if row['histological_subtype_row'] != 'EXTERNAL':
            clinical_data = make_clinical_data(clinical_file,
                                               row['histological_subtype_row'],
                                               outdir)
        else:
            subtype_data = prep_BRCA_data(row['external_file'], cancer_type)
            clinical = util.get_clinical_data(clinical_file)
            subtype_clinical = clinical.join(subtype_data['subtype'],
                                             how='outer')
            clinical_data = save_subtype_files(subtype_clinical, 'subtype',
                                               cancer_type, outdir)
            cancer_type_fname = 'BRCA_HER2'

        cna_file = glob.glob(os.path.join(basedir, cancer_type + '*.csv'))[0]
        cna = pd.read_csv(cna_file, header=0, index_col=0).T
        genes = quoted_genes.loc[cancer_type]
        print(genes)
        if isinstance(genes, str):
            # A single matching row yields a scalar; wrap it in a list so
            # the selection stays a DataFrame that can be joined.
            print(cna[[genes]])
            joined = cna[[genes]].join(clinical_data, how='outer')
        else:
            joined = cna[genes].join(clinical_data, how='outer')
        joined.to_csv(os.path.join(outdir, cancer_type_fname + '.csv'))
# Example no. 25
def main():
    """Compute z-scores for ICGC files not yet given an output directory.

    ``get_options()`` supplies the input directory, the clinical directory,
    the output directory and an HGNC file. The HGNC table is reduced to a
    ``Symbol`` column indexed by ``Ensembl ID``, with a single quote prefixed
    to every symbol. A file is processed only when its
    ``<outdir>/<cancer type>`` directory does not exist yet; cancer types
    whose directory already exists are skipped.
    """
    indir, clinical_dir, outdir, hgnc_file = get_options()
    input_names = util.remove_extraneous_files(os.listdir(indir))

    hgnc_raw = pd.read_csv(hgnc_file, low_memory=False)
    hgnc = hgnc_raw[['Approved Symbol', 'Ensembl ID(supplied by Ensembl)']]
    hgnc.columns = ['Symbol', 'Ensembl ID']
    hgnc = hgnc.set_index('Ensembl ID')
    hgnc['Symbol'] = '\'' + hgnc['Symbol']

    for fname in input_names:
        cancer_type = get_icgc_cancer_type(fname)
        print(cancer_type)
        clinical_file = os.path.join(clinical_dir, cancer_type + '.csv')
        cancer_type_outdir = os.path.join(outdir, cancer_type)

        # An existing output directory means this cancer type is skipped.
        if os.path.isdir(cancer_type_outdir):
            continue

        os.makedirs(cancer_type_outdir)
        calculate_zscores_for_file(os.path.join(indir, fname), clinical_file,
                                   cancer_type_outdir, hgnc)
# Example no. 26
def main(argv=None):
    """Map copy-number z-score jobs over a 4-worker pool.

    ``get_options()`` supplies the CNV directory, the structural-breaks
    directory, an optional interesting-genes file and the output directory.
    Each job is the list ``[cnv, breaks, interesting_genes, outdir]``, where
    ``interesting_genes`` is the loaded table or ``None``.

    Previously every input was processed twice: once serially through
    ``make_cn_zscores`` inside the loop and again through the pool. The
    serial call has been removed so each input is handled once, by
    ``multiprocess_cn_zscores``.
    NOTE(review): this assumes ``multiprocess_cn_zscores`` wraps
    ``make_cn_zscores`` -- confirm against its definition.

    Args:
        argv: accepted for call compatibility; not consulted here.

    Raises:
        IndexError: if no structural-breaks file matches a cancer type.
    """
    cnv_dir, structural_breaks, interesting_genes_file, outdir = get_options()
    cnv_files = [
        os.path.join(cnv_dir, name)
        for name in util.remove_extraneous_files(os.listdir(cnv_dir))
    ]

    interesting_genes = None
    if interesting_genes_file:
        interesting_genes = pd.read_csv(interesting_genes_file)

    zscore_inputs = []
    for cnv in cnv_files:
        cancer_type = util.get_cancer_type(cnv)
        breaks = glob.glob(
            os.path.join(structural_breaks, '*' + cancer_type + '*'))[0]
        zscore_inputs.append([cnv, breaks, interesting_genes, outdir])

    p = Pool(4)
    p.map(multiprocess_cn_zscores, zscore_inputs)
def main(argv=None):
    """Build copy-number z-score jobs per clinical file and map them.

    ``get_options()`` supplies the input directory, the clinical directory,
    the output directory and the extra-data directory. Each job is the tuple
    ``(copy_number_file, clinical_data, extra_data_dir, outdir)`` and is
    mapped over ``multiprocess_zscores`` with 4 worker processes.

    Previously the whole body was nested under ``if argv is None``, so
    calling ``main(some_argv)`` silently did nothing. The work now runs
    regardless of ``argv``.

    Args:
        argv: accepted for call compatibility; options come from
            ``get_options()`` and ``argv`` is not consulted.

    Raises:
        IndexError: if no ``<cancer type>*.csv`` copy-number file exists.
    """
    input_directory, clinical, outdir, extra_data_dir = get_options()
    clinical_files = util.remove_extraneous_files(os.listdir(clinical))

    args = []
    for c in clinical_files:
        cancer_type = util.get_cancer_type(c)
        print(cancer_type)

        clinical_data = util.get_clinical_data(os.path.join(clinical, c))
        copy_number = glob.glob(
            os.path.join(input_directory, cancer_type + '*.csv'))[0]

        args.append((copy_number, clinical_data, extra_data_dir, outdir))

    p = Pool(4)
    p.map(multiprocess_zscores, args)
def main(argv=None):
    """Compute METABRIC copy-number z-scores for a gene list.

    ``get_options()`` supplies the input directory, the clinical directory,
    a gene file and the output directory. The gene list is the first column
    of the gene file, read without a header. The first input file whose name
    contains ``METABRIC`` is used, together with the first clinical file
    matching ``*<cancer type>*``. The ``make_zscores`` results are indexed by
    ``gene`` and written to ``metabric_copy_number.csv``.

    Args:
        argv: accepted for call compatibility; not consulted here.

    Raises:
        IndexError: if no METABRIC input file or no clinical file is found.
    """
    input_directory, clinical_directory, gene_file, outdir = get_options()

    gene_list = pd.read_csv(gene_file, header=None)[0].values

    candidates = util.remove_extraneous_files(os.listdir(input_directory))
    metabric_files = [name for name in candidates if 'METABRIC' in name]
    cnv = metabric_files[0]

    cancer_type = get_cbioportal_cancer_type(cnv)
    clinical_pattern = os.path.join(clinical_directory,
                                    '*' + cancer_type + '*')
    clinical_file = glob.glob(clinical_pattern)[0]

    print(cancer_type)
    results = make_zscores(os.path.join(input_directory, cnv), clinical_file,
                           gene_list)

    results_df = pd.DataFrame(results).set_index('gene')
    results_df.to_csv(os.path.join(outdir, 'metabric_copy_number.csv'))
# Example no. 29
def main(argv=None):
    """Run ``make_zscores`` per clinical file with one extra clinical row.

    ``get_options()`` supplies the input directory, the clinical directory,
    the output directory and an extra-clinical-rows file. That file is read
    without a header and indexed by its first column; column ``1`` of a
    cancer type's row names the extra clinical row to load.

    Previously the whole body was nested under ``if argv is None``, so
    calling ``main(some_argv)`` silently did nothing. The work now runs
    regardless of ``argv``.

    Args:
        argv: accepted for call compatibility; options come from
            ``get_options()`` and ``argv`` is not consulted.

    Raises:
        KeyError: if a cancer type is missing from the extra-rows file.
        IndexError: if no ``<cancer type>*.csv`` copy-number file exists.
    """
    input_directory, clinical, outdir, extra_clinical_rows_file = get_options()
    clinical_files = util.remove_extraneous_files(os.listdir(clinical))

    all_extra_clinical_rows = pd.read_csv(extra_clinical_rows_file,
                                          index_col=0,
                                          header=None)

    for c in clinical_files:
        cancer_type = util.get_cancer_type(c)
        extra_rows = [all_extra_clinical_rows.loc[cancer_type][1]]
        print(cancer_type)
        clinical_data = util.get_clinical_data(os.path.join(clinical, c),
                                               extra_rows=extra_rows)
        print(clinical_data)

        copy_number = glob.glob(
            os.path.join(input_directory, cancer_type + '*.csv'))[0]
        print(copy_number)

        make_zscores(copy_number, clinical_data, outdir, extra_rows)
# Example no. 30
def main():
    """Write histological-subtype clinical files for each listed cancer type.

    ``get_options()`` supplies the clinical directory, a row-names file and
    the output directory. The row-names file has one row per cancer type
    with ``cancer_type``, ``histological_subtype_row`` and (for ``EXTERNAL``
    rows) ``external_file`` columns. Ordinary rows go through
    ``make_clinical_data``. ``EXTERNAL`` rows load subtype data with
    ``prep_BRCA_data``, save it as ``BRCA_annotation_subtype_data.csv``,
    outer-join its ``subtype`` column onto the clinical data and pass the
    result to ``save_subtype_files``.

    Raises:
        KeyError: if a listed cancer type has no clinical file.
    """
    clinical_dir, row_names_file, outdir = get_options()

    clinical_by_cancer_type = {}
    for fname in util.remove_extraneous_files(os.listdir(clinical_dir)):
        clinical_by_cancer_type[util.get_cancer_type(fname)] = fname

    row_names = pd.read_csv(row_names_file, header=0)

    for _, row in row_names.iterrows():
        cancer_type = row['cancer_type']
        print(cancer_type)
        clinical_file = os.path.join(clinical_dir,
                                     clinical_by_cancer_type[cancer_type])

        if row['histological_subtype_row'] != 'EXTERNAL':
            make_clinical_data(clinical_file,
                               row['histological_subtype_row'], outdir)
            continue

        subtype_data = prep_BRCA_data(row['external_file'], cancer_type)
        annotation_out = os.path.join(outdir,
                                      'BRCA_annotation_subtype_data.csv')
        subtype_data.to_csv(annotation_out)
        clinical = util.get_clinical_data(clinical_file)
        subtype_clinical = clinical.join(subtype_data['subtype'], how='outer')
        save_subtype_files(subtype_clinical, 'subtype', cancer_type, outdir)