def prep_data(mutation, clinical_data, key):
  df = pd.read_csv(mutation, sep='\t', low_memory=False, dtype=str)
  cancer_type = util.get_cancer_type(mutation)

  # remove column headers from combined mutation sheet
  df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')]
  df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip()

  number_barcodes_in_mutation_data = df[u'Tumor_Sample_Barcode'].unique().size
  print 'Number of total sequenced barcodes:   ', number_barcodes_in_mutation_data
  df = util.maybe_clear_non_01s(df, u'Tumor_Sample_Barcode', cancer_type)

  df['VAF'] = variant_allele_freq.calculate_vaf(df, key.loc[cancer_type])

  # Reduce mutation data to patients that also have clinical data
  df = util.add_identifier_column(df, u'Tumor_Sample_Barcode')
  df = df.join(clinical_data, on='identifier', how='inner')
  df.set_index([u'Hugo_Symbol', 'identifier'], inplace=True)

  # symmetrically filter clinical data down to patients that were also sequenced
  unique_patients =  df.index.get_level_values('identifier').unique()
  unique_patients_df = pd.DataFrame(unique_patients, index=unique_patients)
  clinical_data_with_sequenced_patients = clinical_data.join(unique_patients_df, how='inner')
  num_patients = clinical_data_with_sequenced_patients.shape[0]
  print 'Number of patients with sequence and clinical data: ', num_patients
  return df, clinical_data_with_sequenced_patients, num_patients
Esempio n. 2
0
def main():
    copy_number_loc, clinical, outdir = get_options()
    cnas = os.listdir(copy_number_loc)
    cnas = util.remove_extraneous_files(cnas)

    results = pd.DataFrame()
    for c in cnas:
        cancer_type = util.get_cancer_type(c)
        print cancer_type

        clinical_file = glob.glob(
            os.path.join(clinical, '*' + cancer_type + '*.txt'))[0]
        clin = util.get_clinical_data(clinical_file)

        patient_breaks = count_breaks(os.path.join(copy_number_loc, c))
        patient_breaks = patient_breaks.reset_index()
        patient_breaks = util.maybe_clear_non_01s(patient_breaks, 'Sample',
                                                  cancer_type)
        patient_breaks = util.add_identifier_column(patient_breaks, 'Sample')
        patient_breaks = patient_breaks.set_index('identifier')
        patient_breaks = patient_breaks.drop('Sample', axis=1)

        breaks_and_clin = patient_breaks.join(clin, how='inner')
        breaks_and_clin.to_csv(
            os.path.join(outdir, cancer_type + '_breaks.csv'))
        cox = analysis.do_cox(breaks_and_clin.time, breaks_and_clin.censor,
                              breaks_and_clin.breaks)
        cox['cancer_type'] = cancer_type
        results = results.append(cox, ignore_index=True)

    results.to_csv(os.path.join(outdir, 'cox_results.csv'))
Esempio n. 3
0
def process_input_file(input_file):
    """Build per-patient, per-chromosome interval trees of copy number.

    Args:
        input_file: path to a tab-separated segment file with 'Sample',
            'Chromosome', 'Start', 'End' and 'Segment_Mean' columns.

    Returns:
        dict mapping patient identifier to a 24-slot list. Slot i holds the
        IntervalTree for chromosome i, or 0 when no segment was seen for that
        chromosome. Each tree maps [Start, End] to the segment's
        'Segment_Mean'.
    """
    segments = pd.read_csv(input_file, sep='\t')
    # NOTE(review): sibling functions pass a cancer type as the third
    # argument; here the file path itself is passed -- confirm this is what
    # util.maybe_clear_non_01s expects.
    segments = util.maybe_clear_non_01s(segments, u'Sample', input_file)
    segments = util.add_identifier_column(segments, u'Sample')

    patient_data = {}
    for _, segment in segments.iterrows():
        patient = segment['identifier']
        chrom = segment['Chromosome']
        seg_start = segment['Start']
        seg_end = segment['End']
        seg_mean = segment['Segment_Mean']

        # 24 slots so the chromosome number can be used directly as the list
        # index (slot 0 is simply never filled).
        trees = patient_data.setdefault(patient, [0] * 24)

        # 0 marks a chromosome that has no tree yet for this patient.
        if trees[chrom] == 0:
            trees[chrom] = IntervalTree()

        # The interval tree is half-open while the segment data is closed, so
        # the end is bumped by one to cover the final position.
        trees[chrom][seg_start:seg_end + 1] = seg_mean
    return patient_data
def make_corrs(copy_number, rnaseq, mutation, clinical, outdir, genes):
  cancer_type = util.get_cancer_type(copy_number)

  clinical_data = util.get_clinical_data(clinical)
  cnv = pd.read_csv(copy_number, index_col=0)
  cnv_by_patient = cnv.transpose()


  rnaseq =  pd.read_csv(rnaseq, low_memory=False, sep='\t')
  rnaseq = rnaseq.drop([0])
  rnaseq = rnaseq.set_index('Hybridization REF').astype(np.float)
  rnaseq = rnaseq.transpose().reset_index()
  rnaseq = util.maybe_clear_non_01s(rnaseq, 'index', cancer_type)
  rnaseq = util.add_identifier_column(rnaseq, 'index')
  rnaseq_clean = rnaseq.set_index('identifier').drop('index', 1).astype(np.float)
  rnaseq_log2 = rnaseq_clean.apply(np.log2)
  rnaseq_clipped_log2 = np.clip(rnaseq_log2, 0, np.inf)
  rna_cnv = cnv_by_patient[genes['Gene']].join(rnaseq_clipped_log2, how='inner')

  mutation = mutation_base.prep_mutation_data(mutation, clinical_data)
  print mutation.index

  included_patients = set(list(mutation.index)) & set(list(rna_cnv.index))

  rna_cnv = rna_cnv.loc[included_patients]

  rna_cnv.T.to_csv(os.path.join(outdir, cancer_type + '_cnv_rnaseq_data.csv'))

  corr_dict = {}
  for gene in genes['Gene']:
    corr = rna_cnv.corrwith(rna_cnv[gene]).drop(genes['Gene'])
    corr_dict[cancer_type + '_' + gene] = corr

  return pd.DataFrame(corr_dict)
def prep_data(mutation, clinical_data):
  df = pd.read_csv(mutation, sep='\t', low_memory=False, dtype=str)
  cancer_type = util.get_cancer_type(mutation)

  # remove column headers from combined mutation sheet
  df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')]
  df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip()

  number_barcodes_in_mutation_data = df[u'Tumor_Sample_Barcode'].unique().size
  print 'Number of total sequenced barcodes:   ', number_barcodes_in_mutation_data
  df = util.maybe_clear_non_01s(df, u'Tumor_Sample_Barcode', cancer_type)
  df = util.add_identifier_column(df, u'Tumor_Sample_Barcode')

  # include only nonsilent mutations
  non_silent = df.where(df[u'Variant_Classification'] != 'Silent')
  df = non_silent.dropna(subset=[u'Variant_Classification'])

  df = df.reset_index()
  df['Hugo_Symbol'] = '\'' + df['Hugo_Symbol'].astype(str)

  gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene)
  gene_mutation_df.index.set_names(['Hugo_Symbol', 'patient'], inplace=True)
  gene_mutation_df = gene_mutation_df.reset_index()
  gene_patient_mutations = gene_mutation_df.pivot(index='Hugo_Symbol', columns='patient', values='mutated')

  return gene_patient_mutations.transpose()
Esempio n. 6
0
def prep_data(mutation, key):
    df = pd.read_csv(mutation, sep='\t', low_memory=False, dtype=str)
    cancer_type = util.get_cancer_type(mutation)

    # remove column headers from combined mutation sheet
    df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')]
    df['Hugo_Symbol'] = '\'' + df['Hugo_Symbol'].astype(str)
    df = df[df[u'Hugo_Symbol'].isin(COMMONLY_MUTATED)]

    df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip()

    number_barcodes_in_mutation_data = df[u'Tumor_Sample_Barcode'].unique(
    ).size
    print 'Number of total sequenced barcodes:   ', number_barcodes_in_mutation_data
    df = util.maybe_clear_non_01s(df, u'Tumor_Sample_Barcode', cancer_type)
    df = util.add_identifier_column(df, u'Tumor_Sample_Barcode')

    # include only nonsilent mutations
    non_silent = df.where(df[u'Variant_Classification'] != 'Silent')
    df = non_silent.dropna(subset=[u'Variant_Classification'])
    df = df.reset_index()

    df['VAF'] = calculate_vaf(df, key.loc[cancer_type])
    # use the largest VAF
    df = df.groupby(['Hugo_Symbol', 'identifier']).max()
    df = df.reset_index()
    pivoted = df.pivot(index='identifier', columns='Hugo_Symbol', values='VAF')

    minimum_vaf_count = MUTATION_PERCENT * number_barcodes_in_mutation_data
    enough_patients = pivoted.count() >= minimum_vaf_count
    too_few_patients = enough_patients[~enough_patients].index.values
    print 'Genes with too few patients:', too_few_patients
    pivoted = pivoted.drop(too_few_patients, axis=1)
    return pivoted
Esempio n. 7
0
def get_metagene_data(metagene_file, cancer_type):
    """Compute one metagene value per patient from RNASeq expression.

    Args:
        metagene_file: path to a CSV with an 'RNASeq' column listing the row
            labels of the genes that make up the metagene.
        cancer_type: prefix used to find exactly one file matching
            'rnaseq/<cancer_type>*.txt'.

    Returns:
        Series named 'metagene', indexed by patient identifier: the mean over
        the listed genes of the log2 expression (floored at 0) after
        subtracting each gene's mean across patients.
    """
    candidates = glob.glob(os.path.join('rnaseq', cancer_type + '*.txt'))
    assert len(candidates) == 1
    expression = pd.read_csv(
        candidates[0], sep='\t', low_memory=False, index_col=0)

    gene_list = pd.read_csv(metagene_file)
    selected = expression.loc[gene_list['RNASeq']].astype(float)

    # Flip to one row per sample so the sample-name helpers can be applied,
    # then flip back to one row per gene.
    per_sample = selected.transpose().reset_index()
    per_sample = util.maybe_clear_non_01s(per_sample, 'index', cancer_type)
    per_sample = util.add_identifier_column(per_sample, 'index')
    per_sample = per_sample.drop('index', axis=1).set_index('identifier')
    per_gene = per_sample.transpose()

    # Normalise: log2 (floored at 0), centre every gene on its own mean, then
    # average across genes to obtain the metagene value per patient.
    clipped = np.clip(per_gene.apply(np.log2), 0, np.inf)
    gene_means = clipped.mean(axis=1)
    metagene = clipped.sub(gene_means, axis=0).mean()
    metagene.name = 'metagene'
    return metagene
def prep_extra_data(extra_data_directory, cancer_type):
    """Load '<cancer_type>.txt' from a directory, indexed by identifier.

    The file is tab-separated and '-' is read as a missing value. The
    'SampleName' column is used to derive the 'identifier' index and is then
    removed.
    """
    path = os.path.join(extra_data_directory, cancer_type + '.txt')
    table = pd.read_csv(path, sep='\t', na_values=['-'])
    table = util.maybe_clear_non_01s(table, 'SampleName', cancer_type)
    table = util.add_identifier_column(table, 'SampleName')
    return table.drop('SampleName', axis=1).set_index('identifier')
def prep_tumor_stage_data(tumor_stage_data_dir, cancer_type):
    """Load '<cancer_type>_clinical.csv' and list its tumor-stage columns.

    Returns:
        (None, None) when the file does not exist; otherwise a tuple of the
        table indexed by 'identifier' (derived from 'patient_id', which is
        dropped) and the list of column names containing 'group'.
    """
    path = os.path.join(tumor_stage_data_dir, cancer_type + '_clinical.csv')
    if not os.path.isfile(path):
        return None, None

    stages = pd.read_csv(path, sep=',')
    stages = util.add_identifier_column(stages, 'patient_id')
    stages = stages.drop('patient_id', axis=1).set_index('identifier')
    group_columns = [name for name in stages.columns if 'group' in name]
    return stages, group_columns
Esempio n. 10
0
def prep_mutation_data_alone(mutation):
    df = pd.read_csv(mutation, sep='\t', low_memory=False, dtype=str)
    cancer_type = util.get_cancer_type(mutation)

    # remove column headers from combined mutation sheet
    df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')]
    df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip()

    number_barcodes_in_mutation_data = df[u'Tumor_Sample_Barcode'].unique(
    ).size
    print 'Number of total sequenced barcodes:   ', number_barcodes_in_mutation_data
    df = util.maybe_clear_non_01s(df, u'Tumor_Sample_Barcode', cancer_type)
    df = util.add_identifier_column(df, u'Tumor_Sample_Barcode')
    return df
def count_codons_in_file(f):
    cancer_type = util.get_cancer_type(f)
    print cancer_type

    df = pd.read_csv(f, sep='\t', low_memory=False)
    # Some of the columns are named Start_position. Others are Start_Position. some are start_position. :|
    upper_columns = [i.upper() for i in df.columns]
    start_pos_index = upper_columns.index('START_POSITION')
    start_pos = df.columns[start_pos_index]
    chromosome = u'Chromosome'

    ncbi_builds = df[u'NCBI_Build'].value_counts()
    if '36' in ncbi_builds.index:
        print 'Using translated NCBI build', cancer_type
        folder = os.path.dirname(f)
        new_path = os.path.join(folder, 'HG36_HG37',
                                cancer_type + '_hg36_hg37.txt')
        print new_path
        df = pd.read_csv(new_path, sep='\t', dtype=str)
        start_pos = u'hg37_start'
        chromsome = u'hg37_chr'
    wild_type_allele_col = u'Reference_Allele'

    df[u'Tumor_Sample_Barcode'] = df[u'Tumor_Sample_Barcode'].str.strip()
    df = df[~df[u'Hugo_Symbol'].str.contains('Hugo_Symbol')]
    df = df[df[u'Variant_Classification'].str.contains(
        'Missense')]  # only include missense
    df[u'Hugo_Symbol'] = '\'' + df[u'Hugo_Symbol'].astype(str)
    df = util.add_identifier_column(df, u'Tumor_Sample_Barcode')

    # Some files have the same mutation from different samples listed under one patient,
    # we only care about the number of patients with a given mutation, so drop duplicates
    df = df.drop_duplicates(
        subset=[u'Hugo_Symbol', chromosome, start_pos, u'identifier'],
        keep='last')
    counts = df.groupby(
        [u'Hugo_Symbol', chromosome, start_pos, wild_type_allele_col]).size()
    count_df = pd.DataFrame(counts)
    count_df.columns = [cancer_type]
    count_df.index.rename(
        ['Gene', 'Chromosome', 'Start Position', 'Wild Type Allele'],
        inplace=True)
    return count_df