Exemple #1
0
 def read_call_stats_file(self):
     """Load the call-stats table and annotate it with genomic coordinates.

     Reads ``self.call_stats_file`` as a tab-separated table (lines starting
     with ``#`` are treated as comments), keeping only the columns listed in
     ``fields``.  If the first attempt raises ``ValueError`` or
     ``LookupError`` the read is retried with the first two rows skipped.

     Side effects:
         self.call_stats_table: the loaded table with two added columns,
             ``Chromosome`` and ``genomic_coord_x``, restricted to rows
             whose ``Chromosome`` is finite and re-indexed from 0.
         self.n_calls_in: number of rows kept.

     Note: the contig type is decided from the row labelled 0, so a table
     without rows fails at that lookup.
     """
     fields = ['contig', 'position', 'ref_allele', 'alt_allele', 'tumor_name', 'normal_name',
               't_alt_count', 't_ref_count', 'n_alt_count', 'n_ref_count',
               't_ref_sum', 't_alt_sum', 'failure_reasons', 'judgement']
     # Builtin ``int`` is what the removed ``np.int`` alias pointed to.
     fields_type = {'contig': str, 'position': int, 'ref_allele': str, 'alt_allele': str,
                    'tumor_name': str, 'normal_name': str,
                    't_alt_count': int, 't_ref_count': int,
                    'n_alt_count': int, 'n_ref_count': int,
                    't_ref_sum': int, 't_alt_sum': int,
                    'failure_reasons': str, 'judgement': str}
     # ``sep`` is passed by keyword: recent pandas no longer accepts it
     # positionally.
     read_options = dict(sep='\t', index_col=False, comment='#',
                         usecols=fields, dtype=fields_type)
     try:
         table = pd.read_csv(self.call_stats_file, **read_options)
     except (ValueError, LookupError):
         print('Error reading call stats skipping first two rows and trying again')
         table = pd.read_csv(self.call_stats_file, skiprows=2, **read_options)

     contigs = np.array(table['contig'])
     if isinstance(table['contig'][0], str):
         table['Chromosome'] = du.chr2num(contigs)
     else:
         # Numeric contigs are shifted down by one.
         table['Chromosome'] = contigs - 1

     # Copy the filtered rows so the column assignment below acts on an
     # independent frame rather than on a slice of the unfiltered table.
     table = table[np.isfinite(table['Chromosome'])].copy()
     table['genomic_coord_x'] = du.hg19_to_linear_positions(
         np.array(table['Chromosome']), np.array(table['position']))
     table.reset_index(inplace=True, drop=True)

     self.call_stats_table = table
     self.n_calls_in = len(table)
Exemple #2
0
 def read_call_stats_file(self):
     """Load the call-stats table and annotate it with genomic coordinates.

     Reads every column of ``self.call_stats_file`` as a tab-separated table
     (lines starting with ``#`` are treated as comments).  If the first
     attempt raises ``ValueError`` or ``LookupError`` the read is retried
     with the first two rows skipped.

     Side effects:
         self.call_stats_table: the loaded table with two added columns,
             ``Chromosome`` and ``genomic_coord_x``, restricted to rows
             whose ``Chromosome`` is finite and re-indexed from 0.
         self.n_calls_in: number of rows kept.

     Note: the contig type is decided from the row labelled 0, so a table
     without rows fails at that lookup.
     """
     # ``sep`` is passed by keyword: recent pandas no longer accepts it
     # positionally.
     read_options = dict(sep='\t', index_col=False, low_memory=False, comment='#')
     try:
         table = pd.read_csv(self.call_stats_file, **read_options)
     except (ValueError, LookupError):
         print('Error reading call stats skipping first two rows and trying again')
         table = pd.read_csv(self.call_stats_file, skiprows=2, **read_options)

     contigs = np.array(table['contig'])
     if isinstance(table['contig'][0], str):
         table['Chromosome'] = du.chr2num(contigs)
     else:
         # Numeric contigs are shifted down by one.
         table['Chromosome'] = contigs - 1

     # Copy the filtered rows so the column assignment below acts on an
     # independent frame rather than on a slice of the unfiltered table.
     table = table[np.isfinite(table['Chromosome'])].copy()
     table['genomic_coord_x'] = du.hg19_to_linear_positions(
         np.array(table['Chromosome']),
         np.array(table['position']))
     table.reset_index(inplace=True, drop=True)

     self.call_stats_table = table
     self.n_calls_in = len(table)
Exemple #3
0
    def read_het_file(self):
        """Load the tumor and normal het tables and merge them by position.

        ``self.tumor_het_file`` and ``self.normal_het_file`` are each read as
        tab-separated tables (``#`` lines are comments), passed through
        ``du.fix_het_file_header`` and given three extra columns:
        ``Chromosome``, ``genomic_coord_x`` and ``AF`` (alt count divided by
        alt + ref count).  Rows whose ``Chromosome`` is not finite are
        dropped.

        Side effects:
            self.het_table: inner merge of the normal and tumor tables on
                ``genomic_coord_x``; overlapping column names get the
                suffixes ``_N`` (normal) and ``_T`` (tumor).
        """

        def load_het_table(path):
            # One het file -> annotated table; shared by tumor and normal.
            # ``sep`` is passed by keyword: recent pandas no longer accepts
            # it positionally.
            table = pd.read_csv(path,
                                sep='\t',
                                index_col=False,
                                low_memory=False,
                                comment='#')
            table = du.fix_het_file_header(table)

            contigs = np.array(table['CONTIG'])
            if isinstance(table['CONTIG'][0], str):
                table['Chromosome'] = du.chr2num(contigs)
            else:
                # NOTE(review): other loaders that branch on a numeric contig
                # subtract 1 here; this one uses the value as-is.  Confirm
                # which convention du.hg19_to_linear_positions expects.
                table['Chromosome'] = contigs

            # Copy the filtered rows so the column assignments below act on
            # an independent frame rather than on a slice.
            table = table[np.isfinite(table['Chromosome'])].copy()
            table['genomic_coord_x'] = du.hg19_to_linear_positions(
                np.array(table['Chromosome']),
                np.array(table['POSITION']))
            table['AF'] = np.true_divide(
                table['ALT_COUNT'],
                table['ALT_COUNT'] + table['REF_COUNT'])
            return table

        tumor_het_table = load_het_table(self.tumor_het_file)
        normal_het_table = load_het_table(self.normal_het_file)
        self.het_table = pd.merge(normal_het_table,
                                  tumor_het_table,
                                  on='genomic_coord_x',
                                  suffixes=('_N', '_T'))
Exemple #4
0
    def read_seg_file(self):
        """Load the segment table, or create placeholders when there is none.

        When ``self.seg_file`` is the string ``'NULL'`` no file is read:
        ``self.seg_table`` and ``self.het_table`` are both set to one-row
        frames (index ``[0]``) that carry only column names and no values.

        Otherwise ``self.seg_file`` is read as a tab-separated table (``#``
        lines are comments) with its first header column forced to ``str``,
        passed through ``du.fix_seg_file_header``, its ``Chromosome`` column
        converted with ``du.chr2num``, and the columns
        ``genomic_coord_start`` / ``genomic_coord_end`` added from
        ``Start.bp`` / ``End.bp``.  ``self.het_table`` is not touched on this
        path.
        """
        if self.seg_file == 'NULL':
            seg_columns = ['Chromosome', 'Start.bp', 'End.bp', 'n_probes', 'length',
                           'f', 'tau', 'genomic_coord_start', 'genomic_coord_end']
            # 'ALT_COUNT_N' and 'ALT_COUNT_T' are separate columns; without a
            # comma between them the two literals fuse into one bogus name.
            het_columns = ['seg_id', 'tau', 'f', 'd', 'AF_T', 'AF_N', 'Chromosome',
                           'genomic_coord_x', 'ALT_COUNT_N', 'ALT_COUNT_T',
                           'REF_COUNT_N', 'REF_COUNT_T']
            self.seg_table = pd.DataFrame(index=[0], columns=seg_columns)
            self.het_table = pd.DataFrame(index=[0], columns=het_columns)
        else:
            seg_header = du.read_file_header(self.seg_file)
            # Read the first column as text whatever the file calls it.
            cols_seg_type = {seg_header[0]: str}
            # ``sep`` is passed by keyword: recent pandas no longer accepts
            # it positionally.
            self.seg_table = pd.read_csv(self.seg_file, sep='\t', index_col=False,
                                         low_memory=False, comment='#',
                                         dtype=cols_seg_type)
            self.seg_table = du.fix_seg_file_header(self.seg_table)

            self.seg_table['Chromosome'] = du.chr2num(np.array(self.seg_table['Chromosome']))

            self.seg_table['genomic_coord_start'] = du.hg19_to_linear_positions(
                np.array(self.seg_table['Chromosome']),
                np.array(self.seg_table['Start.bp']))
            self.seg_table['genomic_coord_end'] = du.hg19_to_linear_positions(
                np.array(self.seg_table['Chromosome']),
                np.array(self.seg_table['End.bp']))
Exemple #5
0
 def read_seg_file(self):
     """Load the segment table and add linear genomic coordinates.

     Reads ``self.seg_file`` as a tab-separated table (``#`` lines are
     comments) and passes it through ``du.fix_seg_file_header``.  The
     ``Chromosome`` column is then rewritten: if ``du.is_number`` accepts
     the value in the row labelled 0 the column is shifted down by one,
     otherwise it is converted with ``du.chr2num``.  Finally
     ``genomic_coord_start`` and ``genomic_coord_end`` are computed from
     ``Start.bp`` and ``End.bp``.

     Side effects:
         self.seg_table: the annotated segment table.
     """
     # ``sep`` is passed by keyword: recent pandas no longer accepts it
     # positionally.
     seg_table = pd.read_csv(self.seg_file,
                             sep='\t',
                             index_col=False,
                             low_memory=False,
                             comment='#')
     seg_table = du.fix_seg_file_header(seg_table)

     if du.is_number(seg_table['Chromosome'][0]):
         seg_table['Chromosome'] = seg_table['Chromosome'] - 1
     else:
         seg_table['Chromosome'] = du.chr2num(
             np.array(seg_table['Chromosome']))

     seg_table['genomic_coord_start'] = du.hg19_to_linear_positions(
         np.array(seg_table['Chromosome']),
         np.array(seg_table['Start.bp']))
     seg_table['genomic_coord_end'] = du.hg19_to_linear_positions(
         np.array(seg_table['Chromosome']),
         np.array(seg_table['End.bp']))
     self.seg_table = seg_table
    def __init__(self,
                 candidate_sites,
                 p_somatic,
                 resolution=101,
                 f_thresh=0.15,
                 depth=15,
                 hot_spots_file='NA',
                 skew=0.5):
        """Initialise the per-site state used for the SSNV fit.

        Args:
            candidate_sites: column-addressable table; the columns read are
                ``contig``, ``position``, ``genomic_coord_x``,
                ``n_alt_count``, ``n_ref_count``, ``t_alt_count``,
                ``t_ref_count``, ``f_acs`` and ``tau``.
            p_somatic: prior somatic probability applied to every site
                (stored as a column vector with one row per site).
            resolution: number of evenly spaced TiN values between 0 and 1.
            f_thresh: a site is flagged in ``self.candidate_sites`` only if
                its tumor allele fraction is strictly above this value.
            depth: a site is flagged only if both tumor and normal depth are
                strictly above this value.
            hot_spots_file: path to a tab-separated file with columns
                ``Chromosome``, ``Position`` and ``Probability``; sites whose
                ``genomic_coord_x`` matches a row get that row's probability
                instead of ``p_somatic``.  ``'NA'`` disables the lookup.
            skew: tumor allele fractions below this value get direction -1 in
                ``self.t_het_direction``; all others get 1.
        """
        # variables follow notation:
        # ac = allele count n = normal t = tumor

        # Variables for SSNV fit
        self.TiN_range = np.linspace(0, 1, num=resolution)
        self.af = np.linspace(0.005, 1, num=200)

        # observed data
        self.contig = candidate_sites['contig']
        self.position = candidate_sites['position']
        self.genomic_coord_x = candidate_sites['genomic_coord_x']
        self.n_alt_count = np.array(candidate_sites['n_alt_count'])
        self.n_ref_count = np.array(candidate_sites['n_ref_count'])
        self.n_depth = self.n_alt_count + self.n_ref_count
        # zero normal depth gives NaN, which is mapped to 0 here
        self.normal_f = np.nan_to_num(
            np.true_divide(self.n_alt_count, self.n_depth))
        self.t_alt_count = np.array(candidate_sites['t_alt_count'])
        self.t_ref_count = np.array(candidate_sites['t_ref_count'])
        self.t_depth = self.t_alt_count + self.t_ref_count
        # unlike normal_f, NaN from zero tumor depth is left in place
        self.tumor_f = np.true_divide(self.t_alt_count, self.t_depth)
        self.number_of_sites = len(self.n_alt_count)
        self.candidate_sites = np.logical_and(
            np.logical_and(self.tumor_f > f_thresh, self.t_depth > depth),
            self.n_depth > depth)

        # hyperparameter
        self.p_somatic = np.zeros([self.number_of_sites, 1]) + p_somatic
        if hot_spots_file != 'NA':
            hot_spots = pd.read_csv(hot_spots_file,
                                    sep='\t',
                                    low_memory=False,
                                    index_col=False)
            if isinstance(hot_spots['Chromosome'][0], str):
                hot_spots['contig'] = du.chr2num(
                    np.array(hot_spots['Chromosome']))
            else:
                hot_spots['contig'] = np.array(hot_spots['Chromosome']) - 1
            # Copy the filtered rows so the column assignment below acts on
            # an independent frame rather than on a slice.
            hot_spots = hot_spots[np.isfinite(hot_spots['contig'])].copy()
            hot_spots['genomic_coord_x'] = du.hg19_to_linear_positions(
                np.array(hot_spots['contig']), np.array(hot_spots['Position']))
            for index, hot_spot in hot_spots.iterrows():
                # locate matching sites once per hot spot
                matching_sites = np.where(
                    self.genomic_coord_x == hot_spot['genomic_coord_x'])
                if np.size(matching_sites) > 0:
                    print('Using user provided probabilities for cancer hot spots:')
                    # str() on both parts: Position (and possibly Chromosome)
                    # is numeric and cannot be concatenated to ' ' directly.
                    print(str(hot_spot['Chromosome']) + ' ' +
                          str(hot_spot['Position']))
                    self.p_somatic[matching_sites] = hot_spot['Probability']

        # parameter
        self.TiN = 0
        self.CI_tin_high = []
        self.CI_tin_low = []
        self.E_z = np.zeros([self.number_of_sites, 1])
        self.skew = skew
        # expected allele fraction of minor allele given allelic copy data
        self.psi = .5 - np.array(candidate_sites['f_acs'])
        # -1 where tumor_f is below skew, +1 everywhere else
        self.t_het_direction = self.tumor_f < self.skew
        self.t_het_direction = self.t_het_direction * -1
        self.t_het_direction[self.t_het_direction == 0] = 1

        # determine ratio of tumor to normal copies given tau and TiN at each locus
        self.tau = candidate_sites['tau']
        # Go through a plain array before adding the axis: a pandas Series
        # cannot be indexed with [:, np.newaxis] on recent pandas.
        tau_column = np.asarray(candidate_sites['tau'])[:, np.newaxis]
        self.tin_correct_tau = np.multiply(self.TiN_range, tau_column)
        self.tin_correct_normal_tau = np.multiply((1 - self.TiN_range), 2)
        self.CN_ratio = np.divide(
            self.tin_correct_tau,
            np.array(self.tin_correct_tau + self.tin_correct_normal_tau))

        # random variables
        # NOTE(review): `beta` comes from the file's imports; presumably
        # scipy.stats.beta -- confirm.
        self.rv_normal_af = beta(self.n_alt_count + 1, self.n_ref_count + 1)
        self.rv_tumor_af = beta(self.t_alt_count + 1, self.t_ref_count + 1)

        # conditionals
        self.p_TiN_given_S = np.zeros([self.number_of_sites, resolution])
        self.p_TiN_given_G = np.zeros([self.number_of_sites, resolution])
        self.p_TiN_given_het = np.zeros([self.number_of_sites, resolution])
        self.p_artifact = np.zeros([self.number_of_sites, 1])

        # likelihood
        self.TiN_likelihood = np.zeros([resolution, 1])