def read_call_stats_file(self):
    """Load the call-stats table from ``self.call_stats_file``.

    The file is parsed as tab-separated text with ``#`` comment lines
    ignored, and only the columns listed in ``fields`` are kept.  If the
    first parse raises ``ValueError`` or ``LookupError`` the read is retried
    with the first two rows skipped.

    Side effects:
        self.call_stats_table: the parsed table, restricted to rows whose
            ``Chromosome`` value is finite, with two added columns
            (``Chromosome`` and ``genomic_coord_x``) and a fresh 0..n-1 index.
        self.n_calls_in: number of rows kept.
    """
    fields = ['contig', 'position', 'ref_allele', 'alt_allele', 'tumor_name',
              'normal_name', 't_alt_count', 't_ref_count', 'n_alt_count',
              'n_ref_count', 't_ref_sum', 't_alt_sum', 'failure_reasons',
              'judgement']
    # Builtin ``int`` is what the removed ``np.int`` alias pointed to.
    fields_type = {'contig': str, 'position': int, 'ref_allele': str,
                   'alt_allele': str, 'tumor_name': str, 'normal_name': str,
                   't_alt_count': int, 't_ref_count': int,
                   'n_alt_count': int, 'n_ref_count': int,
                   't_ref_sum': int, 't_alt_sum': int,
                   'failure_reasons': str, 'judgement': str}
    # ``sep`` is passed by keyword: recent pandas releases reject positional
    # arguments after the file path.
    read_options = dict(sep='\t', index_col=False, comment='#',
                        usecols=fields, dtype=fields_type)
    try:
        table = pd.read_csv(self.call_stats_file, **read_options)
    except (ValueError, LookupError):
        print('Error reading call stats skipping first two rows and trying again')
        table = pd.read_csv(self.call_stats_file, skiprows=2, **read_options)

    if isinstance(table['contig'][0], str):
        # presumably maps contig names to numeric chromosome indices,
        # yielding a non-finite value for unrecognised contigs -- verify in du
        table['Chromosome'] = du.chr2num(np.array(table['contig']))
    else:
        table['Chromosome'] = np.array(table['contig']) - 1

    # Copy after filtering so the column assignment below writes to an
    # independent frame rather than to a slice of the unfiltered one.
    table = table[np.isfinite(table['Chromosome'])].copy()
    table['genomic_coord_x'] = du.hg19_to_linear_positions(
        np.array(table['Chromosome']), np.array(table['position']))
    self.n_calls_in = len(table)
    table.reset_index(inplace=True, drop=True)
    self.call_stats_table = table
def read_call_stats_file(self):
    """Load the call-stats table from ``self.call_stats_file``.

    The file is parsed as tab-separated text with ``#`` comment lines
    ignored.  If the first parse raises ``ValueError`` or ``LookupError``
    the read is retried with the first two rows skipped.

    Side effects:
        self.call_stats_table: the parsed table, restricted to rows whose
            ``Chromosome`` value is finite, with two added columns
            (``Chromosome`` and ``genomic_coord_x``) and a fresh 0..n-1 index.
        self.n_calls_in: number of rows kept.
    """
    # ``sep`` is passed by keyword: recent pandas releases reject positional
    # arguments after the file path.
    read_options = dict(sep='\t', index_col=False, low_memory=False,
                        comment='#')
    try:
        table = pd.read_csv(self.call_stats_file, **read_options)
    except (ValueError, LookupError):
        print('Error reading call stats skipping first two rows and trying again')
        table = pd.read_csv(self.call_stats_file, skiprows=2, **read_options)

    if isinstance(table['contig'][0], str):
        # presumably maps contig names to numeric chromosome indices,
        # yielding a non-finite value for unrecognised contigs -- verify in du
        table['Chromosome'] = du.chr2num(np.array(table['contig']))
    else:
        # Contigs were parsed as numbers; shift them down by one.
        table['Chromosome'] = np.array(table['contig']) - 1

    # Copy after filtering so the column assignment below writes to an
    # independent frame rather than to a slice of the unfiltered one.
    table = table[np.isfinite(table['Chromosome'])].copy()
    table['genomic_coord_x'] = du.hg19_to_linear_positions(
        np.array(table['Chromosome']), np.array(table['position']))
    self.n_calls_in = len(table)
    table.reset_index(inplace=True, drop=True)
    self.call_stats_table = table
def read_het_file(self):
    """Load the tumor and normal het-site tables and join them.

    Both ``self.tumor_het_file`` and ``self.normal_het_file`` are parsed as
    tab-separated text with ``#`` comment lines ignored and passed through
    ``du.fix_het_file_header``.  Each table gains ``Chromosome``,
    ``genomic_coord_x`` and ``AF`` (ALT_COUNT / (ALT_COUNT + REF_COUNT))
    columns; rows with a non-finite ``Chromosome`` are dropped.

    Side effects:
        self.het_table: inner merge of the normal and tumor tables on
            ``genomic_coord_x``; overlapping columns get the suffixes
            ``_N`` (normal) and ``_T`` (tumor).
    """

    def load_het_table(path):
        # One pass of the per-sample pipeline shared by tumor and normal.
        # ``sep`` is passed by keyword: recent pandas releases reject
        # positional arguments after the file path.
        table = pd.read_csv(path, sep='\t', index_col=False,
                            low_memory=False, comment='#')
        table = du.fix_het_file_header(table)
        if isinstance(table['CONTIG'][0], str):
            table['Chromosome'] = du.chr2num(np.array(table['CONTIG']))
        else:
            # NOTE(review): numeric contigs are used as-is here, whereas
            # read_call_stats_file subtracts 1 in the same situation --
            # confirm which indexing hg19_to_linear_positions expects.
            table['Chromosome'] = np.array(table['CONTIG'])
        # Copy after filtering so the assignments below write to an
        # independent frame rather than to a slice of the unfiltered one.
        table = table[np.isfinite(table['Chromosome'])].copy()
        table['genomic_coord_x'] = du.hg19_to_linear_positions(
            np.array(table['Chromosome']), np.array(table['POSITION']))
        table['AF'] = np.true_divide(
            table['ALT_COUNT'], table['ALT_COUNT'] + table['REF_COUNT'])
        return table

    tumor_het_table = load_het_table(self.tumor_het_file)
    normal_het_table = load_het_table(self.normal_het_file)
    self.het_table = pd.merge(normal_het_table, tumor_het_table,
                              on='genomic_coord_x', suffixes=('_N', '_T'))
def read_seg_file(self):
    """Load the segment table from ``self.seg_file``.

    When ``self.seg_file`` is the literal string ``'NULL'`` no file is read:
    ``self.seg_table`` and ``self.het_table`` are both set to single-row
    placeholder frames (index ``[0]``, all values missing) that carry the
    expected column names.

    Otherwise the file is parsed as tab-separated text with ``#`` comment
    lines ignored, its first column forced to ``str``, and its header passed
    through ``du.fix_seg_file_header``.  ``Chromosome`` is then converted
    with ``du.chr2num`` and ``genomic_coord_start`` / ``genomic_coord_end``
    are added from ``Start.bp`` / ``End.bp``.

    Side effects:
        self.seg_table: always set.
        self.het_table: set (overwritten) only in the ``'NULL'`` case.
    """
    if self.seg_file == 'NULL':
        seg_columns = ['Chromosome', 'Start.bp', 'End.bp', 'n_probes',
                       'length', 'f', 'tau', 'genomic_coord_start',
                       'genomic_coord_end']
        # 'ALT_COUNT_N' and 'ALT_COUNT_T' are two separate columns; without
        # the comma between them Python joins the adjacent literals into a
        # single 'ALT_COUNT_NALT_COUNT_T' column.
        het_columns = ['seg_id', 'tau', 'f', 'd', 'AF_T', 'AF_N',
                       'Chromosome', 'genomic_coord_x', 'ALT_COUNT_N',
                       'ALT_COUNT_T', 'REF_COUNT_N', 'REF_COUNT_T']
        self.seg_table = pd.DataFrame(index=[0], columns=seg_columns)
        self.het_table = pd.DataFrame(index=[0], columns=het_columns)
        return

    seg_header = du.read_file_header(self.seg_file)
    # Keep the first column as text so it can always go through chr2num.
    cols_seg_type = {seg_header[0]: str}
    # ``sep`` is passed by keyword: recent pandas releases reject positional
    # arguments after the file path.
    self.seg_table = pd.read_csv(self.seg_file, sep='\t', index_col=False,
                                 low_memory=False, comment='#',
                                 dtype=cols_seg_type)
    self.seg_table = du.fix_seg_file_header(self.seg_table)
    self.seg_table['Chromosome'] = du.chr2num(
        np.array(self.seg_table['Chromosome']))
    chromosomes = np.array(self.seg_table['Chromosome'])
    self.seg_table['genomic_coord_start'] = du.hg19_to_linear_positions(
        chromosomes, np.array(self.seg_table['Start.bp']))
    self.seg_table['genomic_coord_end'] = du.hg19_to_linear_positions(
        chromosomes, np.array(self.seg_table['End.bp']))
def read_seg_file(self):
    """Load the segment table from ``self.seg_file``.

    The file is parsed as tab-separated text with ``#`` comment lines
    ignored and its header passed through ``du.fix_seg_file_header``.
    ``Chromosome`` is converted with ``du.chr2num`` unless its first value
    is already a number (per ``du.is_number``), in which case every value is
    shifted down by one.  ``genomic_coord_start`` / ``genomic_coord_end``
    are then added from ``Start.bp`` / ``End.bp``.

    Side effects:
        self.seg_table: the parsed and augmented table.
    """
    # ``sep`` is passed by keyword: recent pandas releases reject positional
    # arguments after the file path.
    seg_table = pd.read_csv(self.seg_file, sep='\t', index_col=False,
                            low_memory=False, comment='#')
    seg_table = du.fix_seg_file_header(seg_table)

    if du.is_number(seg_table['Chromosome'][0]):
        seg_table['Chromosome'] = seg_table['Chromosome'] - 1
    else:
        seg_table['Chromosome'] = du.chr2num(
            np.array(seg_table['Chromosome']))

    chromosomes = np.array(seg_table['Chromosome'])
    seg_table['genomic_coord_start'] = du.hg19_to_linear_positions(
        chromosomes, np.array(seg_table['Start.bp']))
    seg_table['genomic_coord_end'] = du.hg19_to_linear_positions(
        chromosomes, np.array(seg_table['End.bp']))
    self.seg_table = seg_table
def __init__(self, candidate_sites, p_somatic, resolution=101, f_thresh=0.15,
             depth=15, hot_spots_file='NA', skew=0.5):
    """Set up the per-site data and the TiN grid for the SSNV fit.

    Args:
        candidate_sites: table-like object indexed by column name; the
            columns read here are ``contig``, ``position``,
            ``genomic_coord_x``, ``n_alt_count``, ``n_ref_count``,
            ``t_alt_count``, ``t_ref_count``, ``f_acs`` and ``tau``.
        p_somatic: prior value broadcast to every site in ``self.p_somatic``.
        resolution: number of evenly spaced points in ``self.TiN_range``
            (0 to 1 inclusive).
        f_thresh: a site is a candidate only if its tumor allele fraction
            is above this value.
        depth: a site is a candidate only if both tumor and normal depth
            are above this value.
        hot_spots_file: ``'NA'`` to skip, otherwise the path of a
            tab-separated file with ``Chromosome``, ``Position`` and
            ``Probability`` columns; matching sites get ``Probability`` as
            their ``p_somatic``.
        skew: tumor allele fractions below this value get a het direction
            of -1, all others +1.
    """
    # variables follow notation:
    # ac = allele count, n = normal, t = tumor

    # Variables for SSNV fit
    self.TiN_range = np.linspace(0, 1, num=resolution)
    self.af = np.linspace(0.005, 1, num=200)

    # observed data
    self.contig = candidate_sites['contig']
    self.position = candidate_sites['position']
    self.genomic_coord_x = candidate_sites['genomic_coord_x']
    self.n_alt_count = np.array(candidate_sites['n_alt_count'])
    self.n_ref_count = np.array(candidate_sites['n_ref_count'])
    self.n_depth = self.n_alt_count + self.n_ref_count
    # nan_to_num turns the 0/0 of an uncovered normal site into 0.
    self.normal_f = np.nan_to_num(
        np.true_divide(self.n_alt_count, self.n_depth))
    self.t_alt_count = np.array(candidate_sites['t_alt_count'])
    self.t_ref_count = np.array(candidate_sites['t_ref_count'])
    self.t_depth = self.t_alt_count + self.t_ref_count
    self.tumor_f = np.true_divide(self.t_alt_count, self.t_depth)
    self.number_of_sites = len(self.n_alt_count)
    self.candidate_sites = np.logical_and(
        np.logical_and(self.tumor_f > f_thresh, self.t_depth > depth),
        self.n_depth > depth)

    # hyperparameter
    self.p_somatic = np.zeros([self.number_of_sites, 1]) + p_somatic
    if hot_spots_file != 'NA':
        hot_spots = pd.read_csv(hot_spots_file, sep='\t', low_memory=False,
                                index_col=False)
        if isinstance(hot_spots['Chromosome'][0], str):
            hot_spots['contig'] = du.chr2num(
                np.array(hot_spots['Chromosome']))
        else:
            hot_spots['contig'] = np.array(hot_spots['Chromosome']) - 1
        # Copy after filtering so the column assignment below writes to an
        # independent frame rather than to a slice of the unfiltered one.
        hot_spots = hot_spots[np.isfinite(hot_spots['contig'])].copy()
        hot_spots['genomic_coord_x'] = du.hg19_to_linear_positions(
            np.array(hot_spots['contig']), np.array(hot_spots['Position']))
        site_coords = np.array(self.genomic_coord_x)
        for _, hot_spot in hot_spots.iterrows():
            at_hot_spot = site_coords == hot_spot['genomic_coord_x']
            if at_hot_spot.any():
                print('Using user provided probabilities for cancer hot spots:')
                # Format rather than concatenate: Position (and possibly
                # Chromosome) is numeric, so ``str + number`` would raise.
                print('%s %s' % (hot_spot['Chromosome'],
                                 hot_spot['Position']))
                self.p_somatic[at_hot_spot] = hot_spot['Probability']

    # parameter
    self.TiN = 0
    self.CI_tin_high = []
    self.CI_tin_low = []
    self.E_z = np.zeros([self.number_of_sites, 1])
    self.skew = skew

    # expected allele fraction of minor allele given allelic copy data
    self.psi = .5 - np.array(candidate_sites['f_acs'])
    # -1 where the tumor allele fraction is below ``skew``, +1 elsewhere.
    self.t_het_direction = np.where(self.tumor_f < self.skew, -1, 1)

    # determine ratio of tumor to normal copies given tau and TiN at each locus
    self.tau = candidate_sites['tau']
    # Convert to an ndarray before adding the axis: pandas Series do not
    # support multi-dimensional indexing.  Shape becomes (sites, 1) so the
    # product with TiN_range broadcasts to (sites, resolution).
    tau_column = np.array(candidate_sites['tau'])[:, np.newaxis]
    self.tin_correct_tau = np.multiply(self.TiN_range, tau_column)
    self.tin_correct_normal_tau = np.multiply((1 - self.TiN_range), 2)
    self.CN_ratio = np.divide(
        self.tin_correct_tau,
        np.array(self.tin_correct_tau + self.tin_correct_normal_tau))

    # random variables
    self.rv_normal_af = beta(self.n_alt_count + 1, self.n_ref_count + 1)
    self.rv_tumor_af = beta(self.t_alt_count + 1, self.t_ref_count + 1)

    # conditionals
    self.p_TiN_given_S = np.zeros([self.number_of_sites, resolution])
    self.p_TiN_given_G = np.zeros([self.number_of_sites, resolution])
    self.p_TiN_given_het = np.zeros([self.number_of_sites, resolution])
    self.p_artifact = np.zeros([self.number_of_sites, 1])

    # likelihood
    self.TiN_likelihood = np.zeros([resolution, 1])