def calc_snv_read_pos_stats(sam, snvs, max_snps=None, max_pos=None):
    """Tally, per read group, where each SNV falls inside the covering reads.

    This variant walks the columns produced by ``sam.pileup()``.

    The pileup object is created once and advanced across all SNVs, so the
    SNVs are presumably expected in the same order as the pileup columns
    (confirm against callers).

    Parameters:
        sam: alignment file object providing ``pileup()`` and ``getrname()``.
        snvs: iterable of objects exposing ``chrom``, ``pos`` and ``qual``.
        max_snps: if truthy, stop once this many SNVs have been processed.
        max_pos: if truthy, positions greater than this value are ignored.

    Returns:
        A dict with the keys '5_read_pos_counts', '3_read_pos_counts',
        '5_read_pos_boxplot' and '3_read_pos_boxplot'; each value maps a read
        group (None when the read has no RG tag) to an IntCounter or
        IntBoxplot.

    Raises:
        RuntimeError: if no pileup column matches an SNV.
    """
    columns = sam.pileup()
    counts_5, counts_3, boxes_5, boxes_3 = {}, {}, {}, {}

    for snv_idx, snv in enumerate(snvs):
        if max_snps and snv_idx >= max_snps:
            break

        chrom, ref_pos, snv_qual = snv.chrom, snv.pos, snv.qual
        site = (chrom, ref_pos)

        # Advance the shared pileup up to the column that sits on this SNV.
        column = next(
            (col for col in columns
             if sam.getrname(col.reference_id) == chrom
             and col.reference_pos == ref_pos),
            None)
        if column is None:
            raise RuntimeError('No pileup found for snv {}:{}'.format(
                chrom, ref_pos))

        for pileup_read in column.pileups:
            alignment = pileup_read.alignment
            try:
                group = alignment.opt('RG')
            except KeyError:
                # Reads without an RG tag are pooled under the None key.
                group = None

            coord = ReadRefCoord(alignment, sam)
            pos_from_start = coord.get_read_pos(site)
            pos_from_end = coord.get_read_pos_counting_from_end(site)

            if group not in counts_5:
                counts_5[group] = IntCounter()
                counts_3[group] = IntCounter()
                boxes_5[group] = IntBoxplot()
                boxes_3[group] = IntBoxplot()

            if pos_from_start is not None:
                # Stored shifted by one (presumably 0-based -> 1-based).
                pos_5 = pos_from_start + 1
                if not max_pos or pos_5 <= max_pos:
                    counts_5[group][pos_5] += 1
                    boxes_5[group].append(pos_5, snv_qual)

            if pos_from_end is not None:
                # Only the magnitude of the from-the-end position is kept.
                pos_3 = abs(pos_from_end)
                if not max_pos or pos_3 <= max_pos:
                    counts_3[group][pos_3] += 1
                    boxes_3[group].append(pos_3, snv_qual)

    return {
        '5_read_pos_counts': counts_5,
        '3_read_pos_counts': counts_3,
        '5_read_pos_boxplot': boxes_5,
        '3_read_pos_boxplot': boxes_3,
    }
def __init__(self, vcf_fpath, gq_threshold=None, dp_threshold=100,
             min_calls_for_pop_stats=DEF_MIN_CALLS_FOR_POP_STATS,
             remarkable_coverages=None, window_size=WINDOWS_SIZE):
    """Open the VCF, set up every counter and run the calculation.

    Parameters:
        vcf_fpath: path of the VCF file; it is opened for the sequential
            VCFReader and also handed to pyvcfReader as ``filename``.
        gq_threshold: genotype quality threshold; None is stored as 0.
        dp_threshold: depth threshold, stored as given.
        min_calls_for_pop_stats: forwarded to VCFReader.
        remarkable_coverages: depths for which a sample_dp_coincidence
            counter is created; defaults to REMARKABLE_DEPTHS when None.
        window_size: stored as ``self.window_size``.

    The constructor ends by calling ``self._calculate()``.
    """
    # Bind remarkable_depths on both branches: previously it was only
    # assigned when no coverages were given, so passing explicit
    # coverages raised UnboundLocalError further down.
    if remarkable_coverages is None:
        remarkable_depths = REMARKABLE_DEPTHS
    else:
        remarkable_depths = remarkable_coverages
    self.remarkable_depths = remarkable_depths

    # NOTE(review): this file handle is never closed here; presumably the
    # reader needs it open while it iterates -- confirm who owns it.
    self._reader = VCFReader(
        open(vcf_fpath), min_calls_for_pop_stats=min_calls_for_pop_stats)
    self._random_reader = pyvcfReader(filename=vcf_fpath)

    self.window_size = window_size
    self._gq_threshold = 0 if gq_threshold is None else gq_threshold
    self.dp_threshold = dp_threshold

    self._gt_qual_depth_counter = {HOM: IntBoxplot(), HET: IntBoxplot()}
    self._ac2d = _AlleleCounts2D()

    # A counter for depth 1 always exists, plus one per remarkable depth.
    self.sample_dp_coincidence = {1: IntCounter()}
    for cov in remarkable_depths:
        self.sample_dp_coincidence[cov] = IntCounter()

    self.called_snvs = 0
    self.called_gts = IntCounter()

    # sample_counter: counter name -> sample -> counter(s)
    self._sample_counters = {}
    for counter_name in SAMPLE_COUNTERS:
        per_sample = self._sample_counters.setdefault(counter_name, {})
        for sample in self._reader.samples:
            if counter_name in (GT_DEPTHS, GT_QUALS):
                # These two are kept separately for HOM and HET genotypes.
                counters = {HOM: IntCounter(), HET: IntCounter()}
            else:
                counters = IntCounter()
            per_sample[sample] = counters

    self._snv_counters = {
        MAFS: IntCounter(),
        MACS: IntCounter(),
        MAFS_DP: IntCounter(),
        SNV_QUALS: IntCounter(),
        HET_IN_SNP: IntCounter(),
        SNV_DENSITY: IntCounter(),
        INBREED_F_IN_SNP: IntCounter(),
        DEPTHS: IntCounter(),
    }
    self._calculate()
def test_int_boxplot(self):
    """Smoke test: draw an IntBoxplot with two categories into a PNG file.

    Nothing is asserted; the test passes if drawing raises no exception.
    """
    box = IntBoxplot()
    for category, value in ((1, 50), (1, 40), (1, 30), (1, 40),
                            (2, 30), (2, 10), (2, 20), (2, 40)):
        box.append(category, value)

    # The context manager closes (and thereby removes) the temporary file
    # deterministically instead of leaving it to garbage collection.
    with NamedTemporaryFile(suffix='.png') as fhand:
        draw_int_boxplot(box, fhand=fhand)
def calc_snv_read_pos_stats2(sam, snvs, max_snps=None, max_pos=None):
    """Tally, per read group, where each SNV falls inside the covering reads.

    This variant asks ``sam.fetch()`` for the reads over each SNV.

    Parameters:
        sam: alignment file object providing ``fetch(chrom, start, end)``.
        snvs: iterable of objects exposing ``chrom``, ``pos`` and ``qual``.
        max_snps: if truthy, stop once this many SNVs have been processed.
        max_pos: if truthy, positions greater than this value are ignored.

    Returns:
        A dict with the keys '5_read_pos_counts', '3_read_pos_counts',
        '5_read_pos_boxplot' and '3_read_pos_boxplot'; each value maps a read
        group (None when the read has no RG tag) to an IntCounter or
        IntBoxplot.
    """
    counts_5, counts_3, boxes_5, boxes_3 = {}, {}, {}, {}

    for snv_idx, snv in enumerate(snvs):
        if max_snps and snv_idx >= max_snps:
            break

        chrom, ref_pos, snv_qual = snv.chrom, snv.pos, snv.qual
        site = (chrom, ref_pos)

        # Only the reads overlapping the single SNV position are requested.
        for read in sam.fetch(chrom, ref_pos, ref_pos + 1):
            try:
                group = read.opt('RG')
            except KeyError:
                # Reads without an RG tag are pooled under the None key.
                group = None

            coord = ReadRefCoord(read, sam)
            pos_from_start = coord.get_read_pos(site)
            pos_from_end = coord.get_read_pos_counting_from_end(site)

            if group not in counts_5:
                counts_5[group] = IntCounter()
                counts_3[group] = IntCounter()
                boxes_5[group] = IntBoxplot()
                boxes_3[group] = IntBoxplot()

            if pos_from_start is not None:
                # Stored shifted by one (presumably 0-based -> 1-based).
                pos_5 = pos_from_start + 1
                if not max_pos or pos_5 <= max_pos:
                    counts_5[group][pos_5] += 1
                    boxes_5[group].append(pos_5, snv_qual)

            if pos_from_end is not None:
                # Only the magnitude of the from-the-end position is kept.
                pos_3 = abs(pos_from_end)
                if not max_pos or pos_3 <= max_pos:
                    counts_3[group][pos_3] += 1
                    boxes_3[group].append(pos_3, snv_qual)

    return {
        '5_read_pos_counts': counts_5,
        '3_read_pos_counts': counts_3,
        '5_read_pos_boxplot': boxes_5,
        '3_read_pos_boxplot': boxes_3,
    }
def test_boxplot(self):
    'Check the aggregated counts and the ASCII plot of an IntBoxplot'
    observations = [
        (1, 50), (1, 40), (1, 30), (1, 40),
        (2, 30), (2, 10), (2, 20), (2, 40),
        ('no distrib', 40),
    ]
    box = IntBoxplot()
    for category, value in observations:
        box.append(category, value)

    # Every appended value must be accounted for in the aggregation.
    total_values = sum(box.aggregated_array.values())
    assert total_values == 9

    expected_row = '2:10.0,15.0,25.0,35.0,40.0 <-----[============|======='
    assert expected_row in box.ascii_plot