def calculate_distance_distribution_in_bam(bam_fhand, max_clipping,
                                           max_distance=None):
    """Tally mate-pair distances found in a BAM file by pair orientation.

    The BAM at ``bam_fhand.name`` is opened and its alignments are grouped
    with ``_group_alignments_reads_by_qname`` and split into the two mates.
    Every combination of alignments accepted by
    ``_get_totally_mapped_alignments`` (given ``max_clipping``) that share
    the same ``rname`` is measured with ``_find_distance``.

    bam_fhand -- object whose ``name`` attribute is the BAM path.
    max_clipping -- passed through to ``_get_totally_mapped_alignments``.
    max_distance -- when not None, only distances strictly below it are
        counted.

    Returns a dict with the keys 'outies', 'innies' and 'others'; each value
    is an IntCounter indexed by distance.
    """
    stats = {
        'outies': IntCounter(),
        'innies': IntCounter(),
        'others': IntCounter()
    }
    bamfile = AlignmentFile(bam_fhand.name)
    try:
        for grouped_mates in _group_alignments_reads_by_qname(bamfile):
            mates = _split_mates(grouped_mates)
            for aligned_read1 in _get_totally_mapped_alignments(
                    mates[0], max_clipping):
                for aligned_read2 in _get_totally_mapped_alignments(
                        mates[1], max_clipping):
                    if aligned_read1.rname != aligned_read2.rname:
                        # Mates on different references have no distance.
                        continue
                    aligned_reads = [aligned_read1, aligned_read2]
                    distance = _find_distance(aligned_reads)
                    if _mates_are_outies(aligned_reads):
                        kind = 'outies'
                    elif _mates_are_innies(aligned_reads):
                        kind = 'innies'
                    else:
                        kind = 'others'
                    if max_distance is None or max_distance > distance:
                        stats[kind][distance] += 1
    finally:
        # The handle used to be leaked; release it even if a helper raises.
        bamfile.close()
    return stats
def test_draw_histogram_in_axes(self):
    """Draw a line histogram into axes, first without and then with ylimits."""
    base_values = [1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4]
    cases = (
        (base_values, {}),
        # ylimit test
        (base_values + [0, 5, 4, 4, 4, 4, 4], {'ylimits': (None, 4)}),
    )
    for values, extra_kwargs in cases:
        fhand = NamedTemporaryFile(suffix='.png')
        distrib = IntCounter(values).calculate_distribution()
        axes, canvas = draw_histogram_in_axes(distrib['counts'],
                                              distrib['bin_limits'],
                                              kind=LINE,
                                              distrib_label='test',
                                              **extra_kwargs)
        axes.legend()
        canvas.print_figure(fhand, format='png')
        fhand.flush()
def test__add__():
    """Adding two IntCounters yields the per-key sum of their counts."""
    left = IntCounter({6: 1, 2: 1})
    right = IntCounter({7: 1, 2: 1})
    total = left + right
    for key, expected in ((6, 1), (7, 1), (2, 2)):
        assert total[key] == expected
def calc_snv_read_pos_stats(sam, snvs, max_snps=None, max_pos=None):
    """Collect, per read group, where in the reads the SNVs fall (pileup).

    This implementation walks a single ``sam.pileup()`` iterator. The
    iterator is shared by all the SNVs and never rewound, so an SNV located
    before the current pileup position will not be found.

    sam -- alignment file offering ``pileup`` and ``getrname``.
    snvs -- iterable of objects with ``chrom``, ``pos`` and ``qual``.
    max_snps -- when truthy, stop after that many SNVs.
    max_pos -- when truthy, ignore read positions greater than it.

    Returns a dict with the keys '5_read_pos_counts', '3_read_pos_counts'
    (IntCounter per read group) and '5_read_pos_boxplot',
    '3_read_pos_boxplot' (IntBoxplot per read group). Reads without an RG
    tag are stored under the key None.

    Raises RuntimeError if no pileup column matches an SNV.
    """
    pileup_cols = sam.pileup()
    counts_5 = {}
    counts_3 = {}
    boxes_5 = {}
    boxes_3 = {}

    for snv_index, snv in enumerate(snvs):
        if max_snps and snv_index >= max_snps:
            break
        chrom, ref_pos, snv_qual = snv.chrom, snv.pos, snv.qual

        # Advance the shared pileup iterator up to the SNV column.
        for snv_col in pileup_cols:
            if (sam.getrname(snv_col.reference_id) == chrom and
                    snv_col.reference_pos == ref_pos):
                break
        else:
            raise RuntimeError('No pileup found for snv {}:{}'.format(
                chrom, ref_pos))

        site = (chrom, ref_pos)
        for pileup_read in snv_col.pileups:
            try:
                read_group = pileup_read.alignment.opt('RG')
            except KeyError:
                read_group = None
            coord = ReadRefCoord(pileup_read.alignment, sam)
            read_pos = coord.get_read_pos(site)
            read_pos_end = coord.get_read_pos_counting_from_end(site)

            if read_group not in counts_5:
                counts_5[read_group] = IntCounter()
                counts_3[read_group] = IntCounter()
                boxes_5[read_group] = IntBoxplot()
                boxes_3[read_group] = IntBoxplot()

            if read_pos is not None:
                # Positions are stored 1-based from the 5' end.
                pos_from_5 = read_pos + 1
                if not max_pos or pos_from_5 <= max_pos:
                    counts_5[read_group][pos_from_5] += 1
                    boxes_5[read_group].append(pos_from_5, snv_qual)

            if read_pos_end is not None:
                pos_from_3 = abs(read_pos_end)
                if not max_pos or pos_from_3 <= max_pos:
                    counts_3[read_group][pos_from_3] += 1
                    boxes_3[read_group].append(pos_from_3, snv_qual)

    return {
        '5_read_pos_counts': counts_5,
        '3_read_pos_counts': counts_3,
        '5_read_pos_boxplot': boxes_5,
        '3_read_pos_boxplot': boxes_3
    }
def test_counter():
    """Create counters and check their summary attributes."""
    # initialize with values, then add more through item assignment
    counter = IntCounter({2: 2})
    for value in (2, 6):
        counter[value] += 1
    assert counter.min == 2
    assert counter.max == 6

    counter = IntCounter({3: 1, 5: 1, 7: 2, 38: 1})
    expected = (('min', 3), ('max', 38), ('sum', 60), ('count', 5),
                ('median', 7))
    for attr_name, value in expected:
        assert getattr(counter, attr_name) == value
def calc_snv_read_pos_stats2(sam, snvs, max_snps=None, max_pos=None):
    """Collect, per read group, where in the reads the SNVs fall (fetch).

    This implementation asks ``sam.fetch(chrom, pos, pos + 1)`` for the
    reads of every SNV.

    sam -- alignment file offering ``fetch``.
    snvs -- iterable of objects with ``chrom``, ``pos`` and ``qual``.
    max_snps -- when truthy, stop after that many SNVs.
    max_pos -- when truthy, ignore read positions greater than it.

    Returns a dict with the keys '5_read_pos_counts', '3_read_pos_counts'
    (IntCounter per read group) and '5_read_pos_boxplot',
    '3_read_pos_boxplot' (IntBoxplot per read group). Reads without an RG
    tag are stored under the key None.
    """
    counts_5 = {}
    counts_3 = {}
    boxes_5 = {}
    boxes_3 = {}

    for snv_index, snv in enumerate(snvs):
        if max_snps and snv_index >= max_snps:
            break
        chrom, ref_pos, snv_qual = snv.chrom, snv.pos, snv.qual
        site = (chrom, ref_pos)

        for alignment_read in sam.fetch(chrom, ref_pos, ref_pos + 1):
            try:
                read_group = alignment_read.opt('RG')
            except KeyError:
                read_group = None
            coord = ReadRefCoord(alignment_read, sam)
            read_pos = coord.get_read_pos(site)
            read_pos_end = coord.get_read_pos_counting_from_end(site)

            if read_group not in counts_5:
                counts_5[read_group] = IntCounter()
                counts_3[read_group] = IntCounter()
                boxes_5[read_group] = IntBoxplot()
                boxes_3[read_group] = IntBoxplot()

            if read_pos is not None:
                # Positions are stored 1-based from the 5' end.
                pos_from_5 = read_pos + 1
                if not max_pos or pos_from_5 <= max_pos:
                    counts_5[read_group][pos_from_5] += 1
                    boxes_5[read_group].append(pos_from_5, snv_qual)

            if read_pos_end is not None:
                pos_from_3 = abs(read_pos_end)
                if not max_pos or pos_from_3 <= max_pos:
                    counts_3[read_group][pos_from_3] += 1
                    boxes_3[read_group].append(pos_from_3, snv_qual)

    return {
        '5_read_pos_counts': counts_5,
        '3_read_pos_counts': counts_3,
        '5_read_pos_boxplot': boxes_5,
        '3_read_pos_boxplot': boxes_3
    }
def test_histogram_plotter(self):
    """HistogramPlotter creates one axes per counter it is given."""
    values = [1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4]
    counters = [IntCounter(values)]
    plotter = HistogramPlotter(counters)
    assert len(plotter.axes) == len(counters)
    png_fhand = NamedTemporaryFile(suffix='.png')
    plotter.write_figure(png_fhand)

    # Add more intcounters and label every distribution
    counters.extend(IntCounter(values) for _ in range(2))
    plotter = HistogramPlotter(counters, distrib_labels=['1', '2', '3'])
    assert len(plotter.axes) == len(counters)
def mapped_count_by_rg(bam_fpaths, mapqx=None):
    """Count mapped and unmapped reads per read group in several BAM files.

    bam_fpaths -- iterable of BAM file paths.
    mapqx -- optional mapping quality; when given, reads with
        ``mapq >= mapqx`` are also tallied under 'bigger_mapqx'.

    Returns a dict that maps every read group ID (or the BAM base name when
    the header has no read groups) to an IntCounter with the keys
    'unmapped', 'mapped' and, if ``mapqx`` was given, 'bigger_mapqx'.
    """
    do_mapqx = mapqx is not None

    def _new_counter():
        # Zeroed counter with the keys that the caller expects to exist.
        counter = IntCounter({'unmapped': 0, 'mapped': 0})
        if do_mapqx:
            counter['bigger_mapqx'] = 0
        return counter

    counter_by_rg = {}
    for bam_fpath in bam_fpaths:
        # Always computed: it is also the fallback name for reads that
        # lack an RG tag. It used to be bound only for BAMs without read
        # groups, so such reads raised NameError or were credited to the
        # previous file.
        bam_basename = os.path.splitext(os.path.basename(bam_fpath))[0]
        bam = pysam.Samfile(bam_fpath, 'rb')
        try:
            readgroups = get_bam_readgroups(bam)
            if readgroups is None:
                readgroups = [bam_basename]
            else:
                readgroups = [rg['ID'] for rg in readgroups]
            for readgroup in readgroups:
                counter_by_rg[readgroup] = _new_counter()

            for read in bam:
                rg = get_rg_from_alignedread(read)
                if rg is None:
                    rg = bam_basename
                    if rg not in counter_by_rg:
                        counter_by_rg[rg] = _new_counter()
                counter = counter_by_rg[rg]
                if do_mapqx and read.mapq >= mapqx:
                    counter['bigger_mapqx'] += 1
                if read.is_unmapped:
                    counter['unmapped'] += 1
                else:
                    counter['mapped'] += 1
        finally:
            # The file handle used to be leaked.
            bam.close()
    return counter_by_rg
def calculate_coverage_distrib_in_region(self, region=None):
    """Return the coverage distribution, optionally averaged in windows.

    With a window of 1 the work is delegated to
    ``_calculate_complete_coverage_distrib`` with the given region (or
    None). Otherwise every region (all the references when ``region`` is
    None) is walked window by window and the rounded value returned for
    each sample by ``_calculate_coverages_in_win`` is tallied.

    Returns, in the windowed case, a dict of IntCounters indexed by sample.
    """
    if self.window == 1:
        return self._calculate_complete_coverage_distrib(region)

    if region is None:
        regions = [(ref, 0, le_ - 1) for ref, le_ in self._ref_lens.items()]
    else:
        regions = [region]

    counts = {}
    # NOTE(review): the start/end of each region are unpacked but never
    # used; the windows always span 0..self._ref_lens[chrom] -- confirm
    # that this is intended.
    for chrom, _region_start, _region_end in regions:
        windows = generate_windows(self.window, start=0,
                                   end=self._ref_lens[chrom], step=1)
        for win_start, win_end in windows:
            win_counts = self._calculate_coverages_in_win(chrom, win_start,
                                                          win_end)
            for sample, win_cov in win_counts.items():
                if sample not in counts:
                    counts[sample] = IntCounter()
                counts[sample][int(round(win_cov))] += 1
    return counts
def test_stats_functs(self):
    """It tests the statistical functions of the class."""
    ints = IntCounter({3: 1, 5: 1, 7: 2, 38: 1})
    assert ints.median == 7
    ints = IntCounter({3: 1, 5: 1, 7: 1, 38: 1})
    assert ints.median == 6

    # median with two middle numbers
    ext_array = IntCounter({3: 1, 5: 1, 7: 2})
    assert ext_array.median == 6

    ints = IntCounter({34: 1, 43: 1, 81: 1, 106: 2, 115: 1})
    # abs() is required: without it any average below 80.84 would pass.
    assert abs(ints.average - 80.83) < 0.01

    ext_counter = self.create_test_counter()
    assert ext_counter.median == 145
    assert round(ext_counter.average, 2) == 145.15
    assert ext_counter.sum == 13354
    assert ext_counter.count == 92
    assert round(ext_counter.variance, 2) == 557.43

    ints = IntCounter({3: 1, 4: 2, 5: 1, 6: 1, 8: 2})
    assert ints.median == 5
    assert ints.quartiles == (4, 5, 8)

    ints = IntCounter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1})
    assert ints.quartiles == (1.5, 3, 4.5)
    ints = IntCounter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1})
    assert ints.quartiles == (1.5, 3.5, 5.5)
    ints = IntCounter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1})
    assert ints.quartiles == (2, 4, 6)
    ints = IntCounter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1})
    assert ints.quartiles == (2.5, 4.5, 6.5)
    assert ints.irq == 4.0
    assert ints.outlier_limits == (-3, 12)

    # Asking for the quartiles of this three-value counter must raise
    # RuntimeError.
    try:
        ints = IntCounter({0: 1, 1: 1, 2: 1})
        assert ints.quartiles
        self.fail('RuntimeError')
    except RuntimeError:
        pass
def test_draw_histogram_in_fhand(self):
    """Draw the distribution of a counter into a temporary PNG file."""
    png_fhand = NamedTemporaryFile(suffix='.png')
    counter = IntCounter([1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4])
    distrib = counter.calculate_distribution()
    counts = distrib['counts']
    bin_limits = distrib['bin_limits']
    draw_histogram_in_fhand(counts, bin_limits, fhand=png_fhand)
def __init__(self, bam_fpaths, coverage_threshold, window, min_mapq=None):
    """Store the settings and prepare an empty score counter per sample.

    bam_fpaths, window and min_mapq are handed over to BamCoverages2; the
    samples it reports are the keys of ``self._scores``.
    """
    self._coverage_threshold = coverage_threshold
    self._window = window
    self._min_mapq = min_mapq
    coverages = BamCoverages2(bam_fpaths, min_mapq=min_mapq, window=window)
    self._bam_coverage = coverages
    scores = {}
    for sample in coverages.samples:
        scores[sample] = IntCounter()
    self._scores = scores
def __call__(self, snv):
    """Tally the genotype qualities of the SNV calls and filter the SNV.

    On the first SNV an empty IntCounter is created for the sample of every
    call. Calls without genotype quality are skipped. Returns the result of
    ``snv.remove_gt_from_low_qual_calls`` using ``self._min_qual``.
    """
    scores = self._scores
    if self._first_snv:
        for call in snv.calls:
            scores[call.sample] = IntCounter()
        self._first_snv = False

    for call in snv.calls:
        gt_qual = call.gt_qual
        if gt_qual is None:
            continue
        scores[call.sample][int(gt_qual)] += 1

    return snv.remove_gt_from_low_qual_calls(min_qual=self._min_qual)
def create_test_counter():
    """Build the IntCounter fixture from a stem-and-leaf description.

    Every integer is formed by joining a stem with one of the digits of its
    leaf string, e.g. stem '10' and leaves '288' give 102, 108 and 108.
    """
    stems_and_leaves = (
        ('9', '5'),
        ('10', '288'),
        ('11', '002556688'),
        ('12', '00012355555'),
        ('13', '0000013555688'),
        ('14', '00002555558'),
        ('15', '0000000000355555555557'),
        ('16', '000045'),
        ('17', '000055'),
        ('18', '0005'),
        ('19', '00005'),
        ('21', '5'),
    )
    counter = IntCounter()
    for stem, leaves in stems_and_leaves:
        for leaf in leaves:
            counter[int(stem + leaf)] += 1
    return counter
def _calculate_complete_coverage_distrib(self, region):
    """Return the per-sample distribution of per-column coverages.

    region -- None for no restriction, or a (chrom, start, end) tuple that
        is handed over to the pileup of every BAM.

    Returns a dict of IntCounters indexed by sample (plus the key None)
    counting how many pileup columns had each coverage, as reported by
    ``_count_reads_in_column``.
    """
    if region is None:
        chrom, start, end = None, None, None
    else:
        chrom, start, end = region

    min_mapq = self.min_mapq
    covs = {sample: IntCounter() for sample in self.samples}
    covs[None] = IntCounter()

    for bam in self._bams:
        columns = bam['bam'].pileup(reference=chrom, start=start, end=end,
                                    stepper=self.bam_pileup_stepper,
                                    truncate=True)
        one_sample, sample = self._if_one_sample_get_it(bam)
        for column in columns:
            col_counts = self._count_reads_in_column(column, min_mapq,
                                                     one_sample, sample)
            # A distinct loop name is used here: reusing ``sample`` would
            # overwrite the value passed to _count_reads_in_column for the
            # following columns.
            for col_sample, sample_cov in col_counts.items():
                covs[col_sample][sample_cov] += 1
    return covs
def get_genome_coverage(bam_fhands):
    """Return the genome-wide coverage histogram of several BAM files.

    ``bedtools genomecov -ibam <path>`` is run for every BAM and the lines
    that start with 'genome' are summed: the second tab-separated column is
    used as the coverage and the third as the amount added for it.

    bam_fhands -- iterable of objects whose ``name`` is a BAM path.

    Returns an IntCounter indexed by coverage.
    """
    coverage_hist = IntCounter()
    for bam_fhand in bam_fhands:
        cmd = [get_binary_path('bedtools'), 'genomecov', '-ibam',
               bam_fhand.name]
        # universal_newlines makes stdout yield text on Python 3 as well,
        # so the str comparisons below keep working.
        cover_process = Popen(cmd, stdout=PIPE, universal_newlines=True)
        try:
            for line in cover_process.stdout:
                if not line.startswith('genome'):
                    continue
                cov, value = line.split('\t')[1:3]
                coverage_hist[int(cov)] += int(value)
        finally:
            # Close the pipe and reap the child; both used to be leaked.
            cover_process.stdout.close()
            cover_process.wait()
    return coverage_hist
def calculate_distance_distribution(interleave_fhand, index_fpath,
                                    max_clipping, max_distance=None,
                                    tempdir=None, threads=None):
    """Map interleaved reads with bwa mem and tally the mate distances.

    The reads are mapped against ``index_fpath`` (extra bwa parameters
    ``-a -M``), written with ``map_process_to_sortedbam`` using
    key='queryname' to a temporary BAM, and every combination of alignments
    of both mates accepted by ``_get_totally_mapped_alignments`` that share
    the same ``rname`` is measured with ``_find_distance``.

    interleave_fhand -- object whose ``name`` is the interleaved reads path.
    index_fpath -- bwa index path.
    max_clipping -- passed through to ``_get_totally_mapped_alignments``.
    max_distance -- when not None, only distances strictly below it are
        counted.
    tempdir -- passed to ``map_process_to_sortedbam``.
    threads -- passed to ``map_with_bwamem``.

    Returns a dict with the keys 'outies', 'innies' and 'others'; each value
    is an IntCounter indexed by distance.
    """
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    bwa = map_with_bwamem(index_fpath,
                          interleave_fpath=interleave_fhand.name,
                          extra_params=extra_params, threads=threads)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)

    stats = {
        'outies': IntCounter(),
        'innies': IntCounter(),
        'others': IntCounter()
    }
    bamfile = AlignmentFile(bam_fhand.name)
    try:
        for grouped_mates in _group_alignments_reads_by_qname(bamfile):
            mates = _split_mates(grouped_mates)
            for aligned_read1 in _get_totally_mapped_alignments(
                    mates[0], max_clipping):
                for aligned_read2 in _get_totally_mapped_alignments(
                        mates[1], max_clipping):
                    if aligned_read1.rname != aligned_read2.rname:
                        # Mates on different references have no distance.
                        continue
                    aligned_reads = [aligned_read1, aligned_read2]
                    distance = _find_distance(aligned_reads)
                    if _mates_are_outies(aligned_reads):
                        kind = 'outies'
                    elif _mates_are_innies(aligned_reads):
                        kind = 'innies'
                    else:
                        kind = 'others'
                    if max_distance is None or max_distance > distance:
                        stats[kind][distance] += 1
    finally:
        # The handle used to be leaked; release it even if a helper raises.
        bamfile.close()
    return stats
def calculate_coverage_distrib_in_region(self, region=None):
    """Return the per-sample distribution of rounded coverages.

    For every read group of every BAM the values yielded by
    ``_get_coverages_in_bam_rg_win`` are rounded and tallied under the
    sample named by the read group field
    ``self.bam_rg_field_for_vcf_sample``.

    Returns a dict of IntCounters indexed by sample.
    """
    counts = {}
    for bam in self._bams:
        # True when the BAM has fewer than two read groups.
        one_rg = len(bam['rgs']) < 2
        for read_group in bam['rgs']:
            sample = read_group[self.bam_rg_field_for_vcf_sample]
            if sample not in counts:
                counts[sample] = IntCounter()
            sample_counts = counts[sample]
            coverages = self._get_coverages_in_bam_rg_win(region, bam,
                                                          read_group, one_rg)
            for cov in coverages:
                sample_counts[int(round(cov))] += 1
    return counts
def _get_sample_counter(self, kind, sample=None, gt_broud_type=None):
    """Return the counter of one sample, or the sum for all the samples.

    kind -- key of ``self._sample_counters``.
    sample -- when given, only the counter of that sample is returned.
    gt_broud_type -- when given, the per-sample entry is indexed by it
        before being returned or added.
    """
    counters = self._sample_counters[kind]

    if sample is not None:
        selected = counters[sample]
        return selected if gt_broud_type is None else selected[gt_broud_type]

    merged = IntCounter()
    for per_sample in counters.values():
        if gt_broud_type is not None:
            per_sample = per_sample[gt_broud_type]
        merged += per_sample
    return merged
def test_value_for_index_test(self):
    """We can get the integer stored at a given index."""
    # pylint: disable=W0212
    ints = IntCounter({3: 1, 5: 1, 7: 2, 38: 1})
    for index, expected in enumerate([3, 5, 7, 7, 38]):
        assert ints._get_value_for_index(index) == expected

    # One past the last index must raise IndexError
    try:
        assert ints._get_value_for_index(5) == 38
        self.fail('IndexError expected')
    except IndexError:
        pass
def test_distribution(self):
    """It tests the histogram function."""
    ints_counter = self.create_test_counter()
    distrib = ints_counter.calculate_distribution(bins=10,
                                                  outlier_threshold=5)

    # Plain int literals: the former long literals (7L, ...) are a syntax
    # error on Python 3 and compare equal to plain ints on Python 2.
    assert distrib['counts'] == [7, 13, 7, 10, 7, 22, 6, 4, 5, 5]
    assert distrib['bin_limits'] == [
        110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190
    ]
    assert 'average' in str(ints_counter)

    ints_counter = IntCounter({0: 2, 1: 1, 3: 1})
    result = [2, 1, 1]
    assert ints_counter.calculate_distribution(bins=3)['counts'] == result
def _count_reads(self):
    """Compute the RPKMs, the most abundant references and their lengths.

    The counts given by ``get_reference_counts`` for every BAM are turned
    into reads per kilobase, summed per reference and divided by the total
    number of reads in millions. The results are stored in ``self._rpkms``,
    ``self._most_abundant_refs`` and ``self._lengths``.

    Raises ValueError if the BAMs differ in their number of references and
    RuntimeError if a reference name differs from the one found at the same
    index in the first BAM.
    """
    nreferences = self._bams[0].nreferences
    rpks = zeros(nreferences)
    references = []
    length_counts = IntCounter()
    first_bam = True
    n_reads = 0
    for bam in self._bams:
        if bam.nreferences != nreferences:
            msg = 'BAM files should have the same references'
            raise ValueError(msg)
        for index, count in enumerate(get_reference_counts(bam.filename)):
            n_reads += count['unmapped_reads'] + count['mapped_reads']
            if count['reference'] is None:
                # some non-mapped reads have reference = None
                continue
            # Explicit float division: with Python 2 integer division the
            # kb length was truncated and became 0 for references shorter
            # than 1000 bp, raising ZeroDivisionError.
            kb_len = count['length'] / 1000.0
            rpk = count['mapped_reads'] / kb_len
            rpks[index] += rpk
            if first_bam:
                # The reference names and lengths are taken from the
                # first BAM
                references.append(count['reference'])
                length_counts[count['length']] += 1
            else:
                # the bams should be sorted with the references in the same
                # order
                if references[index] != count['reference']:
                    msg = 'The reference lengths do not match in the bams'
                    raise RuntimeError(msg)
        first_bam = False

    million_reads = n_reads / 1e6
    rpks /= million_reads  # rpkms
    self._rpkms = ArrayWrapper(rpks, bins=self._bins)

    abundant_refs = BestItemsKeeper(self._n_most_expressed_reads,
                                    izip(references, rpks),
                                    key=itemgetter(1))
    abundant_refs = [{
        'reference': i[0],
        'rpkm': i[1]
    } for i in abundant_refs]
    self._most_abundant_refs = abundant_refs
    self._lengths = length_counts
def tests_draw_histograms(self):
    """Draw three histograms, two per chart, without and with ylimits."""
    fhand = NamedTemporaryFile(suffix='.png')

    values = [1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4]
    counters = [IntCounter(values) for _ in range(3)]
    draw_histograms(counters, fhand, titles=['t1', 't2', 't3'],
                    plots_per_chart=2)

    # Same drawing with explicit y limits
    values = [1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4, 0, 5, 4, 4, 4, 4, 4]
    counters = [IntCounter(values) for _ in range(3)]
    draw_histograms(counters, fhand, titles=['t1', 't2', 't3'],
                    plots_per_chart=2, ylimits=(0, 14))
def test_n50_calculation():
    """It calculates N50 and other Nx values."""
    cases = (
        ([2, 2, 2, 3, 3, 4, 8, 8], 50, 8),
        ([2, 2, 2, 3, 3, 4, 8, 8], 95, 2),
        ([8, 8, 8, 8, 8, 8, 8, 8], 50, 8),
    )
    for lengths, percentage, expected in cases:
        assert calculate_nx(IntCounter(lengths), percentage) == expected

    # An empty counter has no Nx
    assert calculate_nx(IntCounter(), 50) is None
def __init__(self, vcf_fpath, gq_threshold=None, dp_threshold=100,
             min_calls_for_pop_stats=DEF_MIN_CALLS_FOR_POP_STATS,
             remarkable_coverages=None, window_size=WINDOWS_SIZE):
    """Prepare the counters and calculate the statistics of a VCF file.

    vcf_fpath -- path of the VCF; it is opened for a VCFReader and for a
        pyvcfReader.
    gq_threshold -- stored as 0 when None.
    dp_threshold -- stored as given.
    min_calls_for_pop_stats -- handed over to the VCFReader.
    remarkable_coverages -- depths that get their own counter in
        ``sample_dp_coincidence``; REMARKABLE_DEPTHS is used when None.
    window_size -- stored as given.

    ``self._calculate()`` is run at the end of the initialization.
    """
    # The given value used to be ignored, leaving remarkable_depths
    # unbound (UnboundLocalError) whenever the argument was passed.
    if remarkable_coverages is None:
        remarkable_depths = REMARKABLE_DEPTHS
    else:
        remarkable_depths = remarkable_coverages
    self.remarkable_depths = remarkable_depths

    self._reader = VCFReader(
        open(vcf_fpath), min_calls_for_pop_stats=min_calls_for_pop_stats)
    self._random_reader = pyvcfReader(filename=vcf_fpath)

    self.window_size = window_size
    self._gq_threshold = 0 if gq_threshold is None else gq_threshold
    self.dp_threshold = dp_threshold
    self._gt_qual_depth_counter = {HOM: IntBoxplot(), HET: IntBoxplot()}
    self._ac2d = _AlleleCounts2D()

    self.sample_dp_coincidence = {1: IntCounter()}
    for cov in remarkable_depths:
        self.sample_dp_coincidence[cov] = IntCounter()

    self.called_snvs = 0
    self.called_gts = IntCounter()

    # sample_counter
    self._sample_counters = {}
    for counter_name in SAMPLE_COUNTERS:
        if counter_name not in self._sample_counters:
            self._sample_counters[counter_name] = {}
        for sample in self._reader.samples:
            if counter_name in (GT_DEPTHS, GT_QUALS):
                # These counters are kept apart for HOM and HET.
                counters = {HOM: IntCounter(), HET: IntCounter()}
            else:
                counters = IntCounter()
            self._sample_counters[counter_name][sample] = counters

    self._snv_counters = {
        MAFS: IntCounter(),
        MACS: IntCounter(),
        MAFS_DP: IntCounter(),
        SNV_QUALS: IntCounter(),
        HET_IN_SNP: IntCounter(),
        SNV_DENSITY: IntCounter(),
        INBREED_F_IN_SNP: IntCounter(),
        DEPTHS: IntCounter()
    }
    self._calculate()
def __init__(self, bam_fhands, mapqs=MAPQS_TO_CALCULATE):
    """Create an empty counter per mapq and run the calculation.

    bam_fhands -- stored as given for ``self._calculate``.
    mapqs -- values that get their own IntCounter in ``self._counters``.
    """
    self._bam_fhands = bam_fhands
    self.mapqs_to_calculate = mapqs
    counters = {}
    for mapq in mapqs:
        counters[mapq] = IntCounter()
    self._counters = counters
    self._calculate()
def __init__(self, bams):
    """Store the bams, create the empty counters and count the mapqs."""
    # TODO flag, read_group
    self._flag_counts = dict()
    self._mapqs = IntCounter()
    self._bams = bams
    self._count_mapqs()
def test_draw_histogram_in_fhand(self):
    """Draw the histogram of a counter into a temporary PNG file."""
    png_fhand = NamedTemporaryFile(suffix='.png')
    counter = IntCounter([1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4])
    draw_histogram_in_fhand(counter, fhand=png_fhand)
def test_sum_with_treshold_function():
    """It tests the function that calculates Q30 and Q20."""
    quals = IntCounter({15: 10, 21: 13, 30: 12})
    for threshold, expected in ((20, 25), (30, 12)):
        assert quals.count_relative_to_value(threshold,
                                             operator.ge) == expected