Exemple #1
0
    def test_draw_histogram_in_axes(self):
        """Draw line histograms into axes, first plain and then with y limits.

        Each scenario builds a distribution from an IntCounter, draws it,
        adds a legend and writes the figure to a temporary PNG file.
        """
        base_values = [1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4]
        scenarios = [
            # default plot, no extra drawing options
            (base_values, {}),
            # ylimit test
            (base_values + [0, 5, 4, 4, 4, 4, 4], {'ylimits': (None, 4)}),
        ]
        for values, extra_options in scenarios:
            fhand = NamedTemporaryFile(suffix='.png')
            distrib = IntCounter(values).calculate_distribution()
            axes, canvas = draw_histogram_in_axes(distrib['counts'],
                                                  distrib['bin_limits'],
                                                  kind=LINE,
                                                  distrib_label='test',
                                                  **extra_options)
            axes.legend()
            canvas.print_figure(fhand, format='png')
            fhand.flush()
Exemple #2
0
 def test_draw_histogram_in_fhand(self):
     """Draw the distribution of a small integer sample into a PNG file."""
     png_fhand = NamedTemporaryFile(suffix='.png')
     sample = [1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4]
     distribution = IntCounter(sample).calculate_distribution()
     counts = distribution['counts']
     bin_limits = distribution['bin_limits']
     draw_histogram_in_fhand(counts, bin_limits, fhand=png_fhand)
Exemple #3
0
def calculate_distance_distribution_in_bam(bam_fhand,
                                           max_clipping,
                                           max_distance=None):
    """Count mate distances found in a BAM file, split by pair orientation.

    bam_fhand -- object whose ``name`` attribute is the path of the BAM file
    max_clipping -- forwarded to _get_totally_mapped_alignments
    max_distance -- when given, only distances strictly below it are counted

    Returns a dict with the keys 'outies', 'innies' and 'others'; each value
    is an IntCounter indexed by distance.
    """
    bamfile = AlignmentFile(bam_fhand.name)
    stats = {}
    for orientation in ('outies', 'innies', 'others'):
        stats[orientation] = IntCounter()

    for alignments in _group_alignments_reads_by_qname(bamfile):
        mates = _split_mates(alignments)
        for first in _get_totally_mapped_alignments(mates[0], max_clipping):
            for second in _get_totally_mapped_alignments(mates[1],
                                                         max_clipping):
                # only pairs mapped to the same reference have a distance
                if first.rname != second.rname:
                    continue
                pair = [first, second]
                distance = _find_distance(pair)
                if _mates_are_outies(pair):
                    kind = 'outies'
                elif _mates_are_innies(pair):
                    kind = 'innies'
                else:
                    kind = 'others'
                if max_distance is not None and not max_distance > distance:
                    continue
                stats[kind][distance] += 1
    return stats
Exemple #4
0
    def test__add__():
        """Adding two IntCounters sums the counts stored under each key."""
        left = IntCounter({6: 1, 2: 1})
        right = IntCounter({7: 1, 2: 1})

        total = left + right
        # 6 and 7 come from one operand each, 2 is present in both
        for value, expected_count in ((6, 1), (7, 1), (2, 2)):
            assert total[value] == expected_count
Exemple #5
0
 def test_draw_histogram_in_fhand(self):
     """Write the histogram of a counter distribution to a temporary PNG."""
     out_fhand = NamedTemporaryFile(suffix='.png')
     counter = IntCounter([1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4])
     distrib = counter.calculate_distribution()
     plot_args = (distrib['counts'], distrib['bin_limits'])
     draw_histogram_in_fhand(*plot_args, fhand=out_fhand)
Exemple #6
0
def calc_snv_read_pos_stats(sam, snvs, max_snps=None, max_pos=None):
    """Tally where in the reads each SNV falls, using the sam pileup.

    sam -- alignment file object providing pileup() and getrname()
    snvs -- iterable of objects with chrom, pos and qual attributes
    max_snps -- when truthy, stop after that many SNVs
    max_pos -- when truthy, positions greater than it are not recorded

    Returns a dict with four entries ('5_read_pos_counts',
    '3_read_pos_counts', '5_read_pos_boxplot', '3_read_pos_boxplot'), each a
    dict indexed by read group (None for reads without an RG tag).

    Raises RuntimeError when no pileup column matches an SNV.
    """
    pileup_cols = sam.pileup()
    counts_from_start = {}
    counts_from_end = {}
    boxes_from_start = {}
    boxes_from_end = {}

    for snv_index, snv in enumerate(snvs):
        if max_snps and snv_index >= max_snps:
            break
        chrom, ref_pos, snv_qual = snv.chrom, snv.pos, snv.qual
        location = (chrom, ref_pos)

        # The pileup iterator is shared by all the SNVs, so the search for
        # each SNV resumes where the previous one stopped.
        snv_col = None
        for col in pileup_cols:
            col_chrom = sam.getrname(col.reference_id)
            if col_chrom == chrom and col.reference_pos == ref_pos:
                snv_col = col
                break
        if snv_col is None:
            msg = 'No pileup found for snv {}:{}'.format(chrom, ref_pos)
            raise RuntimeError(msg)

        for pileup_read in snv_col.pileups:
            try:
                read_group = pileup_read.alignment.opt('RG')
            except KeyError:
                read_group = None
            coord = ReadRefCoord(pileup_read.alignment, sam)
            read_pos = coord.get_read_pos(location)
            read_pos_end = coord.get_read_pos_counting_from_end(location)

            # the four containers of a read group are created together
            if read_group not in counts_from_start:
                counts_from_start[read_group] = IntCounter()
                counts_from_end[read_group] = IntCounter()
                boxes_from_start[read_group] = IntBoxplot()
                boxes_from_end[read_group] = IntBoxplot()

            if read_pos is not None:
                pos_from_start = read_pos + 1
                if not max_pos or pos_from_start <= max_pos:
                    counts_from_start[read_group][pos_from_start] += 1
                    boxes_from_start[read_group].append(pos_from_start,
                                                        snv_qual)
            if read_pos_end is not None:
                pos_from_end = abs(read_pos_end)
                if not max_pos or pos_from_end <= max_pos:
                    counts_from_end[read_group][pos_from_end] += 1
                    boxes_from_end[read_group].append(pos_from_end, snv_qual)

    return {
        '5_read_pos_counts': counts_from_start,
        '3_read_pos_counts': counts_from_end,
        '5_read_pos_boxplot': boxes_from_start,
        '3_read_pos_boxplot': boxes_from_end
    }
 def test_value_for_index_test(self):
     """The counter returns the integer found at a given index."""
     # pylint: disable=W0212
     ints = IntCounter({3: 1, 5: 1, 7: 2, 38: 1})
     # 7 is stored twice, so it answers for two consecutive indexes
     for index, expected in enumerate([3, 5, 7, 7, 38]):
         assert ints._get_value_for_index(index) == expected
     try:
         past_the_end = ints._get_value_for_index(5)
     except IndexError:
         pass
     else:
         assert past_the_end == 38
         self.fail('IndexError expected')
Exemple #8
0
    def test_counter():
        """Create counters and check their basic summary attributes."""
        # initialize with values, then keep counting
        grown = IntCounter({2: 2})
        for value in (2, 6):
            grown[value] += 1
        assert grown.min == 2
        assert grown.max == 6

        counter = IntCounter({3: 1, 5: 1, 7: 2, 38: 1})
        expectations = [('min', 3), ('max', 38), ('sum', 60), ('count', 5),
                        ('median', 7)]
        for attribute, expected in expectations:
            assert getattr(counter, attribute) == expected
Exemple #9
0
def calc_snv_read_pos_stats2(sam, snvs, max_snps=None, max_pos=None):
    """Tally where in the reads each SNV falls, using sam.fetch per SNV.

    sam -- alignment file object providing fetch()
    snvs -- iterable of objects with chrom, pos and qual attributes
    max_snps -- when truthy, stop after that many SNVs
    max_pos -- when truthy, positions greater than it are not recorded

    Returns a dict with four entries ('5_read_pos_counts',
    '3_read_pos_counts', '5_read_pos_boxplot', '3_read_pos_boxplot'), each a
    dict indexed by read group (None for reads without an RG tag).
    """
    cnts_5 = {}
    cnts_3 = {}
    boxes_5 = {}
    boxes_3 = {}

    def containers_for(read_group):
        """Return the four containers of a read group, creating them once."""
        if read_group not in cnts_5:
            cnts_5[read_group] = IntCounter()
            cnts_3[read_group] = IntCounter()
            boxes_5[read_group] = IntBoxplot()
            boxes_3[read_group] = IntBoxplot()
        return (cnts_5[read_group], cnts_3[read_group],
                boxes_5[read_group], boxes_3[read_group])

    for position, snv in enumerate(snvs):
        if max_snps and position >= max_snps:
            break
        chrom = snv.chrom
        ref_pos = snv.pos
        snv_qual = snv.qual
        site = (chrom, ref_pos)
        for alignment_read in sam.fetch(chrom, ref_pos, ref_pos + 1):
            try:
                read_group = alignment_read.opt('RG')
            except KeyError:
                read_group = None

            coord = ReadRefCoord(alignment_read, sam)
            read_pos = coord.get_read_pos(site)
            read_pos_end = coord.get_read_pos_counting_from_end(site)
            cnt_5, cnt_3, box_5, box_3 = containers_for(read_group)

            record_5 = (read_pos is not None
                        and (not max_pos or read_pos + 1 <= max_pos))
            if record_5:
                cnt_5[read_pos + 1] += 1
                box_5.append(read_pos + 1, snv_qual)

            record_3 = (read_pos_end is not None
                        and (not max_pos or abs(read_pos_end) <= max_pos))
            if record_3:
                cnt_3[abs(read_pos_end)] += 1
                box_3.append(abs(read_pos_end), snv_qual)

    return {
        '5_read_pos_counts': cnts_5,
        '3_read_pos_counts': cnts_3,
        '5_read_pos_boxplot': boxes_5,
        '3_read_pos_boxplot': boxes_3
    }
Exemple #10
0
    def calculate_coverage_distrib_in_region(self, region=None):
        """Return the coverage distribution per sample as IntCounters.

        region -- (chrom, start, end) tuple, or None for every reference

        With a window of 1 the work is delegated to
        _calculate_complete_coverage_distrib. Otherwise the coverage of every
        window is rounded to an int and counted per sample.
        """
        if self.window == 1:
            # a single region, or None, is handed over untouched
            return self._calculate_complete_coverage_distrib(region)

        if region is None:
            regions = [(ref, 0, length - 1)
                       for ref, length in self._ref_lens.items()]
        else:
            regions = [region]

        counts = {}
        for chrom, _, _ in regions:
            # NOTE(review): the windows always span the whole reference, the
            # start and end of the region are not used here -- confirm.
            windows = generate_windows(self.window,
                                       start=0,
                                       end=self._ref_lens[chrom],
                                       step=1)
            for win_start, win_end in windows:
                covs_in_win = self._calculate_coverages_in_win(
                    chrom, win_start, win_end)
                for sample, cov in covs_in_win.items():
                    if sample not in counts:
                        counts[sample] = IntCounter()
                    counts[sample][int(round(cov))] += 1

        return counts
Exemple #11
0
def mapped_count_by_rg(bam_fpaths, mapqx=None):
    """Count mapped and unmapped reads per read group in several BAM files.

    bam_fpaths -- iterable with the paths of the BAM files
    mapqx -- optional mapq threshold; when given, every counter also gets a
             'bigger_mapqx' entry with the reads whose mapq is >= mapqx

    Returns a dict indexed by read group ID. Reads without a read group are
    counted under the BAM file basename (without extension).
    """
    do_mapqx = mapqx is not None

    def new_counter():
        """Build an empty counter with the keys used by this function."""
        counter = IntCounter({'unmapped': 0, 'mapped': 0})
        if do_mapqx:
            counter['bigger_mapqx'] = 0
        return counter

    counter_by_rg = {}
    for bam_fpath in bam_fpaths:
        # Computed for every BAM: it is also the fallback group for reads
        # that carry no RG tag in a BAM whose header declares read groups.
        bam_basename = os.path.splitext(os.path.basename(bam_fpath))[0]
        bam = pysam.Samfile(bam_fpath, 'rb')
        try:
            readgroups = get_bam_readgroups(bam)
            if readgroups is None:
                readgroups = [bam_basename]
            else:
                readgroups = [rg['ID'] for rg in readgroups]
            for readgroup in readgroups:
                counter_by_rg[readgroup] = new_counter()

            for read in bam:
                rg = get_rg_from_alignedread(read)
                if rg is None:
                    rg = bam_basename
                # the fallback group, or a group missing from the header,
                # has no counter yet
                if rg not in counter_by_rg:
                    counter_by_rg[rg] = new_counter()
                counter = counter_by_rg[rg]
                if do_mapqx and read.mapq >= mapqx:
                    counter['bigger_mapqx'] += 1
                if read.is_unmapped:
                    counter['unmapped'] += 1
                else:
                    counter['mapped'] += 1
        finally:
            bam.close()
    return counter_by_rg
Exemple #12
0
    def test_distribution(self):
        """It tests the histogram (distribution) calculation."""

        ints_counter = self.create_test_counter()
        distrib = ints_counter.calculate_distribution(bins=10,
                                                      outlier_threshold=5)

        # Plain int literals: they compare equal to longs on Python 2 and,
        # unlike the old ``7L`` form, are valid syntax on Python 3.
        assert distrib['counts'] == [7, 13, 7, 10, 7, 22, 6, 4, 5, 5]
        assert distrib['bin_limits'] == [
            110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190
        ]
        assert 'average' in str(ints_counter)

        ints_counter = IntCounter({0: 2, 1: 1, 3: 1})
        result = [2, 1, 1]
        assert ints_counter.calculate_distribution(bins=3)['counts'] == result
Exemple #13
0
    def test_histogram_plotter(self):
        """HistogramPlotter creates one axes per counter it is given."""
        values = [1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4]
        counters = [IntCounter(values)]
        plotter = HistogramPlotter(counters)
        assert len(plotter.axes) == len(counters)
        png_fhand = NamedTemporaryFile(suffix='.png')
        plotter.write_figure(png_fhand)

        # Add more intcounters
        counters.extend(IntCounter(values) for _ in range(2))
        labels = ['1', '2', '3']
        plotter = HistogramPlotter(counters, distrib_labels=labels)
        assert len(plotter.axes) == len(counters)
    def test_distribution(self):
        """It tests the histogram (distribution) calculation."""

        ints_counter = self.create_test_counter()
        distrib = ints_counter.calculate_distribution(bins=10,
                                                      outlier_threshold=5)

        # The former long literals (``7L``) do not parse on Python 3; plain
        # ints are equal to them on Python 2, so the check is unchanged.
        expected_counts = [7, 13, 7, 10, 7, 22, 6, 4, 5, 5]
        assert distrib['counts'] == expected_counts
        assert distrib['bin_limits'] == [110, 118, 126, 134, 142, 150, 158,
                                         166, 174, 182, 190]
        assert 'average' in str(ints_counter)

        ints_counter = IntCounter({0: 2, 1: 1, 3: 1})
        result = [2, 1, 1]
        assert ints_counter.calculate_distribution(bins=3)['counts'] == result
Exemple #15
0
    def test_stats_functs(self):
        """It tests the statistical attributes of the IntCounter class."""
        ints = IntCounter({3: 1, 5: 1, 7: 2, 38: 1})
        assert ints.median == 7

        ints = IntCounter({3: 1, 5: 1, 7: 1, 38: 1})
        assert ints.median == 6

        # median with two middle numbers
        ext_array = IntCounter({3: 1, 5: 1, 7: 2})
        assert ext_array.median == 6

        ints = IntCounter({34: 1, 43: 1, 81: 1, 106: 2, 115: 1})
        # abs() is required: without it any average below the expected one
        # would satisfy the comparison
        assert abs(ints.average - 80.83) < 0.01

        ext_counter = self.create_test_counter()
        assert ext_counter.median == 145
        assert round(ext_counter.average, 2) == 145.15

        assert ext_counter.sum == 13354
        assert ext_counter.count == 92
        assert round(ext_counter.variance, 2) == 557.43

        ints = IntCounter({3: 1, 4: 2, 5: 1, 6: 1, 8: 2})
        assert ints.median == 5
        assert ints.quartiles == (4, 5, 8)

        ints = IntCounter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1})
        assert ints.quartiles == (1.5, 3, 4.5)

        ints = IntCounter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1})
        assert ints.quartiles == (1.5, 3.5, 5.5)

        ints = IntCounter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1})
        assert ints.quartiles == (2, 4, 6)

        ints = IntCounter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1})
        assert ints.quartiles == (2.5, 4.5, 6.5)
        assert ints.irq == 4.0
        assert ints.outlier_limits == (-3, 12)

        # asking for the quartiles of three values is expected to raise
        try:
            ints = IntCounter({0: 1, 1: 1, 2: 1})
            assert ints.quartiles
            self.fail('RuntimeError')
        except RuntimeError:
            pass
Exemple #16
0
 def __init__(self, bam_fpaths, coverage_threshold, window, min_mapq=None):
     """Store the settings and prepare one empty score counter per sample.

     The samples are the ones reported by the BamCoverages2 object built
     from bam_fpaths, min_mapq and window.
     """
     self._coverage_threshold = coverage_threshold
     self._window = window
     self._min_mapq = min_mapq
     coverages = BamCoverages2(bam_fpaths, min_mapq=min_mapq, window=window)
     self._bam_coverage = coverages
     scores = {}
     for samp in coverages.samples:
         scores[samp] = IntCounter()
     self._scores = scores
Exemple #17
0
    def __call__(self, snv):
        """Count the genotype qualities of the SNV calls and filter it.

        Returns whatever snv.remove_gt_from_low_qual_calls returns for the
        configured minimum quality.
        """
        if self._first_snv:
            # the first SNV is used to create one counter per sample
            for call in snv.calls:
                self._scores[call.sample] = IntCounter()
            self._first_snv = False

        for call in snv.calls:
            gt_qual = call.gt_qual
            if gt_qual is None:
                continue
            sample_scores = self._scores[call.sample]
            sample_scores[int(gt_qual)] += 1
        return snv.remove_gt_from_low_qual_calls(min_qual=self._min_qual)
Exemple #18
0
    def create_test_counter():
        """Build the IntCounter fixture from a stem-and-leaf style table.

        Every key is joined with each character of its value to form one
        integer, e.g. '10' and '288' give 102, 108 and 108.
        """
        stems_and_leaves = {
            '9': '5', '10': '288', '11': '002556688', '12': '00012355555',
            '13': '0000013555688', '14': '00002555558',
            '15': '0000000000355555555557', '16': '000045', '17': '000055',
            '18': '0005', '19': '00005', '21': '5'}

        counter = IntCounter()
        numbers = (int(stem + leaf)
                   for stem, leaves in stems_and_leaves.items()
                   for leaf in leaves)
        for number in numbers:
            counter[number] += 1
        return counter
Exemple #19
0
    def _calculate_complete_coverage_distrib(self, region):
        """Count, per sample, how many pileup columns have each coverage.

        region -- (chrom, start, end) tuple, or None to pile up everything

        Returns a dict of IntCounters indexed by sample, plus one extra
        counter stored under the key None.
        """
        if region is None:
            chrom, start, end = None, None, None
        else:
            chrom, start, end = region

        min_mapq = self.min_mapq
        covs = {sample: IntCounter() for sample in self.samples}
        covs[None] = IntCounter()
        for bam in self._bams:
            columns = bam['bam'].pileup(reference=chrom, start=start, end=end,
                                        stepper=self.bam_pileup_stepper,
                                        truncate=True)
            one_sample, sample = self._if_one_sample_get_it(bam)
            for column in columns:
                col_counts = self._count_reads_in_column(column, min_mapq,
                                                         one_sample, sample)
                # A distinct loop name is required here: reusing ``sample``
                # would overwrite the per-BAM sample that is passed to
                # _count_reads_in_column for the following columns.
                for col_sample, sample_cov in col_counts.items():
                    covs[col_sample][sample_cov] += 1
        return covs
Exemple #20
0
def get_genome_coverage(bam_fhands):
    """Return the genome-wide coverage histogram of several BAM files.

    bam_fhands -- iterable of objects whose ``name`` attribute is a BAM path

    Runs ``bedtools genomecov -ibam`` on every file and adds up the lines
    that start with 'genome': the second column is used as the coverage and
    the third as the amount added to it. Returns an IntCounter.
    """
    coverage_hist = IntCounter()
    for bam_fhand in bam_fhands:
        bam_fpath = bam_fhand.name
        cmd = [get_binary_path('bedtools'), 'genomecov', '-ibam', bam_fpath]
        # universal_newlines makes the pipe yield text, so the str
        # comparisons below also work when the pipe would yield bytes
        cover_process = Popen(cmd, stdout=PIPE, universal_newlines=True)
        try:
            for line in cover_process.stdout:
                if line.startswith('genome'):
                    cov, value = line.split('\t')[1:3]
                    coverage_hist[int(cov)] += int(value)
        finally:
            # release the pipe and reap the child so no zombie is left
            cover_process.stdout.close()
            cover_process.wait()
        # NOTE(review): a non-zero bedtools exit status is still ignored, as
        # it was before -- decide whether it should raise.
    return coverage_hist
Exemple #21
0
    def test_draw_histogram_in_axes(self):
        """Draw a line histogram into axes, without and with y limits."""

        def plot_to_png(values, **plot_options):
            """Draw the distribution of values and save it as a PNG."""
            fhand = NamedTemporaryFile(suffix='.png')
            counter = IntCounter(values)
            distrib = counter.calculate_distribution()
            axes, canvas = draw_histogram_in_axes(distrib['counts'],
                                                  distrib['bin_limits'],
                                                  kind=LINE,
                                                  distrib_label='test',
                                                  **plot_options)
            axes.legend()
            canvas.print_figure(fhand, format='png')
            fhand.flush()

        plot_to_png([1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4])

        # ylimit test
        plot_to_png([1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4, 0, 5, 4, 4, 4, 4,
                     4],
                    ylimits=(None, 4))
Exemple #22
0
def calculate_distance_distribution(interleave_fhand,
                                    index_fpath,
                                    max_clipping,
                                    max_distance=None,
                                    tempdir=None,
                                    threads=None):
    """Map interleaved reads and count the mate distances by orientation.

    interleave_fhand -- object whose ``name`` is the path of the reads file
    index_fpath -- index handed to map_with_bwamem
    max_clipping -- forwarded to _get_totally_mapped_alignments
    max_distance -- when given, only distances strictly below it are counted
    tempdir, threads -- forwarded to the mapping and sorting helpers

    Returns a dict with the keys 'outies', 'innies' and 'others'; each value
    is an IntCounter indexed by distance.
    """
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    bwa = map_with_bwamem(index_fpath,
                          interleave_fpath=interleave_fhand.name,
                          extra_params=['-a', '-M'],
                          threads=threads)
    # the mapping output is sorted by query name into the temporary BAM
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    bamfile = AlignmentFile(bam_fhand.name)

    outies, innies, others = IntCounter(), IntCounter(), IntCounter()
    for grouped_mates in _group_alignments_reads_by_qname(bamfile):
        mates = _split_mates(grouped_mates)
        for read1 in _get_totally_mapped_alignments(mates[0], max_clipping):
            for read2 in _get_totally_mapped_alignments(mates[1],
                                                        max_clipping):
                if read1.rname != read2.rname:
                    continue
                pair = [read1, read2]
                distance = _find_distance(pair)
                if _mates_are_outies(pair):
                    counter = outies
                elif _mates_are_innies(pair):
                    counter = innies
                else:
                    counter = others
                if max_distance is None or max_distance > distance:
                    counter[distance] += 1
    return {'outies': outies, 'innies': innies, 'others': others}
Exemple #23
0
 def calculate_coverage_distrib_in_region(self, region=None):
     """Return one IntCounter of rounded coverages per sample.

     The sample of each read group is read from the read group field named
     by self.bam_rg_field_for_vcf_sample.
     """
     counts = {}
     for bam in self._bams:
         one_rg = len(bam['rgs']) < 2
         for read_group in bam['rgs']:
             sample = read_group[self.bam_rg_field_for_vcf_sample]
             try:
                 sample_counts = counts[sample]
             except KeyError:
                 sample_counts = counts[sample] = IntCounter()
             coverages = self._get_coverages_in_bam_rg_win(
                 region, bam, read_group, one_rg)
             for cov in coverages:
                 sample_counts[int(round(cov))] += 1
     return counts
Exemple #24
0
 def _get_sample_counter(self, kind, sample=None, gt_broud_type=None):
     """Return the counter of one sample, or all the samples added up.

     kind -- key of the counter family in self._sample_counters
     sample -- when given, only the counter of that sample is returned
     gt_broud_type -- when given, it selects the sub-counter stored under
                      that key inside each sample entry
     """
     counters = self._sample_counters[kind]
     use_subtype = gt_broud_type is not None

     if sample is not None:
         selected = counters[sample]
         return selected[gt_broud_type] if use_subtype else selected

     merged = IntCounter()
     for per_sample in counters.values():
         merged += per_sample[gt_broud_type] if use_subtype else per_sample
     return merged
Exemple #25
0
    def _count_reads(self):
        """Compute the per-reference RPKMs and the reference length counts.

        Stores the results in self._rpkms, self._most_abundant_refs and
        self._lengths.

        Raises ValueError when the BAMs differ in their number of references
        and RuntimeError when the references are not in the same order.
        """
        nreferences = self._bams[0].nreferences
        rpks = zeros(nreferences)
        references = []
        length_counts = IntCounter()

        first_bam = True
        n_reads = 0
        for bam in self._bams:
            if bam.nreferences != nreferences:
                msg = 'BAM files should have the same references'
                raise ValueError(msg)
            for index, count in enumerate(get_reference_counts(bam.filename)):
                n_reads += count['unmapped_reads'] + count['mapped_reads']
                if count['reference'] is None:
                    # some non-mapped reads have reference = None
                    continue
                # Float division on purpose: with integer division the
                # length in kb is truncated and is 0 for references shorter
                # than 1000 bp, which makes the next line divide by zero.
                kb_len = count['length'] / 1000.0
                rpk = count['mapped_reads'] / kb_len
                rpks[index] += rpk
                if first_bam:
                    # The reference names and lengths are taken from the
                    # first BAM only
                    references.append(count['reference'])
                    length_counts[count['length']] += 1
                else:
                    # the bams should be sorted with the references in the same
                    # order
                    if references[index] != count['reference']:
                        msg = 'The reference lengths do not match in the bams'
                        raise RuntimeError(msg)
            first_bam = False

        million_reads = n_reads / 1e6
        rpks /= million_reads  # rpkms
        self._rpkms = ArrayWrapper(rpks, bins=self._bins)

        abundant_refs = BestItemsKeeper(self._n_most_expressed_reads,
                                        izip(references, rpks),
                                        key=itemgetter(1))
        abundant_refs = [{
            'reference': i[0],
            'rpkm': i[1]
        } for i in abundant_refs]
        self._most_abundant_refs = abundant_refs

        self._lengths = length_counts
Exemple #26
0
 def test_value_for_index_test(self):
     """We can get the integer stored at a given index of the counter."""
     # pylint: disable=W0212
     ints = IntCounter({3: 1, 5: 1, 7: 2, 38: 1})
     value_at = ints._get_value_for_index
     assert value_at(0) == 3
     assert value_at(1) == 5
     # 7 was counted twice, so it fills two indexes
     assert value_at(2) == 7
     assert value_at(3) == 7
     assert value_at(4) == 38

     index_error_raised = False
     try:
         assert value_at(5) == 38
     except IndexError:
         index_error_raised = True
     if not index_error_raised:
         self.fail('IndexError expected')
Exemple #27
0
    def tests_draw_histograms(self):
        """Draw three counters, two per chart, without and with y limits."""
        fhand = NamedTemporaryFile(suffix='.png')

        values = [1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4]
        counters = [IntCounter(values) for _ in range(3)]
        draw_histograms(counters, fhand, titles=['t1', 't2', 't3'],
                        plots_per_chart=2)

        # same drawing with some extra values and explicit y limits
        values = [1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4, 0, 5, 4, 4, 4, 4, 4]
        counters = [IntCounter(values) for _ in range(3)]
        options = {'titles': ['t1', 't2', 't3'],
                   'plots_per_chart': 2,
                   'ylimits': (0, 14)}
        draw_histograms(counters, fhand, **options)
Exemple #28
0
    def __init__(self,
                 vcf_fpath,
                 gq_threshold=None,
                 dp_threshold=100,
                 min_calls_for_pop_stats=DEF_MIN_CALLS_FOR_POP_STATS,
                 remarkable_coverages=None,
                 window_size=WINDOWS_SIZE):
        """Open the VCF, create every counter and run the calculation.

        vcf_fpath -- path of the VCF file
        gq_threshold -- genotype quality threshold, None is stored as 0
        dp_threshold -- depth threshold, stored as is
        min_calls_for_pop_stats -- forwarded to VCFReader
        remarkable_coverages -- depths that get their own coincidence
                                counter; REMARKABLE_DEPTHS when None
        window_size -- stored as is
        """
        # Previously only the None case bound remarkable_depths, so passing
        # any value for remarkable_coverages raised UnboundLocalError.
        if remarkable_coverages is None:
            remarkable_depths = REMARKABLE_DEPTHS
        else:
            remarkable_depths = remarkable_coverages
        self.remarkable_depths = remarkable_depths

        self._reader = VCFReader(
            open(vcf_fpath), min_calls_for_pop_stats=min_calls_for_pop_stats)

        self._random_reader = pyvcfReader(filename=vcf_fpath)

        self.window_size = window_size
        self._gq_threshold = 0 if gq_threshold is None else gq_threshold

        self.dp_threshold = dp_threshold
        self._gt_qual_depth_counter = {HOM: IntBoxplot(), HET: IntBoxplot()}
        self._ac2d = _AlleleCounts2D()

        # one counter for depth 1 plus one per remarkable depth
        self.sample_dp_coincidence = {1: IntCounter()}
        for cov in remarkable_depths:
            self.sample_dp_coincidence[cov] = IntCounter()

        self.called_snvs = 0
        self.called_gts = IntCounter()

        # sample_counter
        self._sample_counters = {}

        for counter_name in SAMPLE_COUNTERS:
            per_sample = self._sample_counters.setdefault(counter_name, {})
            for sample in self._reader.samples:
                if counter_name in (GT_DEPTHS, GT_QUALS):
                    # these two are split into HOM and HET counters
                    counters = {HOM: IntCounter(), HET: IntCounter()}
                else:
                    counters = IntCounter()
                per_sample[sample] = counters

        self._snv_counters = {
            MAFS: IntCounter(),
            MACS: IntCounter(),
            MAFS_DP: IntCounter(),
            SNV_QUALS: IntCounter(),
            HET_IN_SNP: IntCounter(),
            SNV_DENSITY: IntCounter(),
            INBREED_F_IN_SNP: IntCounter(),
            DEPTHS: IntCounter()
        }
        self._calculate()
    def test_sum_with_treshold_function():
        """It tests the function that calculates Q30 and Q20."""

        quals = IntCounter({15: 10, 21: 13, 30: 12})
        # (threshold, number of items greater than or equal to it)
        for threshold, expected in ((20, 25), (30, 12)):
            found = quals.count_relative_to_value(threshold, operator.ge)
            assert found == expected
Exemple #30
0
def show_distances_distributions(bam_fpath, n=None, kind_of_interest=None):
    """Classify the read pairs of a BAM file and print a summary.

    bam_fpath -- path of the BAM file
    n -- when given, only the first n groups of alignments are examined
    kind_of_interest -- container with the kinds ('4', '2b', '3b', '2a',
                        '3a', '5') whose distance histogram is printed;
                        with None (the default) only the counts are printed

    Nothing is returned, the results are printed.
    """
    bamfile = pysam.Samfile(bam_fpath)
    type1 = 0
    type2a = []
    type2b = []
    type3a = []
    type3b = []
    type4 = []
    type5 = []
    type6 = 0
    others = 0
    h = 0

    # It tries to find out the kind of each pair of sequences
    for grouped_mates in _group_alignments_by_reads(bamfile):
        if n is not None and h == n:
            break
        h += 1
        mates_alignments = _split_mates(grouped_mates)
        i = 0
        pair = []
        for alignments_group in mates_alignments:
            i += 1
            mate = _get_mate(i, mates_alignments)
            primary_mate = _get_primary_alignment(mate)
            primary_alignment = _get_primary_alignment(alignments_group)
            mates = [primary_alignment, primary_mate]
            if _read_is_totally_mapped(alignments_group, 0.05):
                if primary_alignment.mate_is_unmapped:
                    kind = '1'
                else:
                    if primary_alignment.rname != primary_alignment.rnext:
                        kind = '6'
                    else:
                        if _mates_are_outies(mates):
                            kind = '3a'
                        elif _mates_are_innies(mates):
                            kind = '2a'
                        else:
                            kind = '5'
            else:
                fragment = _find_secondary_fragment(alignments_group, 5, 100)
                if fragment is not None:
                    fragments = [primary_alignment, fragment]
                    if (_alignments_in_same_ref([fragments[0], primary_mate])
                            or _alignments_in_same_ref([fragments[1],
                                                        primary_mate])):
                        kind = '4'
                    else:
                        kind = 'other'
                else:
                    if primary_alignment.is_unmapped:
                        kind = '1'
                    else:
                        if primary_alignment.rname == primary_alignment.rnext:
                            if _mates_are_outies(mates):
                                kind = '3b'
                            elif _mates_are_innies(mates):
                                kind = '2b'
                            else:
                                kind = '5'
                        else:
                            kind = '6'
            pair.append(kind)
        # The pair gets the first kind found in this order of precedence
        if '1' in pair:
            type1 += 1
        elif '6' in pair:
            type6 += 1
        elif 'other' in pair:
            others += 1
        else:
            # NOTE(review): ``mates`` is the value left by the last
            # iteration of the loop above -- confirm that is intended.
            distance = _find_distance(mates)
            if '4' in pair:
                type4.append(distance)
            elif '2b' in pair:
                type2b.append(distance)
            elif '3b' in pair:
                type3b.append(distance)
            elif '2a' in pair:
                type2a.append(distance)
            elif '3a' in pair:
                type3a.append(distance)
            elif '5' in pair:
                type5.append(distance)
    stats1 = {'1': type1, '6': type6, 'other': others}
    stats2 = {'4': type4, '2b': type2b, '3b': type3b, '2a': type2a,
              '3a': type3a, '5': type5}
    # print() is always called with one preformatted string, so the output
    # is the same whether print is a statement or a function
    for key in stats1:
        print('%s %s' % (key.ljust(5), stats1[key]))
    for key in stats2:
        print('%s %s' % (key.ljust(5), len(stats2[key])))
    if kind_of_interest is None:
        # No kinds requested: the membership test below would raise a
        # TypeError on None, so the histograms are skipped.
        return
    for key in stats2:
        if key in kind_of_interest:
            print('%s distance distribution' % key)
            counter = IntCounter(iter(stats2[key]))
            distribution = counter.calculate_distribution(remove_outliers=True)
            counts = distribution['counts']
            bin_limits = distribution['bin_limits']
            print(draw_histogram(bin_limits, counts))
Exemple #31
0
    def test_sum_with_treshold_function():
        """It tests the function that calculates Q30 and Q20."""

        quals = IntCounter({15: 10, 21: 13, 30: 12})
        at_least = operator.ge
        q20 = quals.count_relative_to_value(20, at_least)
        assert q20 == 25
        q30 = quals.count_relative_to_value(30, at_least)
        assert q30 == 12
Exemple #32
0
 def __init__(self, bam_fhands, mapqs=MAPQS_TO_CALCULATE):
     """Store the BAM handles, create one counter per mapq and calculate."""
     self._bam_fhands = bam_fhands
     self.mapqs_to_calculate = mapqs
     counters = {}
     for mapq in mapqs:
         counters[mapq] = IntCounter()
     self._counters = counters
     # the counters have to exist before the calculation starts
     self._calculate()
Exemple #33
0
 def __init__(self, bams):
     """Store the BAMs, create the empty tallies and count the mapqs."""
     # TODO flag, read_group
     self._flag_counts = {}
     self._mapqs = IntCounter()
     self._bams = bams
     # every attribute above is set before the counting starts
     self._count_mapqs()
Exemple #34
0
 def test_n50_calculation():
     """It calculates N50 and N95, and None for an empty counter."""
     lengths = [2, 2, 2, 3, 3, 4, 8, 8]
     assert calculate_nx(IntCounter(lengths), 50) == 8
     assert calculate_nx(IntCounter(lengths), 95) == 2
     # all the lengths are equal
     assert calculate_nx(IntCounter([8] * 8), 50) == 8
     # nothing to calculate with
     assert calculate_nx(IntCounter(), 50) is None
Exemple #35
0
 def test_draw_histogram_in_fhand(self):
     """Draw a histogram straight from an IntCounter into a PNG temp file."""
     png_fhand = NamedTemporaryFile(suffix='.png')
     counter = IntCounter([1, 2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 1, 4])
     draw_histogram_in_fhand(counter, fhand=png_fhand)