Esempio n. 1
0
def test_read_evidence_variant_matching_gatk_mini_bundle_extract():
    """Check `PileupCollection.match_summary` on the GATK mini bundle BAM.

    Pileups are collected for five loci on contig "20" and the ref/alt
    summaries are compared for variants with single-base, multi-base and
    empty alleles, with and without duplicate filtering, and with a custom
    scoring callable.
    """
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    try:
        loci = [
            Locus.from_inclusive_coordinates("20", 10008951),  # 0
            Locus.from_inclusive_coordinates("20", 10009053),  # 1
            Locus.from_inclusive_coordinates("20", 10009053, 10009054),  # 2
            Locus.from_inclusive_coordinates("20", 10006822),  # 3
            Locus.from_inclusive_coordinates("20", 10006822, 10006823),  # 4
        ]
        evidence = PileupCollection.from_bam(handle, loci)

        # Same variant, before and after dropping duplicate reads.
        eq_(evidence.match_summary(Variant(loci[0], "A", "C")),
            [('A', 1), ('C', 4)])
        eq_(
            evidence.filter(drop_duplicates=True).match_summary(
                Variant(loci[0], "A", "C")),
            [('A', 0), ('C', 3)])

        # Locus where only the reference allele is expected to match.
        eq_(evidence.match_summary(Variant(loci[1], "A", "C")),
            [('A', 3), ('C', 0)])
        eq_(evidence.match_summary(Variant(loci[1], "A", "CC")),
            [('A', 3), ('CC', 0)])
        eq_(evidence.match_summary(Variant(loci[1], "A", "")),
            [('A', 3), ('', 0)])
        eq_(evidence.match_summary(Variant(loci[2], "AT", "")),
            [('AT', 3), ('', 0)])

        # Loci where the empty alternate allele is expected to match.
        eq_(evidence.match_summary(Variant(loci[3], "A", "")),
            [('A', 2), ('', 6)])
        eq_(evidence.match_summary(Variant(loci[4], "AC", "")),
            [('AC', 2), ('', 6)])

        # With a second argument the summary values come from the callable
        # (here the mean mapping quality) rather than the plain counts above.
        eq_(
            evidence.match_summary(
                Variant(loci[4], "AC", ""),
                lambda e: e.read_attributes().mapping_quality.mean()),
            [('AC', 60.0), ('', 65.0)])
    finally:
        # The test opened the BAM itself, so it must also release it, even
        # when an assertion above fails.
        handle.close()
Esempio n. 2
0
def test_read_evidence_variant_matching_gatk_bundle_native_varcode_variant():
    """Check that a native varcode Variant can be used with PileupCollection.

    The same VarcodeVariant instance is passed to `from_bam` as the locus to
    collect and to `match_summary` as the variant to match.
    """
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    try:
        locus = Locus.from_inclusive_coordinates("20", 10008951)
        variant = VarcodeVariant(
            locus.contig,
            locus.position + 1,  # inclusive not interbase
            "A",
            "C")
        evidence = PileupCollection.from_bam(handle, [variant])
        eq_(evidence.match_summary(variant), [('A', 1), ('C', 4)])
    finally:
        # The test opened the BAM itself, so it must also release it, even
        # when the assertion above fails.
        handle.close()
Esempio n. 3
0
    def from_bam(pysam_samfile, loci):
        """
        Create a PileupCollection for a set of loci from a BAM file.

        Parameters
        ----------
        pysam_samfile : `pysam.csamfile.Samfile` instance, or filename string
            to a BAM file. The BAM file must be indexed. A file opened here
            from a filename is closed before returning; a Samfile instance
            supplied by the caller is left open.

        loci : list of Locus instances
            Loci to collect pileups for. Every item is passed through
            `to_locus` first, and a locus spanning several positions is
            expanded into one single-position pileup per position.

        Returns
        -------
        PileupCollection instance containing pileups for the specified loci.
        All alignments in the BAM file are included (e.g. duplicate reads,
        secondary alignments, etc.). See `PileupCollection.filter` if these
        need to be removed. Positions whose contig is not in the BAM file
        (a warning is logged), or that no read aligns to, get an empty
        pileup.
        """

        loci = [to_locus(obj) for obj in loci]

        close_on_completion = False
        if typechecks.is_string(pysam_samfile):
            pysam_samfile = Samfile(pysam_samfile)
            close_on_completion = True

        try:
            # Map from pyensembl normalized chromosome names used in Variant to
            # the names used in the BAM file.
            chromosome_name_map = {
                pyensembl.locus.normalize_chromosome(name): name
                for name in pysam_samfile.references
            }

            result = PileupCollection({})

            # Optimization: we sort variants so our BAM reads are localized.
            locus_iterator = itertools.chain.from_iterable(
                (Locus.from_interbase_coordinates(locus_interval.contig, pos)
                 for pos in locus_interval.positions)
                for locus_interval in sorted(loci)
            )
            for locus in locus_iterator:
                # Register the pileup up front so every requested position has
                # an entry, even if we bail out below.
                result.pileups[locus] = Pileup(locus, [])
                try:
                    chromosome = chromosome_name_map[locus.contig]
                except KeyError:
                    # logging.warn is a deprecated alias of logging.warning;
                    # the emitted message is unchanged.
                    logging.warning(
                        "No such contig in bam: %s", locus.contig)
                    continue
                columns = pysam_samfile.pileup(
                    chromosome,
                    locus.position,
                    locus.position + 1,  # exclusive, 0-indexed
                    truncate=True,
                    stepper="nofilter",
                )
                try:
                    column = next(columns)
                except StopIteration:
                    # No reads align to this locus.
                    continue

                # Note that storing the pileups here is necessary, since the
                # subsequent assertion will invalidate our column.
                pileups = column.pileups
                assert list(columns) == []  # column is invalid after this.
                for pileup_read in pileups:
                    if not pileup_read.is_refskip:
                        element = PileupElement.from_pysam_alignment(
                            locus, pileup_read)
                        result.pileups[locus].append(element)
            return result
        finally:
            # Only close handles this function opened itself.
            if close_on_completion:
                pysam_samfile.close()
Esempio n. 4
0
    def from_bam(pysam_samfile, loci, normalized_contig_names=True):
        '''
        Create a PileupCollection for a set of loci from a BAM file.

        Parameters
        ----------
        pysam_samfile : `pysam.csamfile.Samfile` instance, or filename string
            to a BAM file. The BAM file must be indexed. A file opened here
            from a filename is closed before returning; a Samfile instance
            supplied by the caller is left open.

        loci : list of Locus instances
            Loci to collect pileups for. Every item is passed through
            `to_locus` first, and a locus spanning several positions is
            expanded into one single-position pileup per position.

        normalized_contig_names : whether the contig names have been normalized
            (e.g. pyensembl removes the 'chr' prefix). Set to true to
            de-normalize the names when querying the BAM file. When false,
            each locus contig is passed to the BAM query unchanged.

        Returns
        -------
        PileupCollection instance containing pileups for the specified loci.
        All alignments in the BAM file are included (e.g. duplicate reads,
        secondary alignments, etc.). See `PileupCollection.filter` if these
        need to be removed. Positions that no read aligns to, or (when
        `normalized_contig_names` is true) whose contig is not found in the
        BAM file, get an empty pileup; the latter also logs a warning.
        '''

        loci = [to_locus(obj) for obj in loci]

        close_on_completion = False
        if typechecks.is_string(pysam_samfile):
            pysam_samfile = Samfile(pysam_samfile)
            close_on_completion = True

        try:
            # Map from pyensembl normalized chromosome names used in Variant to
            # the names used in the BAM file.
            if normalized_contig_names:
                chromosome_name_map = {}
                for name in pysam_samfile.references:
                    normalized = pyensembl.locus.normalize_chromosome(name)
                    chromosome_name_map[normalized] = name
                    # Also accept the BAM's own spelling of the contig name.
                    chromosome_name_map[name] = name
            else:
                chromosome_name_map = None

            result = PileupCollection({})

            # Optimization: we sort variants so our BAM reads are localized.
            locus_iterator = itertools.chain.from_iterable(
                (Locus.from_interbase_coordinates(locus_interval.contig, pos)
                 for pos in locus_interval.positions)
                for locus_interval in sorted(loci))
            for locus in locus_iterator:
                # Register the pileup up front so every requested position has
                # an entry, even if we bail out below.
                result.pileups[locus] = Pileup(locus, [])
                if normalized_contig_names:
                    try:
                        chromosome = chromosome_name_map[locus.contig]
                    except KeyError:
                        # logging.warn is a deprecated alias of
                        # logging.warning; the emitted message is unchanged.
                        logging.warning(
                            "No such contig in bam: %s", locus.contig)
                        continue
                else:
                    chromosome = locus.contig
                columns = pysam_samfile.pileup(
                    chromosome,
                    locus.position,
                    locus.position + 1,  # exclusive, 0-indexed
                    truncate=True,
                    stepper="nofilter")
                try:
                    column = next(columns)
                except StopIteration:
                    # No reads align to this locus.
                    continue

                # Note that storing the pileups here is necessary, since the
                # subsequent assertion will invalidate our column.
                pileups = column.pileups
                assert list(columns) == []  # column is invalid after this.
                for pileup_read in pileups:
                    if not pileup_read.is_refskip:
                        element = PileupElement.from_pysam_alignment(
                            locus, pileup_read)
                        result.pileups[locus].append(element)
            return result
        finally:
            # Only close handles this function opened itself.
            if close_on_completion:
                pysam_samfile.close()
Esempio n. 5
0
def test_read_evidence_gatk_mini_bundle_extract():
    """Check PileupCollection summaries and filters on the GATK mini bundle.

    Exercises `allele_summary`, `filter` (duplicates, base quality, mapping
    quality, improper mate pairs), `at`, `read_attribute` and
    `read_attributes` for single- and multi-position loci on contig "20".
    """
    loci = [
        Locus.from_inclusive_coordinates("20", 9999996, 9999996),  # 0
        Locus.from_inclusive_coordinates("20", 10260442),  # 1
        Locus.from_inclusive_coordinates("20", 10006823),  # 2
        Locus.from_inclusive_coordinates("20", 10006819, 10006823),  # 3
        Locus.from_inclusive_coordinates("20", 10006819, 10006825),  # 4
        Locus.from_inclusive_coordinates("20", 10006822, 10006827),  # 5
        Locus.from_inclusive_coordinates("20", 10007175),  # 6
        Locus.from_inclusive_coordinates("20", 10007174, 10007176),  # 7
        Locus.from_inclusive_coordinates("20", 1, 3),  # 8
        Locus.from_inclusive_coordinates("20", 10008796),  # 9
        Locus.from_inclusive_coordinates("20", 10008921),  # 10
    ]
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    try:
        evidence = PileupCollection.from_bam(handle, loci)

        eq_(evidence.allele_summary(loci[0]), [("ACT", 9)])
        eq_(
            evidence.filter(drop_duplicates=True).allele_summary(loci[0]),
            [("ACT", 8)])
        eq_(evidence.allele_summary(loci[1]), [("T", 7)])

        # Progressive filtering at a single position (loci[2]).
        eq_(evidence.filter().allele_summary(loci[2]), [("", 6), ("C", 2)])
        eq_(
            evidence.filter(drop_duplicates=True,
                            min_base_quality=50).allele_summary(loci[2]),
            [])
        eq_(
            evidence.filter(drop_duplicates=True).allele_summary(loci[2]),
            [("", 5), ("C", 1)])
        eq_(
            evidence.filter(drop_duplicates=True,
                            min_mapping_quality=60).allele_summary(loci[2]),
            [("", 5), ("C", 1)])
        eq_(
            evidence.filter(drop_duplicates=True,
                            min_mapping_quality=61).allele_summary(loci[2]),
            [("", 2)])

        # Multi-position loci overlapping the same region.
        eq_(
            evidence.filter(drop_duplicates=True,
                            min_mapping_quality=61).allele_summary(loci[3]),
            [("A", 2)])
        eq_(
            evidence.filter(drop_duplicates=True,
                            min_mapping_quality=61).allele_summary(loci[4]),
            [("AAA", 2)])
        eq_(
            evidence.filter(drop_duplicates=True,
                            min_mapping_quality=61).allele_summary(loci[5]),
            [("AAAC", 2)])

        # Base-quality filtering at a single position and its 3-base window.
        eq_(evidence.filter().allele_summary(loci[6]), [("T", 5), ("C", 3)])
        eq_(
            evidence.filter(min_base_quality=30).allele_summary(loci[6]),
            [("T", 4), ("C", 3)])
        eq_(evidence.filter().allele_summary(loci[7]),
            [("CTT", 5), ("CCT", 3)])
        eq_(
            evidence.filter(min_base_quality=30).allele_summary(loci[7]),
            [("CTT", 3), ("CCT", 2)])
        eq_(
            evidence.filter(min_base_quality=32).allele_summary(loci[2]),
            [("", 6), ("C", 1)])
        eq_(
            filtered_read_names(
                evidence.at(loci[2]).filter(min_base_quality=32)),
            {'20GAVAAXX100126:4:3:18352:43857'})

        # A requested locus with no evidence yields empty summaries, while a
        # locus that was never requested raises KeyError.
        eq_(evidence.allele_summary(loci[8]), [])
        eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[8]), [])
        assert_raises(
            KeyError, evidence.allele_summary,
            Locus.from_inclusive_coordinates("20", 10009174, 10009176))

        eq_(
            filtered_read_names(
                evidence.at(loci[9]).filter(drop_improper_mate_pairs=True)),
            {'20FUKAAXX100202:8:68:1530:49310'})

        # Both read-attribute accessors must agree.
        eq_(len(evidence.at(loci[8]).read_attribute('mapping_quality')), 0)
        eq_(list(evidence.at(loci[9]).read_attribute('mapping_quality')),
            list(evidence.at(loci[9]).read_attributes().mapping_quality))

        eq_(
            evidence.filter(drop_duplicates=True).allele_summary(loci[10]),
            [('C', 2), ('CA', 1), ('CAA', 1)])
        # Zero-width interbase locus at the start of loci[10].
        eq_(
            evidence.filter(drop_duplicates=True).allele_summary(
                Locus.from_interbase_coordinates(
                    loci[10].contig, loci[10].start, loci[10].start)),
            [('', 2), ('A', 1), ('AA', 1)])
    finally:
        # The test opened the BAM itself, so it must also release it, even
        # when an assertion above fails.
        handle.close()