def test_read_evidence_variant_matching_gatk_mini_bundle_extract():
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    loci = [
        Locus.from_inclusive_coordinates("20", 10008951),            # 0
        Locus.from_inclusive_coordinates("20", 10009053),            # 1
        Locus.from_inclusive_coordinates("20", 10009053, 10009054),  # 2
        Locus.from_inclusive_coordinates("20", 10006822),            # 3
        Locus.from_inclusive_coordinates("20", 10006822, 10006823),  # 4
    ]
    evidence = PileupCollection.from_bam(handle, loci)

    eq_(evidence.match_summary(Variant(loci[0], "A", "C")),
        [('A', 1), ('C', 4)])
    eq_(
        evidence.filter(drop_duplicates=True).match_summary(
            Variant(loci[0], "A", "C")),
        [('A', 0), ('C', 3)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "C")),
        [('A', 3), ('C', 0)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "CC")),
        [('A', 3), ('CC', 0)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "")),
        [('A', 3), ('', 0)])
    eq_(evidence.match_summary(Variant(loci[2], "AT", "")),
        [('AT', 3), ('', 0)])
    eq_(evidence.match_summary(Variant(loci[3], "A", "")),
        [('A', 2), ('', 6)])
    eq_(evidence.match_summary(Variant(loci[4], "AC", "")),
        [('AC', 2), ('', 6)])
    eq_(
        evidence.match_summary(
            Variant(loci[4], "AC", ""),
            lambda e: e.read_attributes().mapping_quality.mean()),
        [('AC', 60.0), ('', 65.0)])
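# The test above constructs Variant(locus, ref, alt) values, but the actual
# definition lives outside this extract. A minimal hypothetical stand-in,
# assuming match_summary() only needs positional locus/ref/alt fields (this
# namedtuple is a sketch, not the library's class):
import collections

Variant = collections.namedtuple("Variant", ["locus", "ref", "alt"])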
def test_read_evidence_variant_matching_gatk_bundle_native_varcode_variant():
    # Try native varcode Variant.
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    locus = Locus.from_inclusive_coordinates("20", 10008951)
    variant = VarcodeVariant(
        locus.contig,
        locus.position + 1,  # inclusive not interbase
        "A",
        "C")
    evidence = PileupCollection.from_bam(handle, [variant])
    eq_(evidence.match_summary(variant), [('A', 1), ('C', 4)])
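# A minimal sketch of the coordinate convention behind the `locus.position
# + 1` adjustment above: inclusive 1-based coordinates map to 0-based
# half-open ("interbase") coordinates by shifting the start down by one.
# The helper below is hypothetical and independent of the library.
def inclusive_to_interbase(start, end=None):
    """Convert inclusive 1-based [start, end] to interbase [start - 1, end)."""
    if end is None:
        end = start
    return (start - 1, end)

assert inclusive_to_interbase(10008951) == (10008950, 10008951)
assert inclusive_to_interbase(10009053, 10009054) == (10009052, 10009054)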
def from_bam(pysam_samfile, loci):
    """
    Create a PileupCollection for a set of loci from a BAM file.

    Parameters
    ----------
    pysam_samfile : `pysam.csamfile.Samfile` instance, or filename string
        to a BAM file. The BAM file must be indexed.

    loci : list of Locus instances
        Loci to collect pileups for.

    Returns
    ----------
    PileupCollection instance containing pileups for the specified loci.
    All alignments in the BAM file are included (e.g. duplicate reads,
    secondary alignments, etc.). See `PileupCollection.filter` if these
    need to be removed.
    """
    loci = [to_locus(obj) for obj in loci]

    close_on_completion = False
    if typechecks.is_string(pysam_samfile):
        pysam_samfile = Samfile(pysam_samfile)
        close_on_completion = True

    try:
        # Map from pyensembl normalized chromosome names used in Variant to
        # the names used in the BAM file.
        chromosome_name_map = {}
        for name in pysam_samfile.references:
            normalized = pyensembl.locus.normalize_chromosome(name)
            chromosome_name_map[normalized] = name

        result = PileupCollection({})

        # Optimization: we sort variants so our BAM reads are localized.
        locus_iterator = itertools.chain.from_iterable(
            (Locus.from_interbase_coordinates(locus_interval.contig, pos)
                for pos in locus_interval.positions)
            for locus_interval in sorted(loci))
        for locus in locus_iterator:
            result.pileups[locus] = Pileup(locus, [])
            try:
                chromosome = chromosome_name_map[locus.contig]
            except KeyError:
                logging.warn("No such contig in bam: %s" % locus.contig)
                continue
            columns = pysam_samfile.pileup(
                chromosome,
                locus.position,
                locus.position + 1,  # exclusive, 0-indexed
                truncate=True,
                stepper="nofilter")
            try:
                column = next(columns)
            except StopIteration:
                # No reads align to this locus.
                continue

            # Note that storing the pileups here is necessary, since the
            # subsequent assertion will invalidate our column.
            pileups = column.pileups
            assert list(columns) == []  # column is invalid after this.
            for pileup_read in pileups:
                if not pileup_read.is_refskip:
                    element = PileupElement.from_pysam_alignment(
                        locus, pileup_read)
                    result.pileups[locus].append(element)
        return result
    finally:
        if close_on_completion:
            pysam_samfile.close()
def from_bam(pysam_samfile, loci, normalized_contig_names=True):
    '''
    Create a PileupCollection for a set of loci from a BAM file.

    Parameters
    ----------
    pysam_samfile : `pysam.csamfile.Samfile` instance, or filename string
        to a BAM file. The BAM file must be indexed.

    loci : list of Locus instances
        Loci to collect pileups for.

    normalized_contig_names : whether the contig names have been normalized
        (e.g. pyensembl removes the 'chr' prefix). Set to true to
        de-normalize the names when querying the BAM file.

    Returns
    ----------
    PileupCollection instance containing pileups for the specified loci.
    All alignments in the BAM file are included (e.g. duplicate reads,
    secondary alignments, etc.). See `PileupCollection.filter` if these
    need to be removed.
    '''
    loci = [to_locus(obj) for obj in loci]

    close_on_completion = False
    if typechecks.is_string(pysam_samfile):
        pysam_samfile = Samfile(pysam_samfile)
        close_on_completion = True

    try:
        # Map from pyensembl normalized chromosome names used in Variant to
        # the names used in the BAM file.
        if normalized_contig_names:
            chromosome_name_map = {}
            for name in pysam_samfile.references:
                normalized = pyensembl.locus.normalize_chromosome(name)
                chromosome_name_map[normalized] = name
                chromosome_name_map[name] = name
        else:
            chromosome_name_map = None

        result = PileupCollection({})

        # Optimization: we sort variants so our BAM reads are localized.
        locus_iterator = itertools.chain.from_iterable(
            (Locus.from_interbase_coordinates(locus_interval.contig, pos)
                for pos in locus_interval.positions)
            for locus_interval in sorted(loci))
        for locus in locus_iterator:
            result.pileups[locus] = Pileup(locus, [])
            if normalized_contig_names:
                try:
                    chromosome = chromosome_name_map[locus.contig]
                except KeyError:
                    logging.warn("No such contig in bam: %s" % locus.contig)
                    continue
            else:
                chromosome = locus.contig
            columns = pysam_samfile.pileup(
                chromosome,
                locus.position,
                locus.position + 1,  # exclusive, 0-indexed
                truncate=True,
                stepper="nofilter")
            try:
                column = next(columns)
            except StopIteration:
                # No reads align to this locus.
                continue

            # Note that storing the pileups here is necessary, since the
            # subsequent assertion will invalidate our column.
            pileups = column.pileups
            assert list(columns) == []  # column is invalid after this.
            for pileup_read in pileups:
                if not pileup_read.is_refskip:
                    element = PileupElement.from_pysam_alignment(
                        locus, pileup_read)
                    result.pileups[locus].append(element)
        return result
    finally:
        if close_on_completion:
            pysam_samfile.close()
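# Hypothetical usage sketch for from_bam (the file name and locus are
# invented for illustration): query a BAM whose reference names keep the
# "chr" prefix, passing normalized_contig_names=False so contigs are taken
# verbatim from the loci. This assumes Locus preserves the contig name it
# is given.
if __name__ == "__main__":
    loci = [Locus.from_inclusive_coordinates("chr20", 10008951)]
    collection = from_bam(
        "example.bam",  # assumed to exist and be indexed
        loci,
        normalized_contig_names=False)
    for locus, pileup in collection.pileups.items():
        print(locus, pileup)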
def test_read_evidence_gatk_mini_bundle_extract():
    loci = [
        Locus.from_inclusive_coordinates("20", 9999996, 9999996),    # 0
        Locus.from_inclusive_coordinates("20", 10260442),            # 1
        Locus.from_inclusive_coordinates("20", 10006823),            # 2
        Locus.from_inclusive_coordinates("20", 10006819, 10006823),  # 3
        Locus.from_inclusive_coordinates("20", 10006819, 10006825),  # 4
        Locus.from_inclusive_coordinates("20", 10006822, 10006827),  # 5
        Locus.from_inclusive_coordinates("20", 10007175),            # 6
        Locus.from_inclusive_coordinates("20", 10007174, 10007176),  # 7
        Locus.from_inclusive_coordinates("20", 1, 3),                # 8
        Locus.from_inclusive_coordinates("20", 10008796),            # 9
        Locus.from_inclusive_coordinates("20", 10008921),            # 10
    ]
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    evidence = PileupCollection.from_bam(handle, loci)

    eq_(evidence.allele_summary(loci[0]), [("ACT", 9)])
    eq_(
        evidence.filter(drop_duplicates=True).allele_summary(loci[0]),
        [("ACT", 8)])
    eq_(evidence.allele_summary(loci[1]), [("T", 7)])
    eq_(evidence.filter().allele_summary(loci[2]), [("", 6), ("C", 2)])
    eq_(
        evidence.filter(
            drop_duplicates=True,
            min_base_quality=50).allele_summary(loci[2]),
        [])
    eq_(
        evidence.filter(drop_duplicates=True).allele_summary(loci[2]),
        [("", 5), ("C", 1)])
    eq_(
        evidence.filter(
            drop_duplicates=True,
            min_mapping_quality=60).allele_summary(loci[2]),
        [("", 5), ("C", 1)])
    eq_(
        evidence.filter(
            drop_duplicates=True,
            min_mapping_quality=61).allele_summary(loci[2]),
        [("", 2)])
    eq_(
        evidence.filter(
            drop_duplicates=True,
            min_mapping_quality=61).allele_summary(loci[3]),
        [("A", 2)])
    eq_(
        evidence.filter(
            drop_duplicates=True,
            min_mapping_quality=61).allele_summary(loci[4]),
        [("AAA", 2)])
    eq_(
        evidence.filter(
            drop_duplicates=True,
            min_mapping_quality=61).allele_summary(loci[5]),
        [("AAAC", 2)])
    eq_(evidence.filter().allele_summary(loci[6]), [("T", 5), ("C", 3)])
    eq_(
        evidence.filter(min_base_quality=30).allele_summary(loci[6]),
        [("T", 4), ("C", 3)])
    eq_(evidence.filter().allele_summary(loci[7]), [("CTT", 5), ("CCT", 3)])
    eq_(
        evidence.filter(min_base_quality=30).allele_summary(loci[7]),
        [("CTT", 3), ("CCT", 2)])
    eq_(
        evidence.filter(min_base_quality=32).allele_summary(loci[2]),
        [("", 6), ("C", 1)])
    eq_(filtered_read_names(evidence.at(loci[2]).filter(min_base_quality=32)),
        {'20GAVAAXX100126:4:3:18352:43857'})
    eq_(evidence.allele_summary(loci[8]), [])
    eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[8]), [])
    assert_raises(
        KeyError,
        evidence.allele_summary,
        Locus.from_inclusive_coordinates("20", 10009174, 10009176))
    eq_(
        filtered_read_names(
            evidence.at(loci[9]).filter(drop_improper_mate_pairs=True)),
        {'20FUKAAXX100202:8:68:1530:49310'})
    eq_(len(evidence.at(loci[8]).read_attribute('mapping_quality')), 0)
    eq_(list(evidence.at(loci[9]).read_attribute('mapping_quality')),
        list(evidence.at(loci[9]).read_attributes().mapping_quality))
    eq_(
        evidence.filter(drop_duplicates=True).allele_summary(loci[10]),
        [('C', 2), ('CA', 1), ('CAA', 1)])
    eq_(
        evidence.filter(drop_duplicates=True).allele_summary(
            Locus.from_interbase_coordinates(
                loci[10].contig, loci[10].start, loci[10].start)),
        [('', 2), ('A', 1), ('AA', 1)])
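# filtered_read_names() is called above but defined outside this extract.
# A plausible sketch of what it does, assuming the filtered pileup exposes
# its PileupElements via `.elements` and each element carries its pysam
# alignment as `.alignment` (both attribute names are assumptions):
def filtered_read_names(pileup):
    # Collect the query names of the reads remaining after filtering.
    return set(element.alignment.query_name for element in pileup.elements)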