def __init__(self, bed, gtf_file=None, names=False):
    """Walk every region in `bed` and accumulate summary statistics.

    bed      -- iterable of regions; each region must expose `chrom`,
                `start`, `end`, `name` and `strand` attributes
    gtf_file -- optional annotation passed to RegionTagger; when given,
                every region is also reported to the tagger
    names    -- when true, count how often each region name occurs

    Results are stored on the instance:
      total        number of regions seen
      size         sum of (end - start) over all regions
      lengths      Counts() object fed with each region length
      refs         {chrom: number of regions}
      names        {region name: occurrences} (empty unless `names`)
      regiontagger RegionTagger instance, or None without `gtf_file`
    """
    tagger = RegionTagger(gtf_file) if gtf_file else None
    self.regiontagger = tagger

    self.total = 0
    self.size = 0
    self.lengths = Counts()
    self.refs = {}
    self.names = {}

    # Aliases for the two count tables updated inside the loop.
    chrom_counts = self.refs
    name_counts = self.names

    for region in bed:
        span = region.end - region.start

        self.total += 1
        self.size += span
        self.lengths.add(span)

        if names:
            name_counts[region.name] = name_counts.get(region.name, 0) + 1

        chrom_counts[region.chrom] = chrom_counts.get(region.chrom, 0) + 1

        if tagger:
            tagger.add_region(region.chrom, region.start, region.end, region.strand)
def __init__(self, bamfile, gtf=None, region=None, delim=None, tags=None, show_all=False):
    """Scan a BAM file (or a single region of it) and collect read statistics.

    bamfile  -- open alignment file; must provide `references`, `fetch()`
                and `getrname()`
    gtf      -- optional annotation passed to RegionTagger; when given,
                every counted mapped read is reported to the tagger
    region   -- optional 1-based region string, either "ref:start-end" or
                "ref:pos"; when omitted the whole file is read via bam_iter
    delim    -- when set, reference names are cut at the first `delim`
                before being used as keys in `refs`
    tags     -- iterable of tag names; one FeatureBin is created per tag
                and fed every counted mapped read (default: no tags)
    show_all -- when false, the non-first reads of a pair are skipped

    Results are stored on the instance: total, mapped, unmapped,
    flag_counts, tagbins, refs, regiontagger and tlen_counts.

    A KeyboardInterrupt during the scan is caught so that the statistics
    gathered so far are still stored.
    """
    regiontagger = None
    flag_counts = FlagCounts()

    ref = None
    start = None
    end = None

    if gtf:
        regiontagger = RegionTagger(gtf, bamfile.references, only_first_fragment=True)

    if region:
        # Region strings are 1-based inclusive; fetch() takes a 0-based start.
        ref, startend = region.rsplit(':', 1)
        if '-' in startend:
            start, end = [int(x) for x in startend.split('-')]
            start = start - 1
            sys.stderr.write('Region: %s:%s-%s\n' % (ref, start + 1, end))
        else:
            start = int(startend) - 1
            end = int(startend)
            sys.stderr.write('Region: %s:%s\n' % (ref, start + 1))

    total = 0
    mapped = 0
    unmapped = 0

    tlen_counts = {}
    names = set()  # qnames of multi-mapped reads that were already counted
    refs = {}

    tagbins = {}
    for tag in tags or ():
        tagbins[tag] = FeatureBin(tag)

    # Pre-seed every reference with a zero count so it is always present.
    for rname in bamfile.references:
        if delim:
            refs[rname.split(delim)[0]] = 0
        else:
            refs[rname] = 0

    # Stop probing for a tag once a mapped read is seen without it.
    has_ih = True
    has_nh = True

    try:
        # setup region or whole-file reader
        if region:
            reads = bamfile.fetch(ref, start, end)
        else:
            reads = bam_iter(bamfile)

        for read in reads:
            if not show_all and read.is_paired and not read.is_read1:
                # only operate on the first fragment
                continue

            # Work out multi-mapping status from IH and/or NH first and
            # de-duplicate once afterwards.  Checking each tag against the
            # shared `names` set separately would make a read that carries
            # both tags register under IH and then be skipped by the NH
            # check on its very first appearance, so it was never counted.
            multi_mapped = False
            if has_ih:
                try:
                    multi_mapped = read.opt('IH') > 1
                except KeyError:
                    # missing IH tag - ignore
                    if not read.is_unmapped:
                        has_ih = False
            if has_nh:
                try:
                    multi_mapped = read.opt('NH') > 1 or multi_mapped
                except KeyError:
                    # missing NH tag - ignore
                    if not read.is_unmapped:
                        has_nh = False

            if multi_mapped:
                if read.qname in names:
                    # reads only count once for this...
                    continue
                names.add(read.qname)

            flag_counts.add(read.flag)
            total += 1

            if read.is_unmapped:
                unmapped += 1
                continue

            mapped += 1

            if read.is_proper_pair and read.tid == read.mrnm:
                # We don't care about pairs that map to different references.
                # Note: this doesn't work for RNA mapped to a reference
                # genome. For RNA, you'd need to map to a transcript library
                # (refseq) to get an accurate template length.
                #
                # Just skipping 'N' cigar values won't cut it either, since
                # the pairs will likely silently span a gap.
                k = -read.tlen if read.is_reverse else read.tlen
                tlen_counts[k] = tlen_counts.get(k, 0) + 1

            refname = bamfile.getrname(read.rname)
            if delim:
                refs[refname.split(delim)[0]] += 1
            else:
                refs[refname] += 1

            if regiontagger:
                regiontagger.add_read(read, refname)

            for tag in tagbins:
                tagbins[tag].add(read)

    except KeyboardInterrupt:
        sys.stderr.write('*** Interrupted - displaying stats up to this point! ***\n\n')

    self.total = total
    self.mapped = mapped
    self.unmapped = unmapped
    self.flag_counts = flag_counts
    self.tagbins = tagbins
    self.refs = refs
    self.regiontagger = regiontagger
    self.tlen_counts = tlen_counts