def bam_innerdist(bam1, bam2, summaryout=None):
    """Tally the distance between mates stored in two parallel BAM inputs.

    Both inputs are walked in lockstep through ``bam_iter``. Within each
    input, records whose ``qname`` equals the name of the record processed
    in the previous round are skipped, so every name is considered once.

    For each pair where both reads are mapped to the same reference
    (``tid``), the distance is the ``pos`` of the downstream read minus the
    ``aend`` of the upstream read.

    Args:
        bam1: first input, handed to ``bam_iter``.
        bam2: second input, handed to ``bam_iter`` with ``quiet=True``.
        summaryout: optional object with a ``write`` method; when truthy,
            every distance is written to it, one per line.

    Returns:
        A tuple ``(total, proper, mean, stdev, orientation_count)`` where
        ``total`` is the number of name-matched pairs seen, ``proper`` the
        number that passed the mapped/same-reference filter, ``mean`` and
        ``stdev`` come from ``counts_mean_stdev`` over the distance
        histogram, and ``orientation_count`` maps ``"+/-"``-style strand
        labels to counts.

    Raises:
        ValueError: if the current reads of the two inputs have different
            names.

    NOTE(review): iteration stops as soon as either input runs out; any
    reads left in the other input are ignored without warning.
    NOTE(review): this file appears to define ``bam_innerdist`` a second
    time further down; the later definition shadows this one -- confirm
    which should be kept.
    """
    iter1 = bam_iter(bam1)
    iter2 = bam_iter(bam2, quiet=True)

    distances = {}  # distance -> number of pairs with that distance
    total = 0
    proper = 0
    orientation_count = {"+/-": 0, "-/+": 0, "+/+": 0, "-/-": 0}

    read1_last = None
    read2_last = None
    read1 = None
    read2 = None

    while True:
        try:
            # Advance each side past records carrying the name that was
            # already handled. The builtin next() works on both Python 2.6+
            # and Python 3, unlike the iterator's .next() method.
            while not read1 or read1_last == read1.qname:
                read1 = next(iter1)
            while not read2 or read2_last == read2.qname:
                read2 = next(iter2)
        except StopIteration:
            break

        if read1.qname != read2.qname:
            raise ValueError(
                "Error: BAM files aren't properly paired! (%s, %s)\n"
                % (read1.qname, read2.qname))

        read1_last = read1.qname
        read2_last = read2.qname

        total += 1

        if read1.is_unmapped or read2.is_unmapped or read1.tid != read2.tid:
            continue

        proper += 1

        # Measure from the end of the upstream read to the start of the
        # downstream one.
        if read1.pos < read2.pos:
            dist = read2.pos - read1.aend
        else:
            dist = read1.pos - read2.aend

        if summaryout:
            summaryout.write("%s\n" % dist)

        distances[dist] = distances.get(dist, 0) + 1

        orientation = "%s/%s" % ("-" if read1.is_reverse else "+",
                                 "-" if read2.is_reverse else "+")
        orientation_count[orientation] += 1

    mean, stdev = counts_mean_stdev(distances)
    return total, proper, mean, stdev, orientation_count
def bam_innerdist(bam1, bam2, summaryout=None):
    """Collect inner-distance and strand-orientation counts for paired reads.

    The two inputs are read side by side via ``bam_iter``; each is expected
    to yield reads in the same name order. Consecutive records that repeat
    the name handled in the previous round are skipped on each side.

    A pair contributes a distance only when both reads are mapped and share
    the same ``tid``. The distance is ``pos`` of the read that starts later
    minus ``aend`` of the read that starts earlier.

    Args:
        bam1: first input, passed to ``bam_iter``.
        bam2: second input, passed to ``bam_iter`` with ``quiet=True``.
        summaryout: optional writable object; if truthy, each distance is
            written to it followed by a newline.

    Returns:
        ``(total, proper, mean, stdev, orientation_count)``: the number of
        name-matched pairs, the number of pairs that passed the
        mapped/same-reference filter, the mean and standard deviation
        returned by ``counts_mean_stdev`` for the distance histogram, and a
        dict of strand-combination counts keyed ``'+/-'``, ``'-/+'``,
        ``'+/+'`` and ``'-/-'``.

    Raises:
        ValueError: when the current read names of the two inputs differ.

    NOTE(review): the loop ends when either input is exhausted, so surplus
    reads in the longer input are silently dropped.
    NOTE(review): an earlier definition with the same name appears to exist
    in this file; this later one is the binding that wins -- confirm the
    duplicate is intentional.
    """
    iter1 = bam_iter(bam1)
    iter2 = bam_iter(bam2, quiet=True)

    distances = {}  # distance -> occurrence count
    total = 0
    proper = 0
    orientation_count = {
        '+/-': 0,
        '-/+': 0,
        '+/+': 0,
        '-/-': 0,
    }

    read1_last = None
    read2_last = None
    read1 = None
    read2 = None

    while True:
        try:
            # next(it) instead of it.next(): the method form only exists on
            # Python 2 iterators, the builtin works on 2.6+ and 3.x alike.
            while not read1 or read1_last == read1.qname:
                read1 = next(iter1)
            while not read2 or read2_last == read2.qname:
                read2 = next(iter2)
        except StopIteration:
            break

        if read1.qname != read2.qname:
            raise ValueError(
                "Error: BAM files aren't properly paired! (%s, %s)\n" %
                (read1.qname, read2.qname))

        read1_last = read1.qname
        read2_last = read2.qname

        total += 1

        if read1.is_unmapped or read2.is_unmapped or read1.tid != read2.tid:
            continue

        proper += 1

        # Gap between the end of the leftmost read and the start of its mate.
        if read1.pos < read2.pos:
            dist = read2.pos - read1.aend
        else:
            dist = read1.pos - read2.aend

        if summaryout:
            summaryout.write('%s\n' % dist)

        distances[dist] = distances.get(dist, 0) + 1

        orientation = '%s/%s' % ('-' if read1.is_reverse else '+',
                                 '-' if read2.is_reverse else '+')
        orientation_count[orientation] += 1

    mean, stdev = counts_mean_stdev(distances)
    return total, proper, mean, stdev, orientation_count
def _write_tag_distribution(out, stats, tag, asc, fillin_stats):
    """Write the merged value/count/percent table of one tag for all inputs.

    Each stat's ``distribution_gen(tag)`` is consumed as a stream of tuples
    whose first three items are written as value, count and percentage.
    The streams are merged on the value: ascending when ``asc`` is true,
    descending otherwise. Inputs that lack the current value get a count of
    0 and repeat the last percentage written for them.
    """
    gens = [stat.distribution_gen(tag) for stat in stats]
    gen_vals = [None] * len(stats)   # pending (not yet written) tuple per input
    last_pcts = [0.0] * len(stats)   # last percentage written per input
    step = 1 if asc else -1
    last = None

    good = True
    while good:
        good = False

        # Refill every empty slot from its generator; exhausted generators
        # simply leave their slot empty.
        for i, gen in enumerate(gens):
            if not gen_vals[i]:
                try:
                    gen_vals[i] = next(gen)
                except StopIteration:
                    pass

        vals = [tup[0] for tup in gen_vals if tup]
        if not vals:
            break

        minval = min(vals) if asc else max(vals)

        # Emit placeholder rows for integer values skipped since the previous
        # row. The check is on the type rather than truthiness so that a
        # previous value of 0 still gets its gap filled.
        if fillin_stats and isinstance(last, int):
            filler = last + step
            while (filler < minval) if asc else (filler > minval):
                out.write('%s' % filler)
                for pct in last_pcts:
                    out.write('\t0\t%s' % pct)
                out.write('\n')
                filler += step

        last = minval
        out.write(str(minval))

        for i, tup in enumerate(gen_vals):
            if tup and tup[0] == minval:
                out.write('\t%s\t%s' % (tup[1], tup[2]))
                last_pcts[i] = tup[2]
                gen_vals[i] = None  # consumed; refill on the next round
                good = True
            else:
                out.write('\t0\t%s' % (last_pcts[i]))
        out.write('\n')


def bam_stats(infiles, gtf_file=None, region=None, delim=None, tags=None, show_all=False, fillin_stats=True):
    """Compute read statistics for several BAM files and print them side by side.

    One ``BamStats`` object is built per input file, then a tab-delimited
    report is written to stdout with two columns per file. The report has
    these sections: read totals, flag distribution, template length (only
    when the first file has template-length counts), per-tag mean / max /
    distribution, reference counts and, when ``gtf_file`` is given, mapping
    region counts with counts-per-million.

    Tags, reference names and region names are taken from the first file's
    stats and looked up in every other file's stats.

    Args:
        infiles: sequence of file names; each is opened with ``bam_open``.
        gtf_file: optional annotation file name; loaded with ``GTF`` and
            enables the "Mapping regions" section.
        region: forwarded unchanged to ``BamStats``.
        delim: forwarded unchanged to ``BamStats``.
        tags: forwarded to ``BamStats``; ``None`` (the default) is treated
            as an empty list.
        show_all: forwarded to ``BamStats`` as a keyword argument.
        fillin_stats: when true, integer-valued tag distributions get
            zero-count rows for values that none of the files contain.

    Returns:
        None. A progress message goes to stderr, the report to stdout.
    """
    # Avoid sharing one mutable default list between calls.
    if tags is None:
        tags = []

    if gtf_file:
        gtf = GTF(gtf_file)
    else:
        gtf = None

    sys.stderr.write('Calculating Read stats...\n')

    stats = [BamStats(bam_open(x), gtf, region, delim, tags, show_all=show_all) for x in infiles]

    out = sys.stdout

    # Header row plus the three basic totals.
    out.write('\t')
    for fname in infiles:
        out.write('%s\t\t' % fname)
    out.write('\n')

    out.write('Reads:\t')
    for stat in stats:
        out.write('%s\t\t' % stat.total)
    out.write('\n')

    out.write('Mapped:\t')
    for stat in stats:
        out.write('%s\t\t' % stat.mapped)
    out.write('\n')

    out.write('Unmapped:\t')
    for stat in stats:
        out.write('%s\t\t' % stat.unmapped)
    out.write('\n')

    # Flag distribution: only flags seen in at least one file are listed.
    out.write('\nFlag distribution\n')
    validflags = set()
    maxsize = 0
    for flag in flag_descriptions:
        for stat in stats:
            if stat.flag_counts.counts[flag] > 0:
                validflags.add(flag)
                maxsize = max(maxsize, len(flag_descriptions[flag]))

    for flag in sorted(validflags):
        out.write("[0x%03x] %-*s" % (flag, maxsize, flag_descriptions[flag]))
        for stat in stats:
            count = stat.flag_counts.counts[flag]
            # A file without reads would otherwise divide by zero.
            pct = float(count) * 100 / stat.total if stat.total else 0.0
            out.write('\t%s\t%0.2f%%' % (count, pct))
        out.write('\n')
    out.write('\n')

    if stats[0].tlen_counts:
        out.write('Template length:')
        for stat in stats:
            mean, stdev = counts_mean_stdev(stat.tlen_counts)
            out.write('\t%0.2f\t+/- %0.2f' % (mean, stdev))
        out.write('\n')

    out.write('\n')

    # Group the per-file bins by tag, using the first file's tag set.
    stat_tags = {}
    for tag in stats[0].tagbins:
        stat_tags[tag] = []
        for stat in stats:
            stat_tags[tag].append(stat.tagbins[tag])

    for tag in stat_tags:
        asc = stats[0].tagbins[tag].asc

        # Every value is followed by a tab so that each file keeps its two
        # columns (the original guarded this with a condition that was
        # always true).
        out.write("Ave %s:" % tag)
        for tagbin in stat_tags[tag]:
            out.write('\t%s\t' % tagbin.mean)
        out.write('\n')

        out.write("Max %s:" % tag)
        for tagbin in stat_tags[tag]:
            out.write('\t%s\t' % tagbin.max)
        out.write('\n')

        out.write('%s distribution:\n' % tag)
        _write_tag_distribution(out, stats, tag, asc, fillin_stats)
        out.write('\n')

    out.write('Reference counts')
    for stat in stats:
        out.write('\tcount\t')
    out.write('\n')
    for k in sorted(stats[0].refs):
        out.write('%s' % k)
        for stat in stats:
            out.write('\t%s\t' % stat.refs[k])
        out.write('\n')

    if gtf_file:
        out.write('Mapping regions')
        for stat in stats:
            out.write('\tcount\tCPM')
        out.write('\n')

        for k in sorted(stats[0].regiontagger.counts):
            out.write('%s' % k)
            for stat in stats:
                count = stat.regiontagger.counts[k]
                # Counts per million mapped reads: count / (mapped / 1e6).
                if stat.mapped:
                    cpm = float(count) * 1000000 / stat.mapped
                else:
                    cpm = 0.0
                out.write('\t%s\t%s' % (count, cpm))
            out.write('\n')