Esempio n. 1
0
def bam_find_regions(bam_name, merge_distance=10, min_read_count=2, only_uniq_starts=False, nostrand=False, out=sys.stdout):
    """Scan a BAM pileup and write out contiguous expressed regions.

    Pileup columns are walked in the order produced by ``bam_pileup_iter``.
    Reads are accumulated into an ``ExpressedRegion`` per strand; a new region
    is started when the chromosome changes or when the current column lies
    more than ``merge_distance`` past the end of the open region.

    Parameters:
        bam_name: value handed to ``bam_open`` to open the BAM file.
        merge_distance: maximum gap (in pileup positions) between the end of
            the open region and the next column for them to be merged.
        min_read_count: a region is written only if its ``read_count`` is at
            least this value.
        only_uniq_starts: forwarded unchanged to ``ExpressedRegion``.
        nostrand: when true, strand is ignored and every read is collected
            into a single (plus) region track.
        out: object passed to ``ExpressedRegion.write``.

    The BAM handle is always closed, including when an error is raised while
    iterating or writing.
    """
    bamfile = bam_open(bam_name)

    # One open region per strand track. With nostrand, only '+' is ever used.
    open_regions = {'+': None, '-': None}

    def flush(region):
        # Emit a finished region only if it gathered enough reads.
        if region and region.read_count >= min_read_count:
            region.write(out)

    try:
        # NOTE(review): mask=1540 looks like SAM flag bits 0x4|0x200|0x400
        # (unmapped, QC-fail, duplicate) - confirm against bam_pileup_iter.
        for pileup in bam_pileup_iter(bamfile, mask=1540):
            chrom = bamfile.getrname(pileup.tid)

            for read in pileup.pileups:
                if read.is_del:
                    continue

                if nostrand or not read.alignment.is_reverse:
                    strand = '+'
                else:
                    strand = '-'

                region = open_regions[strand]
                starts_new_region = (
                    not region
                    or region.chrom != chrom
                    or (region.end + merge_distance) < pileup.pos
                )
                if starts_new_region:
                    flush(region)
                    region = ExpressedRegion(chrom, only_uniq_starts)
                    open_regions[strand] = region
                region.add_column(read, pileup.pos)

        # Emit whatever is still open once the pileup is exhausted.
        flush(open_regions['+'])
        flush(open_regions['-'])
    finally:
        bamfile.close()
Esempio n. 2
0
def bam_tofastx(fname,
                colorspace=False,
                show_mapped=True,
                show_unmapped=True,
                fastq=True,
                read1=True,
                read2=True,
                proper=False):
    """Write the reads of a BAM file as FASTQ or FASTA records.

    Parameters:
        fname: value handed to ``bam_open`` to open the input.
        colorspace: forwarded to ``write_fastq`` / ``write_fasta``.
        show_mapped: include reads that are not flagged unmapped.
        show_unmapped: include reads that are flagged unmapped.
        fastq: when true use ``write_fastq``, otherwise ``write_fasta``.
        read1: when false, reads flagged ``is_read1`` are skipped.
        read2: when false, reads flagged ``is_read2`` are skipped.
        proper: when true, only reads flagged ``is_proper_pair`` are kept.

    A read whose (name, sequence) pair equals that of the previously written
    read is skipped, so consecutive duplicates are emitted only once.

    The BAM handle is always closed when the function finishes, including on
    error.
    """
    if show_mapped is False and show_unmapped is False:
        # Nothing could ever be selected; do not even open the file.
        return

    sam = bam_open(fname)
    try:
        last_key = None

        for read in bam_iter(sam):
            if not read1 and read.is_read1:
                continue
            if not read2 and read.is_read2:
                continue

            if proper and not read.is_proper_pair:
                continue

            k = (read.qname, read.seq)
            if last_key == k:
                continue

            if read.is_unmapped:
                show = bool(show_unmapped)
            else:
                show = bool(show_mapped)

            if not show:
                continue

            if fastq:
                write_fastq(read, colorspace=colorspace)
            else:
                write_fasta(read, colorspace=colorspace)

            # Only reads that were actually written update the dedupe key.
            last_key = k
    finally:
        sam.close()
Esempio n. 3
0
def bam_find_regions(bam_name,
                     merge_distance=10,
                     min_read_count=2,
                     only_uniq_starts=False,
                     nostrand=False,
                     out=sys.stdout):
    """Scan a BAM pileup and write out contiguous expressed regions.

    Pileup columns are walked in the order produced by ``bam_pileup_iter``.
    Reads are accumulated into an ``ExpressedRegion`` per strand; a new region
    is started when the chromosome changes or when the current column lies
    more than ``merge_distance`` past the end of the open region.

    Parameters:
        bam_name: value handed to ``bam_open`` to open the BAM file.
        merge_distance: maximum gap (in pileup positions) between the end of
            the open region and the next column for them to be merged.
        min_read_count: a region is written only if its ``read_count`` is at
            least this value.
        only_uniq_starts: forwarded unchanged to ``ExpressedRegion``.
        nostrand: when true, strand is ignored and every read is collected
            into a single (plus) region track.
        out: object passed to ``ExpressedRegion.write``.

    The BAM handle is always closed, including when an error is raised while
    iterating or writing.
    """
    bamfile = bam_open(bam_name)

    # One open region per strand track. With nostrand, only '+' is ever used.
    open_regions = {'+': None, '-': None}

    def flush(region):
        # Emit a finished region only if it gathered enough reads.
        if region and region.read_count >= min_read_count:
            region.write(out)

    try:
        # NOTE(review): mask=1540 looks like SAM flag bits 0x4|0x200|0x400
        # (unmapped, QC-fail, duplicate) - confirm against bam_pileup_iter.
        for pileup in bam_pileup_iter(bamfile, mask=1540):
            chrom = bamfile.getrname(pileup.tid)

            for read in pileup.pileups:
                if read.is_del:
                    continue

                if nostrand or not read.alignment.is_reverse:
                    strand = '+'
                else:
                    strand = '-'

                region = open_regions[strand]
                starts_new_region = (
                    not region
                    or region.chrom != chrom
                    or (region.end + merge_distance) < pileup.pos
                )
                if starts_new_region:
                    flush(region)
                    region = ExpressedRegion(chrom, only_uniq_starts)
                    open_regions[strand] = region
                region.add_column(read, pileup.pos)

        # Emit whatever is still open once the pileup is exhausted.
        flush(open_regions['+'])
        flush(open_regions['-'])
    finally:
        bamfile.close()
Esempio n. 4
0
def bam_tofastx(fname, colorspace=False, show_mapped=True, show_unmapped=True, fastq=True, read1=True, read2=True, proper=False):
    """Write the reads of a BAM file as FASTQ or FASTA records.

    Parameters:
        fname: value handed to ``bam_open`` to open the input.
        colorspace: forwarded to ``write_fastq`` / ``write_fasta``.
        show_mapped: include reads that are not flagged unmapped.
        show_unmapped: include reads that are flagged unmapped.
        fastq: when true use ``write_fastq``, otherwise ``write_fasta``.
        read1: when false, reads flagged ``is_read1`` are skipped.
        read2: when false, reads flagged ``is_read2`` are skipped.
        proper: when true, only reads flagged ``is_proper_pair`` are kept.

    A read whose (name, sequence) pair equals that of the previously written
    read is skipped, so consecutive duplicates are emitted only once.

    The BAM handle is always closed when the function finishes, including on
    error.
    """
    if show_mapped is False and show_unmapped is False:
        # Nothing could ever be selected; do not even open the file.
        return

    sam = bam_open(fname)
    try:
        last_key = None

        for read in bam_iter(sam):
            if not read1 and read.is_read1:
                continue
            if not read2 and read.is_read2:
                continue

            if proper and not read.is_proper_pair:
                continue

            k = (read.qname, read.seq)
            if last_key == k:
                continue

            if read.is_unmapped:
                show = bool(show_unmapped)
            else:
                show = bool(show_mapped)

            if not show:
                continue

            if fastq:
                write_fastq(read, colorspace=colorspace)
            else:
                write_fasta(read, colorspace=colorspace)

            # Only reads that were actually written update the dedupe key.
            last_key = k
    finally:
        sam.close()
Esempio n. 5
0
def usage():
    """Print the module docstring and the command-line synopsis, then exit.

    Always terminates the process via ``sys.exit(1)``.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(__doc__)
    print("""\
Usage: bamutils peakheight {options} bamfile peaks.bed
""")
    sys.exit(1)

if __name__ == "__main__":
    # Command-line entry point: expects an existing BAM file followed by an
    # existing BED file; '-h' or anything unrecognised prints usage and exits.
    bam_fname = None
    bed_fname = None

    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        elif not bam_fname and os.path.exists(arg):
            # The first existing path is taken as the BAM file...
            bam_fname = arg
        elif not bed_fname and os.path.exists(arg):
            # ...and the second existing path as the BED file.
            bed_fname = arg
        else:
            # Single-argument print(...) is valid on Python 2 and Python 3.
            print('Unknown argument: %s' % arg)
            usage()
    if not bam_fname or not bed_fname:
        usage()

    bam = bam_open(bam_fname)
    try:
        with open(bed_fname) as f:
            bam_peakheight(bam, f)
    finally:
        # Release the BAM handle even if bam_peakheight raises.
        bam.close()
Esempio n. 6
0
def bam_stats(infiles, gtf_file=None, region=None, delim=None, tags=None, show_all=False, fillin_stats=True):
    """Write a tab-delimited statistics report for one or more BAM files.

    A ``BamStats`` object is built for every entry of ``infiles`` and the
    report is written to ``sys.stdout`` with one column group per file:
    read totals, flag distribution, template length (if collected), per-tag
    mean / max / distribution, per-reference counts and, when ``gtf_file``
    is given, per-region counts with CPM. A progress line goes to
    ``sys.stderr``.

    Parameters:
        infiles: BAM names, each opened with ``bam_open``.
        gtf_file: optional annotation; when set it is loaded with ``GTF`` and
            the "Mapping regions" section is written.
        region, delim, show_all: forwarded unchanged to ``BamStats``.
        tags: forwarded to ``BamStats``; ``None`` (the default) means an
            empty list.
        fillin_stats: when true, integer bins missing from a tag distribution
            are written as explicit zero-count rows.

    The tag set, reference names and region names are taken from the first
    file's stats, as in the section headers.
    """
    # Avoid a shared mutable default; callers that pass a list are unaffected.
    if tags is None:
        tags = []

    gtf = GTF(gtf_file) if gtf_file else None

    sys.stderr.write('Calculating Read stats...\n')

    stats = [BamStats(bam_open(x), gtf, region, delim, tags, show_all=show_all) for x in infiles]

    out = sys.stdout

    # --- header and read totals -------------------------------------------
    out.write('\t')
    for fname, _ in zip(infiles, stats):
        out.write('%s\t\t' % fname)
    out.write('\n')

    for label, attr in (('Reads', 'total'), ('Mapped', 'mapped'), ('Unmapped', 'unmapped')):
        out.write('%s:\t' % label)
        for stat in stats:
            out.write('%s\t\t' % getattr(stat, attr))
        out.write('\n')

    # --- flag distribution --------------------------------------------------
    out.write('\nFlag distribution\n')
    validflags = set()
    maxsize = 0
    for flag in flag_descriptions:
        for stat in stats:
            if stat.flag_counts.counts[flag] > 0:
                validflags.add(flag)
                maxsize = max(maxsize, len(flag_descriptions[flag]))

    for flag in sorted(validflags):
        out.write("[0x%03x] %-*s" % (flag, maxsize, flag_descriptions[flag]))
        for stat in stats:
            flag_count = stat.flag_counts.counts[flag]
            # A file with no reads must not abort the whole report.
            if stat.total:
                pct = float(flag_count) * 100 / stat.total
            else:
                pct = 0.0
            out.write('\t%s\t%0.2f%%' % (flag_count, pct))
        out.write('\n')
    out.write('\n')

    # --- template length ----------------------------------------------------
    if stats[0].tlen_counts:
        out.write('Template length:')
        for stat in stats:
            mean, stdev = counts_mean_stdev(stat.tlen_counts)
            out.write('\t%0.2f\t+/- %0.2f' % (mean, stdev))
        out.write('\n')
    out.write('\n')

    # --- per-tag summaries and distributions --------------------------------
    stat_tags = {}
    for tag in stats[0].tagbins:
        stat_tags[tag] = [stat.tagbins[tag] for stat in stats]

    for tag in stat_tags:
        asc = stats[0].tagbins[tag].asc

        # Every value is followed by a tab, including the last one; this
        # matches the two-cells-per-file layout of the rows above.
        out.write("Ave %s:" % tag)
        for tagbin in stat_tags[tag]:
            out.write('\t%s\t' % tagbin.mean)
        out.write('\n')

        out.write("Max %s:" % tag)
        for tagbin in stat_tags[tag]:
            out.write('\t%s\t' % tagbin.max)
        out.write('\n')

        out.write('%s distribution:\n' % tag)

        # One generator per file; gen_vals holds each file's pending tuple
        # (read as value, count, pct below) and last_pcts the most recent
        # third element written for that file.
        gens = [stat.distribution_gen(tag) for stat in stats]
        gen_vals = [None] * len(stats)
        last_pcts = [0.0] * len(stats)

        progressed = True
        last = None

        while progressed:
            progressed = False

            # Refill every empty slot; exhausted generators stay empty.
            for i, gen in enumerate(gens):
                if not gen_vals[i]:
                    try:
                        # next() builtin works on Python 2.6+ and Python 3.
                        gen_vals[i] = next(gen)
                    except StopIteration:
                        pass

            vals = [tup[0] for tup in gen_vals if tup]
            if not vals:
                break

            # The next row is the smallest pending bin when ascending,
            # the largest when descending.
            minval = min(vals) if asc else max(vals)

            # Compare against None explicitly so that a previous bin of 0
            # still gets its following gap filled in.
            if fillin_stats and last is not None and isinstance(last, int):
                step = 1 if asc else -1
                missing = last + step
                while (missing < minval) if asc else (missing > minval):
                    out.write('%s' % missing)
                    for pct in last_pcts:
                        out.write('\t0\t%s' % pct)
                    out.write('\n')
                    missing += step

            last = minval
            out.write(str(minval))

            for i, tup in enumerate(gen_vals):
                if tup and tup[0] == minval:
                    out.write('\t%s\t%s' % (tup[1], tup[2]))
                    last_pcts[i] = tup[2]
                    gen_vals[i] = None  # consumed; refill on the next pass
                    progressed = True
                else:
                    out.write('\t0\t%s' % (last_pcts[i]))
            out.write('\n')
        out.write('\n')

    # --- reference counts ---------------------------------------------------
    out.write('Reference counts')
    for stat in stats:
        out.write('\tcount\t')
    out.write('\n')
    for k in sorted(stats[0].refs):
        out.write('%s' % k)
        for stat in stats:
            out.write('\t%s\t' % stat.refs[k])
        out.write('\n')

    # --- mapping regions (only with an annotation) ---------------------------
    if gtf_file:
        out.write('Mapping regions')
        for stat in stats:
            out.write('\tcount\tCPM')
        out.write('\n')
        for k in sorted(stats[0].regiontagger.counts):
            out.write('%s' % k)
            for stat in stats:
                region_count = stat.regiontagger.counts[k]
                # Counts per million mapped reads: count / (mapped / 1e6).
                # Dividing by mapped and then by 1e6 would shrink the value
                # by a factor of 1e12 relative to CPM.
                if stat.mapped:
                    cpm = float(region_count) * 1000000 / stat.mapped
                else:
                    cpm = 0.0
                out.write('\t%s\t%s' % (region_count, cpm))
            out.write('\n')
Esempio n. 7
0
                    usage()
            elif not ref and os.path.exists(arg) and os.path.exists('%s.fai' % arg):
                if os.path.exists('%s.fai' % arg):
                    ref = arg
                else:
                    print "Missing FAI index on %s" % arg
                    usage()
            elif not regions:
                regions = BedFile(region=arg)
            else:
                print "Unknown option or missing index: %s" % arg
                usage()
    except Exception, e:
        print e
        usage()

    if not bam:
        usage()
    else:
        bamobj = bam_open(bam)
        if profile:
            import cProfile

            def func():
                bam_basecall(bamobj, ref, min_qual, min_count, regions, mask, quiet, showgaps, showstrand, minorpct, altfreq, variants, TimedProfiler())
            sys.stderr.write('Profiling...\n')
            cProfile.run('func()', profile)
        else:
                bam_basecall(bamobj, ref, min_qual, min_count, regions, mask, quiet, showgaps, showstrand, minorpct, altfreq, variants, None)
        bamobj.close()
Esempio n. 8
0
                    usage()
            elif not ref and os.path.exists(arg) and os.path.exists('%s.fai' % arg):
                if os.path.exists('%s.fai' % arg):
                    ref = arg
                else:
                    print "Missing FAI index on %s" % arg
                    usage()
            elif not regions:
                regions = BedFile(region=arg)
            else:
                print "Unknown option or missing index: %s" % arg
                usage()
    except Exception, e:
        print e
        usage()

    if not bam:
        usage()
    else:
        bamobj = bam_open(bam)
        if profile:
            import cProfile

            def func():
                bam_basecall(bamobj, ref, min_qual, min_count, regions, mask, quiet, showgaps, showstrand, minorpct, altfreq, variants, TimedProfiler())
            sys.stderr.write('Profiling...\n')
            cProfile.run('func()', profile)
        else:
                bam_basecall(bamobj, ref, min_qual, min_count, regions, mask, quiet, showgaps, showstrand, minorpct, altfreq, variants, None)
        bamobj.close()
Esempio n. 9
0
Region should be: chr:start-end (start 1-based)

"""
    sys.exit(1)

if __name__ == "__main__":
    # Command-line entry point: first argument is the BAM file, an optional
    # second argument is a region given as chr:start-end (start 1-based).
    fname = None
    ref = None
    start = None
    end = None

    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        elif not fname:
            if os.path.exists(arg):
                fname = arg
            else:
                # Interpolate the offending path into the message.
                usage("%s doesn't exist!" % arg)
        else:
            # Keep the chromosome part as the reference name so that the
            # start/end bounds below are applied to it instead of to None.
            ref, se = arg.split(':')
            start, end = [int(x) for x in se.split('-')]
            # Convert the 1-based start from the command line to 0-based.
            start = start - 1

    if not fname:
        usage()

    bamfile = bam_open(fname)
    try:
        bam_junction_count(bamfile, ref, start, end)
    finally:
        # Release the BAM handle even if counting raises.
        bamfile.close()
Esempio n. 10
0
def bam_stats(infiles, gtf_file=None, region=None, delim=None, tags=None, show_all=False, fillin_stats=True):
    """Write a tab-delimited statistics report for one or more BAM files.

    A ``BamStats`` object is built for every entry of ``infiles`` and the
    report is written to ``sys.stdout`` with one column group per file:
    read totals, flag distribution, template length (if collected), per-tag
    mean / max / distribution, per-reference counts and, when ``gtf_file``
    is given, per-region counts with CPM. A progress line goes to
    ``sys.stderr``.

    Parameters:
        infiles: BAM names, each opened with ``bam_open``.
        gtf_file: optional annotation; when set it is loaded with ``GTF`` and
            the "Mapping regions" section is written.
        region, delim, show_all: forwarded unchanged to ``BamStats``.
        tags: forwarded to ``BamStats``; ``None`` (the default) means an
            empty list.
        fillin_stats: when true, integer bins missing from a tag distribution
            are written as explicit zero-count rows.

    The tag set, reference names and region names are taken from the first
    file's stats, as in the section headers.
    """
    # Avoid a shared mutable default; callers that pass a list are unaffected.
    if tags is None:
        tags = []

    gtf = GTF(gtf_file) if gtf_file else None

    sys.stderr.write('Calculating Read stats...\n')

    stats = [BamStats(bam_open(x), gtf, region, delim, tags, show_all=show_all) for x in infiles]

    out = sys.stdout

    # --- header and read totals -------------------------------------------
    out.write('\t')
    for fname, _ in zip(infiles, stats):
        out.write('%s\t\t' % fname)
    out.write('\n')

    for label, attr in (('Reads', 'total'), ('Mapped', 'mapped'), ('Unmapped', 'unmapped')):
        out.write('%s:\t' % label)
        for stat in stats:
            out.write('%s\t\t' % getattr(stat, attr))
        out.write('\n')

    # --- flag distribution --------------------------------------------------
    out.write('\nFlag distribution\n')
    validflags = set()
    maxsize = 0
    for flag in flag_descriptions:
        for stat in stats:
            if stat.flag_counts.counts[flag] > 0:
                validflags.add(flag)
                maxsize = max(maxsize, len(flag_descriptions[flag]))

    for flag in sorted(validflags):
        out.write("[0x%03x] %-*s" % (flag, maxsize, flag_descriptions[flag]))
        for stat in stats:
            flag_count = stat.flag_counts.counts[flag]
            # A file with no reads must not abort the whole report.
            if stat.total:
                pct = float(flag_count) * 100 / stat.total
            else:
                pct = 0.0
            out.write('\t%s\t%0.2f%%' % (flag_count, pct))
        out.write('\n')
    out.write('\n')

    # --- template length ----------------------------------------------------
    if stats[0].tlen_counts:
        out.write('Template length:')
        for stat in stats:
            mean, stdev = counts_mean_stdev(stat.tlen_counts)
            out.write('\t%0.2f\t+/- %0.2f' % (mean, stdev))
        out.write('\n')
    out.write('\n')

    # --- per-tag summaries and distributions --------------------------------
    stat_tags = {}
    for tag in stats[0].tagbins:
        stat_tags[tag] = [stat.tagbins[tag] for stat in stats]

    for tag in stat_tags:
        asc = stats[0].tagbins[tag].asc

        # Every value is followed by a tab, including the last one; this
        # matches the two-cells-per-file layout of the rows above.
        out.write("Ave %s:" % tag)
        for tagbin in stat_tags[tag]:
            out.write('\t%s\t' % tagbin.mean)
        out.write('\n')

        out.write("Max %s:" % tag)
        for tagbin in stat_tags[tag]:
            out.write('\t%s\t' % tagbin.max)
        out.write('\n')

        out.write('%s distribution:\n' % tag)

        # One generator per file; gen_vals holds each file's pending tuple
        # (read as value, count, pct below) and last_pcts the most recent
        # third element written for that file.
        gens = [stat.distribution_gen(tag) for stat in stats]
        gen_vals = [None] * len(stats)
        last_pcts = [0.0] * len(stats)

        progressed = True
        last = None

        while progressed:
            progressed = False

            # Refill every empty slot; exhausted generators stay empty.
            for i, gen in enumerate(gens):
                if not gen_vals[i]:
                    try:
                        # next() builtin works on Python 2.6+ and Python 3.
                        gen_vals[i] = next(gen)
                    except StopIteration:
                        pass

            vals = [tup[0] for tup in gen_vals if tup]
            if not vals:
                break

            # The next row is the smallest pending bin when ascending,
            # the largest when descending.
            minval = min(vals) if asc else max(vals)

            # Compare against None explicitly so that a previous bin of 0
            # still gets its following gap filled in.
            if fillin_stats and last is not None and isinstance(last, int):
                step = 1 if asc else -1
                missing = last + step
                while (missing < minval) if asc else (missing > minval):
                    out.write('%s' % missing)
                    for pct in last_pcts:
                        out.write('\t0\t%s' % pct)
                    out.write('\n')
                    missing += step

            last = minval
            out.write(str(minval))

            for i, tup in enumerate(gen_vals):
                if tup and tup[0] == minval:
                    out.write('\t%s\t%s' % (tup[1], tup[2]))
                    last_pcts[i] = tup[2]
                    gen_vals[i] = None  # consumed; refill on the next pass
                    progressed = True
                else:
                    out.write('\t0\t%s' % (last_pcts[i]))
            out.write('\n')
        out.write('\n')

    # --- reference counts ---------------------------------------------------
    out.write('Reference counts')
    for stat in stats:
        out.write('\tcount\t')
    out.write('\n')
    for k in sorted(stats[0].refs):
        out.write('%s' % k)
        for stat in stats:
            out.write('\t%s\t' % stat.refs[k])
        out.write('\n')

    # --- mapping regions (only with an annotation) ---------------------------
    if gtf_file:
        out.write('Mapping regions')
        for stat in stats:
            out.write('\tcount\tCPM')
        out.write('\n')
        for k in sorted(stats[0].regiontagger.counts):
            out.write('%s' % k)
            for stat in stats:
                region_count = stat.regiontagger.counts[k]
                # Counts per million mapped reads: count / (mapped / 1e6).
                # Dividing by mapped and then by 1e6 would shrink the value
                # by a factor of 1e12 relative to CPM.
                if stat.mapped:
                    cpm = float(region_count) * 1000000 / stat.mapped
                else:
                    cpm = 0.0
                out.write('\t%s\t%s' % (region_count, cpm))
            out.write('\n')
Esempio n. 11
0
            usage()
        elif not bamfile:
            if not os.path.exists(arg):
                usage("Missing or non-existant bamfile: %s" % arg)
            if not os.path.exists("%s.bai" % arg):
                usage("Missing bam index (bai) file: %s" % arg)

            bamfile = arg

    if not model or not model_arg:
        usage("Missing model! Must include one of: %s" % ", ".join(count.models))
    elif not bamfile:
        usage("Missing BAM file!")

    modelobj = count.models[model](model_arg)
    bam = bam_open(bamfile)
    modelobj.count(
        bam,
        stranded,
        coverage,
        uniq_only,
        fpkm,
        norm,
        multiple,
        whitelist,
        blacklist,
        rev_read2=rev_read2,
        start_only=startonly,
    )
    bam.close()
Esempio n. 12
0
            last = arg
        elif arg in ['-norm', '-multiple', '-whitelist', '-blacklist', '-library']:
            last = arg
        elif arg == '-startonly':
            startonly = True
        elif arg == '-coverage':
            coverage = True
        elif arg == '-fpkm':
            fpkm = True
        elif arg == '-uniq':
            uniq_only = True
        elif arg == '-h':
            usage()
        elif not bamfile:
            if not os.path.exists(arg):
                usage('Missing or non-existant bamfile: %s' % arg)
            if not os.path.exists('%s.bai' % arg):
                usage('Missing bam index (bai) file: %s' % arg)

            bamfile = arg

    if not model or not model_arg:
        usage('Missing model! Must include one of: %s' % ', '.join(count.models))
    elif not bamfile:
        usage('Missing BAM file!')

    modelobj = count.models[model](model_arg)
    bam = bam_open(bamfile)
    modelobj.count(bam, library_type, coverage, uniq_only, fpkm, norm, multiple, whitelist, blacklist, start_only=startonly)
    bam.close()