Example #1
# imports assumed from the CGAT tool this snippet is excerpted from
import os
import sys

import numpy
import pandas
import pysam

import cgat.GTF as GTF
import cgatcore.experiment as E
import cgatcore.iotools as iotools

# assumed helper functions from the surrounding CGAT module (not shown
# here): bam2stats_count, computeMappedReadsFromAlignments, writeNH


def main(argv=None):
    """Script main.

    Parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--mask-bed-file",
        "--mask-gff-file",
        dest="filename_bed",
        type="string",
        metavar='GFF',
        help="gff formatted file with masking locations. The number of "
        "reads overlapping the intervals in the given file will be "
        "computed. Note that the computation currently does not take "
        "into account indels, so it is an approximate count only. "
        "[%default]")

    parser.add_option(
        "-f",
        "--ignore-masked-reads",
        dest="ignore_masked_reads",
        action="store_true",
        help="as well as counting reads in the file given by --mask-bed-file, "
        "also remove these reads for duplicate and match statistics. "
        "[%default]")

    parser.add_option(
        "-i",
        "--num-reads",
        dest="input_reads",
        type="int",
        help="the number of reads - if given, used to provide percentages "
        "[%default]")

    parser.add_option(
        "-d",
        "--output-details",
        dest="output_details",
        action="store_true",
        help="output per-read details into a separate file. Read names are "
        "md5/base64 encoded [%default]")

    parser.add_option(
        "--output-readmap",
        dest="output_readmap",
        action="store_true",
        help="output map between read name and "
        "md5/base64 encoded short name [%default]")

    parser.add_option(
        "--add-alignment-details",
        dest="add_alignment_details",
        action="store_true",
        help="add alignment details to per-read details. Implies "
        "--output-details [%default]")

    parser.add_option(
        "-q",
        "--fastq-file",
        dest="filename_fastq",
        help="filename with sequences and quality scores. This file is only "
        "used to collect sequence identifiers. Thus, for paired end data a "
        "single file is sufficient [%default]")

    parser.add_option(
        "--basic-counts",
        dest="detailed_count",
        action="store_false",
        help="perform basic counting and do not compute per-read stats. "
        "This is more memory efficient and computes stats faster, "
        "but only a summary counts table is output [%default]")

    parser.set_defaults(
        filename_bed=None,
        ignore_masked_reads=False,
        input_reads=0,
        force_output=False,
        filename_fastq=None,
        detailed_count=True,
        output_details=False,
        output_readmap=False,
        add_alignment_details=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.filename_bed:
        bed_mask = GTF.readAndIndex(
            GTF.iterator(iotools.open_file(options.filename_bed)))
    else:
        bed_mask = None

    if options.add_alignment_details:
        options.output_details = True

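    # determine the input source: an explicit BAM file given as a
    # positional argument, otherwise read from stdin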
    is_stdin = True
    if len(args) > 0:
        pysam_in = pysam.AlignmentFile(args[0], "rb")
        if args[0] != "-":
            is_stdin = False
    elif options.stdin == sys.stdin:
        pysam_in = pysam.AlignmentFile("-", "rb")
    else:
        pysam_in = pysam.AlignmentFile(options.stdin, "rb")
        if options.stdin != "-":
            is_stdin = False

    if options.output_details:
        outfile_details = E.open_output_file("details", "w")
    else:
        outfile_details = None

    if options.output_readmap:
        outfile_readmap = E.open_output_file("readmap", "w")
    else:
        outfile_readmap = None

    if options.filename_fastq and not os.path.exists(options.filename_fastq):
        raise IOError("file %s does not exist" % options.filename_fastq)

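    # run the counting routine: returns summary counters plus per-tag
    # (NH, NM, MAPQ) histograms and an optional per-read details table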
    (counter, flags_counts, nh_filtered, nh_all,
     nm_filtered, nm_all, mapq, mapq_all, max_hi, details_df) = \
        bam2stats_count(pysam_in,
                        bed_mask=bed_mask,
                        ignore_masked_reads=options.ignore_masked_reads,
                        is_stdin=is_stdin,
                        filename_fastq=options.filename_fastq,
                        outfile_details=outfile_details,
                        add_alignment_details=options.add_alignment_details,
                        outfile_readmap=outfile_readmap,
                        detailed_count=options.detailed_count)

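    # the largest HI (hit index) tag seen should match the largest NH
    # (number of hits) value; a mismatch hints at inconsistent tags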
    if max_hi > 0 and nh_all and max_hi != max(nh_all.keys()):
        E.warn("max_hi (%i) is inconsistent with max_nh (%i) "
               "- counts will be corrected" % (max_hi, max(nh_all.keys())))

    outs = options.stdout
    outs.write("category\tcounts\tpercent\tof\n")

    def _write(outs, text, numerator, denominator, base):
        percent = iotools.pretty_percent(numerator, denominator)
        outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))

    ###############################
    ###############################
    ###############################
    # Output alignment information
    ###############################
    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.alignments_input - nalignments_unmapped

    _write(outs, "alignments_total", counter.alignments_input,
           counter.alignments_input, "alignments_total")

    if counter.alignments_input == 0:
        E.warn("no alignments in BAM file - no further output")
        E.stop()
        return

    _write(outs, "alignments_mapped", nalignments_mapped,
           counter.alignments_input, 'alignments_total')
    _write(outs, "alignments_unmapped", nalignments_unmapped,
           counter.alignments_input, 'alignments_total')

    if nalignments_mapped == 0:
        E.warn("no mapped alignments - no further output")
        E.stop()
        return

    for flag, counts in sorted(flags_counts.items()):
        if flag == "unmapped":
            continue
        _write(outs, 'alignments_' + flag, counts, nalignments_mapped,
               'alignments_mapped')

    if options.filename_bed:
        _write(outs, "alignments_masked", counter.alignments_masked,
               nalignments_mapped, 'alignments_mapped')
        _write(outs, "alignments_notmasked", counter.alignments_notmasked,
               nalignments_mapped, 'alignments_mapped')

    _write(outs, "alignments_filtered", counter.alignments_filtered,
           nalignments_mapped, "alignments_mapped")

    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        _write(outs, "alignments_duplicates", counter.alignments_duplicates,
               counter.alignments_filtered, normby)
        _write(outs, "alignments_unique",
               counter.alignments_filtered - counter.alignments_duplicates,
               counter.alignments_filtered, normby)

    ###############################
    ###############################
    ###############################
    # Output read based information
    ###############################

    # derive the number of mapped reads in file from alignment counts
    if options.filename_fastq or not is_stdin:
        nreads_total = counter.total_read
        _write(outs, "reads_total", counter.total_read, nreads_total,
               'reads_total')
        _write(outs, "reads_unmapped", counter.total_read_is_unmapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped", counter.total_read_is_mapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_missing", counter.total_read_is_missing,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped_unique", counter.total_read_is_mapped_uniq,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_multimapping", counter.total_read_is_mmap,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_mapped_supplementary",
               counter.total_read_has_supplementary,
               counter.total_read_is_mapped, 'reads_mapped')
    else:
        E.warn('inferring read counts from alignments and NH tags')
        nreads_unmapped = flags_counts["unmapped"]
        nreads_mapped = computeMappedReadsFromAlignments(
            nalignments_mapped, nh_all, max_hi)

        nreads_missing = 0
        if options.input_reads:
            nreads_total = options.input_reads
            # unmapped reads in bam file?
            if nreads_unmapped:
                nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
            else:
                nreads_unmapped = nreads_total - nreads_mapped

        elif nreads_unmapped:
            # if unmapped reads are in bam file, take those
            nreads_total = nreads_mapped + nreads_unmapped
        else:
            # otherwise normalize by mapped reads
            nreads_unmapped = 0
            nreads_total = nreads_mapped

        outs.write("reads_total\t%i\t%5.2f\treads_total\n" %
                   (nreads_total, 100.0))
        outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_mapped, 100.0 * nreads_mapped / nreads_total))
        outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total))
        outs.write("reads_missing\t%i\t%5.2f\treads_total\n" %
                   (nreads_missing, 100.0 * nreads_missing / nreads_total))

        if len(nh_all) > 1:
            outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" %
                       (nh_all[1], 100.0 * nh_all[1] / nreads_mapped))

    pysam_in.close()

    ###############################
    ###############################
    ###############################
    # Output pair information
    ###############################
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            pairs_mapped = counter.total_pair_is_mapped

            # sanity check
            assert counter.total_pair_is_mapped == \
                (counter.total_pair_is_proper_uniq +
                 counter.total_pair_is_incomplete_uniq +
                 counter.total_pair_is_incomplete_mmap +
                 counter.total_pair_is_proper_duplicate +
                 counter.total_pair_is_proper_mmap +
                 counter.total_pair_not_proper_uniq +
                 counter.total_pair_is_other)

            outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pairs,
                        100.0 * counter.total_pairs / counter.total_pairs))
            outs.write(
                "pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                (pairs_mapped, 100.0 * pairs_mapped / counter.total_pairs))
            outs.write("pairs_unmapped\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_unmapped, 100.0 *
                        counter.total_pair_is_unmapped / counter.total_pairs))
            outs.write(
                "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_uniq, 100.0 *
                 counter.total_pair_is_proper_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_uniq, 100.0 *
                 counter.total_pair_is_incomplete_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_mmap, 100.0 *
                 counter.total_pair_is_incomplete_mmap / counter.total_pairs))
            outs.write(
                "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_duplicate, 100.0 *
                 counter.total_pair_is_proper_duplicate / counter.total_pairs))
            outs.write(
                "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_mmap, 100.0 *
                 counter.total_pair_is_proper_mmap / counter.total_pairs))
            outs.write(
                "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_not_proper_uniq, 100.0 *
                 counter.total_pair_not_proper_uniq / counter.total_pairs))
            outs.write("pairs_other\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_other, 100.0 *
                        counter.total_pair_is_other / counter.total_pairs))

            nread1_total = counter.total_read1
            _write(outs, "read1_total", counter.total_read1, nread1_total,
                   'read1_total')
            _write(outs, "read1_unmapped", counter.total_read1_is_unmapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped", counter.total_read1_is_mapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped_unique",
                   counter.total_read1_is_mapped_uniq,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "read1_multimapping", counter.total_read1_is_mmap,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "read1_missing", counter.total_read1_is_missing,
                   nread1_total, 'read1_total')

            nread2_total = counter.total_read2
            _write(outs, "read2_total", counter.total_read2, nread2_total,
                   'read2_total')
            _write(outs, "read2_unmapped", counter.total_read2_is_unmapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped", counter.total_read2_is_mapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped_unique",
                   counter.total_read2_is_mapped_uniq,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "read2_multimapping", counter.total_read2_is_mmap,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "read2_missing", counter.total_read2_is_missing,
                   nread2_total, 'read2_total')

        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            _write(outs, "pairs_total", pairs_total, pairs_total,
                   "pairs_total")
            _write(outs, "pairs_mapped", pairs_mapped, pairs_total,
                   "pairs_total")
    else:
        # no paired end data
        pairs_total = pairs_mapped = 0
        outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_total, 0.0))
        outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_mapped, 0.0))

    outs.write("error_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.error_counts, counter.error_rate * 100.0))
    outs.write("insertion_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.insertion_counts, counter.insertion_rate * 100.0))
    outs.write("deletion_rate\t%i\t%5.2f\tmatches+deletions\n" %
               (counter.deletion_counts, counter.deletion_rate * 100.0))
    outs.write("mismatch_rate\t%i\t%5.2f\tmatches\n" %
               (counter.mismatch_counts, counter.mismatch_rate * 100.0))
    outs.write("match_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.match_counts, counter.match_rate * 100.0))

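    # histogram of NM (edit distance) values over filtered alignments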
    if options.force_output or len(nm_filtered) > 0:
        outfile = E.open_output_file("nm", "w")
        outfile.write("NM\talignments\n")
        if len(nm_filtered) > 0:
            for x in range(0, max(nm_filtered.keys()) + 1):
                outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
        else:
            outfile.write("0\t%i\n" % (counter.filtered))
        outfile.close()

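    # histogram of NH (number of hits) values over all reads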
    if options.force_output or len(nh_all) > 1:
        outfile = E.open_output_file("nh_all", "w")
        outfile.write("NH\treads\n")
        if len(nh_all) > 0:
            writeNH(outfile, nh_all, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.mapped_reads))
        outfile.close()

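    # the same NH histogram, restricted to filtered reads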
    if options.force_output or len(nh_filtered) > 1:
        outfile = E.open_output_file("nh", "w")
        outfile.write("NH\treads\n")
        if len(nh_filtered) > 0:
            writeNH(outfile, nh_filtered, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.filtered))
        outfile.close()

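    # MAPQ histogram, for all reads and filtered reads side by side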
    if options.force_output or len(mapq_all) > 1:
        outfile = E.open_output_file("mapq", "w")
        outfile.write("mapq\tall_reads\tfiltered_reads\n")
        for x in range(0, max(mapq_all.keys()) + 1):
            outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))
        outfile.close()

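    # per-read details: write per-metric summary statistics and a
    # histogram of each metric binned over [0, 1)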
    if details_df is not None:
        with E.open_output_file("summaries", "w") as outf:
            details_df.describe().transpose().to_csv(outf,
                                                     sep="\t",
                                                     index_label="metric")
        bins = numpy.arange(0, 1.01, 0.01)
        # pandas.DataFrame.from_items was removed in pandas 1.0; a dict
        # comprehension preserves column order on Python 3.7+
        histogram_df = pandas.DataFrame({
            x: numpy.histogram(details_df[x].dropna(), bins=bins)[0]
            for x in details_df.columns
        })

        histogram_df.index = numpy.arange(0, 1.0, 0.01)

        row_sums = histogram_df.sum(axis=1)
        histogram_df = histogram_df[row_sums != 0]

        with E.open_output_file("histogram", "w") as outf:
            histogram_df.to_csv(outf, sep="\t", index_label="bin")

    # write footer and output benchmark information.
    E.stop()
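
A minimal driver, in the pattern CGAT scripts use to expose main() as a
console entry point (the command in the comment is illustrative, assuming
this snippet comes from a script such as bam2stats.py):

# run e.g.: python bam2stats.py --num-reads 1000000 input.bam > stats.tsv
if __name__ == "__main__":
    sys.exit(main(sys.argv))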
Example #2
# imports assumed from the CGAT tool this snippet is excerpted from
import sys

import pysam

import cgat.GTF as GTF
import cgatcore.experiment as E
import cgatcore.iotools as iotools


def main(argv=None):
    """Script main.

    Parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-e",
                        "--exons-file",
                        "--gtf-file",
                        dest="filename_exons",
                        type=str,
                        metavar="gtf",
                        help="gtf formatted file with non-overlapping exon "
                        "locations (required). ")

    parser.set_defaults(
        filename_exons=None,
        read_length=200,
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    if args.filename_exons is None:
        raise ValueError("an exon file is required; supply one via "
                         "--exons-file/--gtf-file")

    exons = GTF.readAndIndex(
        GTF.iterator(iotools.open_file(args.filename_exons)))

    pysam_in = pysam.AlignmentFile("-", "rb")

    nspliced = 0
    nspliced_ignored = 0
    nspliced_nooverlap = 0
    nspliced_halfoverlap = 0
    nspliced_bothoverlap = 0
    nspliced_overrun = [0] * 2 * (args.read_length + 10)
    nspliced_exact = 0
    nspliced_inexact = 0
    nunspliced = 0
    nunspliced_overlap = 0
    nunspliced_ignored = 0
    nunspliced_nooverlap = 0
    nunspliced_overrun = [0] * (args.read_length + 10)
    overrun_offset = args.read_length + 10
    ninput = 0
    nunmapped = 0

    c = E.Counter()

    def _splice_overrun(start, end, overlap):
        '''Return splice-site over/underrun.

        positive values: overrun
        negative values: underrun
        0: no over/underrun
        '''

        exon_start = min([x[0] for x in overlap])
        exon_end = max([x[1] for x in overlap])

        if start <= exon_start and end > exon_start:
            # overrun at start or match
            r = exon_start - start
        elif start < exon_end and end >= exon_end:
            # overrun at end or match
            r = end - exon_end
        else:
            # underrun - distance to closest exon boundary
            r = -min(start - exon_start, exon_end - end)

        return r

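    # classify each alignment as spliced (cigar contains a reference
    # skip) or unspliced and measure over/underrun against the exons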
    for read in pysam_in:
        ninput += 1
        if read.is_unmapped:
            nunmapped += 1
            continue

        # check for BAM_CREF_SKIP (operation code 3, "N") in the cigar string
        cigar = read.cigartuples
        is_spliced = 3 in [x[0] for x in cigar]

        contig = pysam_in.get_reference_name(read.reference_id)
        start = read.reference_start
        end = read.reference_end
        if is_spliced:
            # count both ends
            nspliced += 1

            if len(cigar) != 3:
                nspliced_ignored += 1
                continue

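            # the 5' and 3' anchor segments flanking the intron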
            start5, end5 = start, start + cigar[0][1]
            start3, end3 = end - cigar[2][1], end
            try:
                overlap3 = list(exons.get(contig, start3, end3))
                overlap5 = list(exons.get(contig, start5, end5))
            except KeyError:
                overlap3 = overlap5 = []

            ovl3 = len(overlap3)
            ovl5 = len(overlap5)
            o3 = o5 = None
            if not ovl3 and not ovl5:
                nspliced_nooverlap += 1
            elif ovl3 and not ovl5:
                nspliced_halfoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
            elif ovl5 and not ovl3:
                nspliced_halfoverlap += 1
                o5 = _splice_overrun(start5, end5, overlap5)
            else:
                # both overlap
                nspliced_bothoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
                o5 = _splice_overrun(start5, end5, overlap5)

            if o3 is not None:
                if o3 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o3)] += 1
            if o5 is not None:
                if o5 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o5)] += 1
        else:
            nunspliced += 1
            try:
                overlap = list(exons.get(contig, start, end))
            except KeyError:
                overlap = []

            if len(overlap) == 0:
                nunspliced_nooverlap += 1
            else:
                nunspliced_overlap += 1
                # one or more overlapping exons - merge them into a single
                # span (multiple hits usually indicate small introns)
                exon_start = min([x[0] for x in overlap])
                exon_end = max([x[1] for x in overlap])
                ostart = max(0, exon_start - start)
                oend = max(0, end - exon_end)
                o = min(end, exon_end) - max(start, exon_start)
                overrun = ostart + oend
                nunspliced_overrun[overrun] += 1

    # output histograms
    outfile = E.open_output_file("overrun")
    outfile.write(
        "bases\tunspliced_overrun_counts\tspliced_overrun_counts\tspliced_underrun_counts\n"
    )
    _nspliced_overrun = nspliced_overrun[overrun_offset:]
    _nspliced_underrun = nspliced_overrun[:overrun_offset + 1]
    _nspliced_underrun.reverse()
    for x, v in enumerate(
            zip(nunspliced_overrun, _nspliced_overrun, _nspliced_underrun)):
        outfile.write("%i\t%s\n" % (x, "\t".join(map(str, v))))
    outfile.close()

    # output summary
    # convert to counter
    c.input = ninput
    c.unmapped = nunmapped
    c.mapped = ninput - nunmapped

    c.unspliced = nunspliced
    c.unspliced_nooverlap = nunspliced_nooverlap
    c.unspliced_nooverrun = nunspliced_overrun[0]
    c.unspliced_overlap = nunspliced_overlap
    c.unspliced_overrun = sum(nunspliced_overrun[1:])

    c.spliced = nspliced
    c.spliced_nooverlap = nspliced_nooverlap
    c.spliced_halfoverlap = nspliced_halfoverlap
    c.spliced_bothoverlap = nspliced_bothoverlap
    c.spliced_exact = nspliced_exact
    c.spliced_inexact = nspliced_inexact
    c.spliced_ignored = nspliced_ignored
    c.spliced_underrun = sum(_nspliced_underrun[1:])
    c.spliced_overrun = sum(_nspliced_overrun[1:])

    outfile = args.stdout
    outfile.write("category\tcounts\n")
    for k, v in sorted(c.items()):
        outfile.write("%s\t%i\n" % (k, v))

    # write footer and output benchmark information.
    E.stop()
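
A minimal driver sketch; note this script hard-codes BAM input on stdin via
pysam.AlignmentFile("-", "rb"), so a BAM stream must be piped in (file names
in the comment are illustrative):

# run e.g.: samtools view -b input.bam | python script.py --gtf-file exons.gtf.gz
if __name__ == "__main__":
    sys.exit(main(sys.argv))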