Example #1
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--delimiter",
                      dest="delimiter",
                      type="string",
                      help="delimiter to separate columns [%default]")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=["row-describe", "column-describe"],
                      help="additional methods to apply [%default]")

    parser.set_defaults(
        delimiter="\t",
        methods=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if not options.methods:
        options.methods = ["summary"]

    table = pandas.read_csv(options.stdin, sep=options.delimiter)

    options.stdout.write("metric\tcount\tpercent\tinfo\n")

    for method in options.methods:
        label = re.sub("-", "_", method)
        if method == "summary":
            for category, count, denominator, info in compute_table_summary(
                    table):
                options.stdout.write("\t".join(
                    map(str, (category, count,
                              iotools.pretty_percent(count, denominator,
                                                     na=""), info))) + "\n")
        elif method == "column-describe":
            df = table.describe().T.stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")
        elif method == "row-describe":
            df = table.T.describe().stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")

    E.stop()
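All of the examples on this page write their auxiliary tables through E.open_output_file(section). A minimal sketch of what that call is assumed to do (following the cgat-core convention of expanding --output-filename-pattern, "%s" by default, with the section name; the helper name and pattern below are illustrative, not part of the scripts):

import gzip


def open_output_file_sketch(section, pattern="%s.tsv.gz", mode="w"):
    # expand the output pattern with the section name and open the result,
    # transparently gzip-compressed when the filename ends in ".gz"
    filename = pattern % section
    if filename.endswith(".gz"):
        return gzip.open(filename, mode + "t")
    return open(filename, mode)


# e.g. open_output_file_sketch("column_describe") would create
# column_describe.tsv.gz, matching the label derived from the method name above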
Example #2
def printValues(contig, max_size, window_size, values, options):
    """output values."""

    outfile = E.open_output_file(contig, "w")

    outfile.write("abs_pos\trel_pos")

    for feature in options.features:
        outfile.write("\tabs_%s\trel_%s" % (feature, feature))
    outfile.write("\n")

    max_vv = []

    for f in range(len(options.features)):
        max_vv.append(float(max([x[f] for x in values])))

    bin = 0
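    # one row per window: absolute position, position relative to max_size,
    # then for each feature the raw value and the value scaled by that
    # feature's maximum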
    for vv in values:
        outfile.write("%i\t" % bin)
        outfile.write(options.value_format % (float(bin) / max_size))

        for x in range(len(options.features)):
            outfile.write("\t%i\t%s" % (vv[x], options.value_format %
                                        (vv[x] / max_vv[x])))
        outfile.write("\n")
        bin += window_size

    outfile.close()
Example #3
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and 
    an iterator over gtf annotations.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("B").itemsize))
    # AString.AString( "a").itemsize ))

    for contig, size in list(contig_sizes.items()):
        E.debug("allocating %s: %i bases" % (contig, size))
        # annotations[contig] = AString.AString( default_code * size )
        # annotations[contig] = array.array("", default_code * size)
        # Go to list for py3 compatibility, patch
        annotations[contig] = [default_code] * size

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.open_output_file("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")
    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        except KeyError as msg:
            E.warn("contig %s not found - annotation ignored" % gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig], intervals, is_positive, code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

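            # exons entirely upstream of the CDS form the 5' UTR and exons at or
            # beyond the CDS end form the 3' UTR (forward-strand coordinates; the
            # two are swapped below for minus-strand transcripts)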
            UTR5 = [x for x in exons if x[1] < start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig], UTR5, is_positive, "u")

            addIntrons(annotations[contig], UTR5, is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig], UTR3, is_positive, "v")

            addIntrons(annotations[contig], UTR3, is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"], is_positive)

            # add introns between CDS
            addIntrons(annotations[contig], cds, is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

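            # emit one junction per intron: the last CDS base of the upstream exon
            # and the first CDS base of the downstream exon, in strand-oriented
            # coordinates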
            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % (
                    contig,
                    out_positive,
                    end,
                    start,
                    c.frame,
                    c.gene_id,
                    c.transcript_id,
                ))
                end = ender(c)

    E.info("finished reading genes: %s" % str(counter))

    outfile_junctions.close()

    E.info("started counting")
    outfile = E.open_output_file("counts")
    outputCounts(outfile, annotations)
    outfile.close()

    E.info("started output")
    for k in sorted(annotations.keys()):
        # options.stdout.write(">%s\n%s\n" % (k, annotations[k].tostring()))
        options.stdout.write(">%s\n%s\n" % (k, "".join(annotations[k])))
Example #4
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-bam", dest="input_bam_file", type="string",
        help="input bam file")

    parser.add_option(
        "-f", "--reference-bam", dest="reference_bam_file", type="string",
        help="reference BAM file [%default]")

    parser.add_option(
        "-q", "--query-name-regex", dest="query_name_regex", type="string",
        help="regular expression to apply on query name. "
        "Potentially required to match samtools sort order and should "
        "evaluate to an integer [%default]")

    parser.set_defaults(
        input_bam_file=None,
        reference_bam_file=None,
        query_name_regex=None,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 2:
        options.input_bam_file = args[0]
        options.reference_bam_file = args[1]

    if options.input_bam_file is None:
        raise ValueError("please supply a BAM file as input")

    if options.reference_bam_file is None:
        raise ValueError("please supply a BAM file as reference")

    # update paths to absolute
    options.input_bam_file = os.path.abspath(options.input_bam_file)
    options.reference_bam_file = os.path.abspath(options.reference_bam_file)

    if not os.path.exists(options.input_bam_file):
        raise OSError("input bam file {} does not exist".format(
            options.input_bam_file))

    if not os.path.exists(options.reference_bam_file):
        raise OSError("reference bam file {} does not exist".format(
            options.reference_bam_file))

    bam_in = pysam.AlignmentFile(options.input_bam_file)
    ref_in = pysam.AlignmentFile(options.reference_bam_file)

    outf_mapped = E.open_output_file("mapped")
    outf_mapped.write("\t".join(
        ["read",
         "length",
         "status",
         "overlap",
         "comp_contig",
         "comp_start",
         "comp_end",
         "ref_contig",
         "ref_start",
         "ref_end",
         "shared_misaligned",
         "shared_aligned",
         "shared_insertion",
         "shared_deletion",
         "comp_aligned",
         "comp_insertion",
         "comp_deletion",
         "ref_aligned",
         "ref_insertion",
         "ref_deletion"]) + "\n")

    outf_missing = E.open_output_file("missing")
    outf_missing.write("\t".join(
        ["read", "length", "status", "aligned",
         "insertion", "deletion"]) + "\n")

    counter = E.Counter()

    if options.query_name_regex:
        rx = re.compile(options.query_name_regex)

    def extract_query(x):
        return int(rx.search(x).groups()[0])

    qname_fn = None
    if options.query_name_regex:
        qname_fn = extract_query

    for reads_cmp, read_ref in group_pairs(iterate_read_pairs(
            bam_in.fetch(until_eof=True),
            ref_in.fetch(until_eof=True),
            qname_fn=qname_fn)):

        if len(reads_cmp) == 0:
            counter.missing += 1
            pairs_ref = set(read_ref.get_aligned_pairs())
            outf_missing.write("\t".join(
                map(str, (
                    read_ref.query_name,
                    read_ref.query_length,
                    "missing") +
                    count_pairs(pairs_ref))) + "\n")
            continue

        if len(reads_cmp) > 1:
            # multiple matches
            counter.multi_mapping += 1
            prefix = "multi_"
        else:
            counter.unique_mapping += 1
            prefix = "unique_"

        is_mapped = False
        for read_cmp in reads_cmp:

            counter.paired += 1

            if read_cmp.is_unmapped:
                counter.unmapped += 1
                pairs_ref = set(read_ref.get_aligned_pairs())
                outf_missing.write("\t".join(
                    map(str, (
                        read_ref.query_name,
                        read_ref.query_length,
                        "unmapped") +
                        count_pairs(pairs_ref))) + "\n")
                continue

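            # overlap: number of reference bases shared by the comparison and
            # reference alignments (0 when they do not intersect)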
            overlap = max(0, (min(read_cmp.reference_end,
                                  read_ref.reference_end) -
                              max(read_cmp.reference_start,
                                  read_ref.reference_start)))

            pairs_cmp = set(read_cmp.get_aligned_pairs())
            pairs_ref = set(read_ref.get_aligned_pairs())
            shared_cmp = pairs_cmp.intersection(pairs_ref)
            unique_cmp = pairs_cmp.difference(pairs_ref)
            misaligned = len([x for x, y in unique_cmp
                              if x is not None and y is not None])

            if read_cmp.reference_name != read_ref.reference_name or \
               overlap == 0:
                status = "mismapped"
            else:
                counter.overlap += 1
                status = "mapped"
                is_mapped = True

            outf_mapped.write("\t".join(
                map(str, (read_cmp.query_name,
                          read_cmp.query_length,
                          prefix + status,
                          overlap,
                          read_cmp.reference_name,
                          read_cmp.reference_start,
                          read_cmp.reference_end,
                          read_ref.reference_name,
                          read_ref.reference_start,
                          read_ref.reference_end,
                          misaligned) +
                    count_pairs(shared_cmp) +
                    count_pairs(pairs_cmp) +
                    count_pairs(pairs_ref))) + "\n")
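        # for/else: the loop never breaks, so this branch runs once after all
        # comparison reads have been processed and tallies the final status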
        else:
            if is_mapped:
                status = "mapped"
            else:
                status = "mismapped"

            counter[prefix + status] += 1

    with E.open_output_file("summary") as outf:
        outf.write("category\tcounts\n")
        outf.write(counter.asTable() + "\n")

    E.stop()
Example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--mask-bed-file",
        "--mask-gff-file",
        dest="filename_bed",
        type="string",
        metavar='GFF',
        help="gff formatted file with masking locations. The number of "
        "reads overlapping the intervals in the given file will be "
        "computed. Note that the computation currently does not take "
        "into account indels, so it is an approximate count only. "
        "[%default]")

    parser.add_option(
        "-f",
        "--ignore-masked-reads",
        dest="ignore_masked_reads",
        action="store_true",
        help="as well as counting reads in the file given by --mask-bed-file, "
        "also remove these reads for duplicate and match statistics. "
        "[%default]")

    parser.add_option(
        "-i",
        "--num-reads",
        dest="input_reads",
        type="int",
        help="the number of reads - if given, used to provide percentages "
        "[%default]")

    parser.add_option(
        "-d",
        "--output-details",
        dest="output_details",
        action="store_true",
        help="output per-read details into a separate file. Read names are "
        "md5/base64 encoded [%default]")

    parser.add_option("--output-readmap",
                      dest="output_readmap",
                      action="store_true",
                      help="output map between read name and "
                      "md5/base64 encoded short name[%default]")

    parser.add_option(
        "--add-alignment-details",
        dest="add_alignment_details",
        action="store_true",
        help="add alignment details to per-read details. Implies "
        "--output-details [%default]")

    parser.add_option(
        "-q",
        "--fastq-file",
        dest="filename_fastq",
        help="filename with sequences and quality scores. This file is only "
        "used to collect sequence identifiers. Thus, for paired end data a "
        "single file is sufficient [%default]")

    parser.add_option(
        "--basic-counts",
        dest="detailed_count",
        action="store_false",
        help="perform basic counting and do not compute per read stats. "
        "This is more memory efficient and faster stats computation, "
        "but only a summary counts table is output [%default]")

    parser.set_defaults(
        filename_bed=None,
        ignore_masked_reads=False,
        input_reads=0,
        force_output=False,
        filename_fastq=None,
        detailed_count=True,
        output_details=False,
        output_readmap=False,
        add_alignment_details=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.filename_bed:
        bed_mask = GTF.readAndIndex(
            GTF.iterator(iotools.open_file(options.filename_bed)))
    else:
        bed_mask = None

    if options.add_alignment_details:
        options.output_details = True

    is_stdin = True
    if len(args) > 0:
        pysam_in = pysam.AlignmentFile(args[0], "rb")
        if args[0] != "-":
            is_stdin = False
    elif options.stdin == sys.stdin:
        pysam_in = pysam.AlignmentFile("-", "rb")
    else:
        pysam_in = pysam.AlignmentFile(options.stdin, "rb")
        if options.stdin != "-":
            is_stdin = False

    if options.output_details:
        outfile_details = E.open_output_file("details", "w")
    else:
        outfile_details = None

    if options.output_readmap:
        outfile_readmap = E.open_output_file("readmap", "w")
    else:
        outfile_readmap = None

    if options.filename_fastq and not os.path.exists(options.filename_fastq):
        raise IOError("file %s does not exist" % options.filename_fastq)

    (counter, flags_counts, nh_filtered, nh_all,
     nm_filtered, nm_all, mapq, mapq_all, max_hi, details_df) = \
        bam2stats_count(pysam_in,
                        bed_mask=bed_mask,
                        ignore_masked_reads=options.ignore_masked_reads,
                        is_stdin=is_stdin,
                        filename_fastq=options.filename_fastq,
                        outfile_details=outfile_details,
                        add_alignment_details=options.add_alignment_details,
                        outfile_readmap=outfile_readmap,
                        detailed_count=options.detailed_count)

    if max_hi > 0 and max_hi != max(nh_all.keys()):
        E.warn("max_hi(%i) is inconsistent with max_nh (%i) "
               "- counts will be corrected" % (max_hi, max(nh_all.keys())))

    outs = options.stdout
    outs.write("category\tcounts\tpercent\tof\n")

    def _write(outs, text, numerator, denominator, base):
        percent = iotools.pretty_percent(numerator, denominator)
        outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))

    ###############################
    ###############################
    ###############################
    # Output alignment information
    ###############################
    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.alignments_input - nalignments_unmapped

    _write(outs, "alignments_total", counter.alignments_input,
           counter.alignments_input, "alignments_total")

    if counter.alignments_input == 0:
        E.warn("no alignments in BAM file - no further output")
        E.stop()
        return

    _write(outs, "alignments_mapped", nalignments_mapped,
           counter.alignments_input, 'alignments_total')
    _write(outs, "alignments_unmapped", nalignments_unmapped,
           counter.alignments_input, 'alignments_total')

    if nalignments_mapped == 0:
        E.warn("no mapped alignments - no further output")
        E.stop()
        return

    for flag, counts in sorted(flags_counts.items()):
        if flag == "unmapped":
            continue
        _write(outs, 'alignments_' + flag, counts, nalignments_mapped,
               'alignments_mapped')

    if options.filename_bed:
        _write(outs, "alignments_masked", counter.alignments_masked,
               nalignments_mapped, 'alignments_mapped')
        _write(outs, "alignments_notmasked", counter.alignments_notmasked,
               nalignments_mapped, 'alignments_mapped')

    _write(outs, "alignments_filtered", counter.alignments_filtered,
           nalignments_mapped, "alignments_mapped")

    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        _write(outs, "alignments_duplicates", counter.alignments_duplicates,
               counter.alignments_filtered, normby)
        _write(outs, "alignments_unique",
               counter.alignments_filtered - counter.alignments_duplicates,
               counter.alignments_filtered, normby)

    ###############################
    ###############################
    ###############################
    # Output read based information
    ###############################

    # derive the number of mapped reads in file from alignment counts
    if options.filename_fastq or not is_stdin:
        nreads_total = counter.total_read
        _write(outs, "reads_total", counter.total_read, nreads_total,
               'reads_total')
        _write(outs, "reads_unmapped", counter.total_read_is_unmapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped", counter.total_read_is_mapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_missing", counter.total_read_is_missing,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped_unique", counter.total_read_is_mapped_uniq,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_multimapping", counter.total_read_is_mmap,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_mapped_supplementary",
               counter.total_read_has_supplementary,
               counter.total_read_is_mapped, 'reads_mapped')
    else:
        E.warn('inferring read counts from alignments and NH tags')
        nreads_unmapped = flags_counts["unmapped"]
        nreads_mapped = computeMappedReadsFromAlignments(
            nalignments_mapped, nh_all, max_hi)

        nreads_missing = 0
        if options.input_reads:
            nreads_total = options.input_reads
            # unmapped reads in bam file?
            if nreads_unmapped:
                nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
            else:
                nreads_unmapped = nreads_total - nreads_mapped

        elif nreads_unmapped:
            # if unmapped reads are in bam file, take those
            nreads_total = nreads_mapped + nreads_unmapped
        else:
            # otherwise normalize by mapped reads
            nreads_unmapped = 0
            nreads_total = nreads_mapped

        outs.write("reads_total\t%i\t%5.2f\treads_total\n" %
                   (nreads_total, 100.0))
        outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_mapped, 100.0 * nreads_mapped / nreads_total))
        outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total))
        outs.write("reads_missing\t%i\t%5.2f\treads_total\n" %
                   (nreads_missing, 100.0 * nreads_missing / nreads_total))

        if len(nh_all) > 1:
            outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" %
                       (nh_all[1], 100.0 * nh_all[1] / nreads_mapped))

    pysam_in.close()

    ###############################
    ###############################
    ###############################
    # Output pair information
    ###############################
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            pairs_mapped = counter.total_pair_is_mapped

            # sanity check
            assert counter.total_pair_is_mapped == \
                (counter.total_pair_is_proper_uniq +
                 counter.total_pair_is_incomplete_uniq +
                 counter.total_pair_is_incomplete_mmap +
                 counter.total_pair_is_proper_duplicate +
                 counter.total_pair_is_proper_mmap +
                 counter.total_pair_not_proper_uniq +
                 counter.total_pair_is_other)

            outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pairs,
                        100.0 * counter.total_pairs / counter.total_pairs))
            outs.write(
                "pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                (pairs_mapped, 100.0 * pairs_mapped / counter.total_pairs))
            outs.write("pairs_unmapped\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_unmapped, 100.0 *
                        counter.total_pair_is_unmapped / counter.total_pairs))
            outs.write(
                "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_uniq, 100.0 *
                 counter.total_pair_is_proper_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_uniq, 100.0 *
                 counter.total_pair_is_incomplete_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_mmap, 100.0 *
                 counter.total_pair_is_incomplete_mmap / counter.total_pairs))
            outs.write(
                "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_duplicate, 100.0 *
                 counter.total_pair_is_proper_duplicate / counter.total_pairs))
            outs.write(
                "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_mmap, 100.0 *
                 counter.total_pair_is_proper_mmap / counter.total_pairs))
            outs.write(
                "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_not_proper_uniq, 100.0 *
                 counter.total_pair_not_proper_uniq / counter.total_pairs))
            outs.write("pairs_other\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_other, 100.0 *
                        counter.total_pair_is_other / counter.total_pairs))

            nread1_total = counter.total_read1
            _write(outs, "read1_total", counter.total_read1, nread1_total,
                   'read1_total')
            _write(outs, "read1_unmapped", counter.total_read1_is_unmapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped", counter.total_read1_is_mapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped_unique",
                   counter.total_read1_is_mapped_uniq,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "reads_multimapping", counter.total_read1_is_mmap,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "read1_missing", counter.total_read1_is_missing,
                   counter.total_read1_is_mapped, 'read1_total')

            nread2_total = counter.total_read2
            _write(outs, "read2_total", counter.total_read2, nread2_total,
                   'read2_total')
            _write(outs, "read2_unmapped", counter.total_read2_is_unmapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped", counter.total_read2_is_mapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped_unique",
                   counter.total_read2_is_mapped_uniq,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "reads_multimapping", counter.total_read2_is_mmap,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "read2_missing", counter.total_read2_is_missing,
                   counter.total_read2_is_mapped, 'read2_total')

        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            _write(outs, "pairs_total", pairs_total, pairs_total,
                   "pairs_total")
            _write(outs, "pairs_mapped", pairs_mapped, pairs_total,
                   "pairs_total")
    else:
        # no paired end data
        pairs_total = pairs_mapped = 0
        outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_total, 0.0))
        outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_mapped, 0.0))

    outs.write("error_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.error_counts, counter.error_rate * 100.0))
    outs.write("insertion_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.insertion_counts, counter.insertion_rate * 100.0))
    outs.write("deletion_rate\t%i\t%5.2f\tmatches+deletions\n" %
               (counter.deletion_counts, counter.deletion_rate * 100.0))
    outs.write("mismatch_rate\t%i\t%5.2f\tmatches\n" %
               (counter.mismatch_counts, counter.mismatch_rate * 100.0))
    outs.write("match_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.match_counts, counter.match_rate * 100.0))

    if options.force_output or len(nm_filtered) > 0:
        outfile = E.open_output_file("nm", "w")
        outfile.write("NM\talignments\n")
        if len(nm_filtered) > 0:
            for x in range(0, max(nm_filtered.keys()) + 1):
                outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
        else:
            outfile.write("0\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(nh_all) > 1:
        outfile = E.open_output_file("nh_all", "w")
        outfile.write("NH\treads\n")
        if len(nh_all) > 0:
            writeNH(outfile, nh_all, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.mapped_reads))
        outfile.close()

    if options.force_output or len(nh_filtered) > 1:
        outfile = E.open_output_file("nh", "w")
        outfile.write("NH\treads\n")
        if len(nh_filtered) > 0:
            writeNH(outfile, nh_filtered, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(mapq_all) > 1:
        outfile = E.open_output_file("mapq", "w")
        outfile.write("mapq\tall_reads\tfiltered_reads\n")
        for x in range(0, max(mapq_all.keys()) + 1):
            outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))
        outfile.close()

    if details_df is not None:
        with E.open_output_file("summaries", "w") as outf:
            details_df.describe().transpose().to_csv(outf,
                                                     sep="\t",
                                                     index_label="metric")
        bins = numpy.arange(0, 1.01, 0.01)
        histogram_df = pandas.DataFrame(
            {x: numpy.histogram(details_df[x].dropna(), bins=bins)[0]
             for x in details_df.columns})

        histogram_df.index = numpy.arange(0, 1.0, 0.01)

        row_sums = histogram_df.sum(axis=1)
        histogram_df = histogram_df[row_sums != 0]

        with E.open_output_file("histogram", "w") as outf:
            histogram_df.to_csv(outf, sep="\t", index_label="bin")

    # write footer and output benchmark information.
    E.stop()
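The per-read histogram block above relies on numpy.histogram returning one count per pair of adjacent bin edges and on the index holding the left edge of each bin. A minimal, self-contained sketch of the same binning on a stand-in metric column (the names values and sketch_df are illustrative, not from the script):

import numpy
import pandas

values = numpy.random.RandomState(0).uniform(size=1000)  # stand-in metric in [0, 1)
bins = numpy.arange(0, 1.01, 0.01)                        # 101 edges -> 100 bins
counts, _ = numpy.histogram(values, bins=bins)
sketch_df = pandas.DataFrame({"metric": counts},
                             index=numpy.arange(0, 1.0, 0.01))  # left bin edges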
Example #6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="string",
                      help="bin size.")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum value for histogram.")

    parser.add_option("--no-empty-bins",
                      dest="no_empty_bins",
                      action="store_true",
                      help="do not display empty bins.")

    parser.add_option("--with-empty-bins",
                      dest="no_empty_bins",
                      action="store_false",
                      help="display empty bins.")

    parser.add_option(
        "--ignore-out-of-range",
        dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating "
        "them to range border.")

    parser.add_option("--missing-value",
                      dest="missing_value",
                      type="string",
                      help="entry for missing values [%default].")

    parser.add_option("--use-dynamic-bins",
                      dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")

    parser.add_option("--format",
                      dest="format",
                      type="choice",
                      choices=("gff", "gtf", "bed"),
                      help="input file format [%default].")

    parser.add_option("--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("all", "hist", "stats", "overlaps", "values"),
                      help="methods to apply [%default].")

    parser.add_option("--output-section",
                      dest="output_section",
                      type="choice",
                      choices=("all", "size", "distance"),
                      help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins=True,
        bin_size=None,
        dynamic_bins=False,
        ignore_out_of_range=False,
        min_value=None,
        max_value=None,
        nonull=None,
        missing_value="na",
        output_filename_pattern="%s",
        methods=[],
        output_section="all",
        format="gff",
    )

    (options, args) = E.start(parser, add_output_options=True)

    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError(
            "please provide counting method using --method option")

    if options.format in ("gff", "gtf"):
        gffs = GTF.iterator(options.stdin)
    elif options.format == "bed":
        gffs = Bed.iterator(options.stdin)

    values_between = []
    values_within = []
    values_overlaps = []

    if "overlaps" in options.methods:
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"
        outfile_overlaps = E.open_output_file("overlaps")
    else:
        outfile_overlaps = None

    last = None
    ninput, noverlaps = 0, 0
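    # walk over the intervals in file order (meaningful only if the input is
    # position-sorted within each contig), collecting interval sizes, distances
    # between consecutive non-overlapping intervals and overlap sizes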
    for this in gffs:
        ninput += 1
        values_within.append(this.end - this.start)

        if last and last.contig == this.contig:
            if this.start < last.end:
                noverlaps += 1
                if outfile_overlaps:
                    outfile_overlaps.write("%s\t%s\n" % (str(last), str(this)))
                values_overlaps.append(
                    min(this.end, last.end) - max(last.start, this.start))
                if this.end > last.end:
                    last = this
                continue
            else:
                values_between.append(this.start - last.end)
                # if this.start - last.end < 10:
                #     print str(last)
                #     print str(this)
                #     print "=="
                values_overlaps.append(0)

        last = this

    if "hist" in options.methods:
        outfile = E.open_output_file("hist")
        h_within = Histogram.Calculate(
            values_within,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        h_between = Histogram.Calculate(
            values_between,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        if "all" == options.output_section:
            outfile.write("residues\tsize\tdistance\n")
            combined_histogram = Histogram.Combine(
                [h_within, h_between], missing_value=options.missing_value)
            Histogram.Write(outfile, combined_histogram, nonull=options.nonull)
        elif options.output_section == "size":
            outfile.write("residues\tsize\n")
            Histogram.Write(outfile, h_within, nonull=options.nonull)
        elif options.output_section == "distance":
            outfile.write("residues\tdistance\n")
            Histogram.Write(outfile, h_between, nonull=options.nonull)

        outfile.close()

    if "stats" in options.methods:
        outfile = E.open_output_file("stats")
        outfile.write("data\t%s\n" % Stats.Summary().getHeader())
        if options.output_section in ("size", "all"):
            outfile.write("size\t%s\n" % str(Stats.Summary(values_within)))
        if options.output_section in ("distance", "all"):
            outfile.write("distance\t%s\n" %
                          str(Stats.Summary(values_between)))
        outfile.close()

    if "values" in options.methods:
        outfile = E.open_output_file("distances")
        outfile.write("distance\n%s\n" % "\n".join(map(str, values_between)))
        outfile.close()
        outfile = E.open_output_file("sizes")
        outfile.write("size\n%s\n" % "\n".join(map(str, values_within)))
        outfile.close()
        outfile = E.open_output_file("overlaps")
        outfile.write("overlap\n%s\n" % "\n".join(map(str, values_overlaps)))
        outfile.close()

    E.info("ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" %
           (ninput, len(values_between), len(values_within), noverlaps))

    E.stop()
Example #7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-e",
                        "--exons-file",
                        "--gtf-file",
                        dest="filename_exons",
                        type=str,
                        metavar="gtf",
                        help="gtf formatted file with non-overlapping exon "
                        "locations (required). ")

    parser.set_defaults(
        filename_exons=None,
        read_length=200,
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    exons = GTF.readAndIndex(
        GTF.iterator(iotools.open_file(args.filename_exons)))

    pysam_in = pysam.AlignmentFile("-", "rb")

    nspliced = 0
    nspliced_ignored = 0
    nspliced_nooverlap = 0
    nspliced_halfoverlap = 0
    nspliced_bothoverlap = 0
    nspliced_overrun = [0] * 2 * (args.read_length + 10)
    nspliced_exact = 0
    nspliced_inexact = 0
    nunspliced = 0
    nunspliced_overlap = 0
    nunspliced_ignored = 0
    nunspliced_nooverlap = 0
    nunspliced_overrun = [0] * (args.read_length + 10)
    overrun_offset = args.read_length + 10
    ninput = 0
    nunmapped = 0

    c = E.Counter()

    def _splice_overrun(start, end, overlap):
        '''return splice-site over/underrun.

        positive values: overrun
        negative values: underrun
        0: no over/underrun
        '''

        exon_start = min([x[0] for x in overlap])
        exon_end = max([x[1] for x in overlap])

        if start <= exon_start and end > exon_start:
            # overrun at start or match
            r = exon_start - start
        elif start < exon_end and end >= exon_end:
            # overrun at end or match
            r = end - exon_end
        else:
            # underrun - distance to closest exon boundary
            r = -min(start - exon_start, exon_end - end)

        return r

    for read in pysam_in:
        ninput += 1
        if read.is_unmapped:
            nunmapped += 1
            continue

        # check for BAM_CREF_SKIP code in cigar string
        cigar = read.cigar
        is_spliced = 3 in [x[0] for x in cigar]

        contig = pysam_in.getrname(read.tid)
        start = read.pos
        end = read.aend
        if is_spliced:
            # count both ends
            nspliced += 1

            if len(cigar) != 3:
                nspliced_ignored += 1
                continue

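            # with exactly one splice the cigar is [match, N-skip, match]: the
            # first block is the 5' part of the read, the last block the 3' part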
            start5, end5 = start, start + cigar[0][1]
            start3, end3 = end - cigar[2][1], end
            try:
                overlap3 = list(exons.get(contig, start3, end3))
                overlap5 = list(exons.get(contig, start5, end5))
            except KeyError:
                overlap3 = overlap5 = []

            ovl3 = len(overlap3)
            ovl5 = len(overlap5)
            o3 = o5 = None
            if not ovl3 and not ovl5:
                nspliced_nooverlap += 1
            elif ovl3 and not ovl5:
                nspliced_halfoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
            elif ovl5 and not ovl3:
                nspliced_halfoverlap += 1
                o5 = _splice_overrun(start5, end5, overlap5)
            else:
                # both overlap
                nspliced_bothoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
                o5 = _splice_overrun(start5, end5, overlap5)

            if o3 is not None:
                if o3 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o3)] += 1
            if o5 is not None:
                if o5 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o5)] += 1
        else:
            nunspliced += 1
            try:
                overlap = list(exons.get(contig, start, end))
            except KeyError:
                overlap = []

            if len(overlap) == 0:
                nunspliced_nooverlap += 1
            elif len(overlap) >= 1:
                nunspliced_overlap += 1
                # multiple overlap - merge exons (usually: small introns)
                exon_start = min([x[0] for x in overlap])
                exon_end = max([x[1] for x in overlap])
                ostart = max(0, exon_start - start)
                oend = max(0, end - exon_end)
                o = min(end, exon_end) - max(start, exon_start)
                overrun = ostart + oend
                nunspliced_overrun[overrun] += 1

    # output histograms
    outfile = E.open_output_file("overrun")
    outfile.write(
        "bases\tunspliced_overrun_counts\tspliced_overrun_counts\tspliced_underrun_counts\n"
    )
    _nspliced_overrun = nspliced_overrun[overrun_offset:]
    _nspliced_underrun = nspliced_overrun[:overrun_offset + 1]
    _nspliced_underrun.reverse()
    for x, v in enumerate(
            zip(nunspliced_overrun, _nspliced_overrun, _nspliced_underrun)):
        outfile.write("%i\t%s\n" % (x, "\t".join(map(str, v))))
    outfile.close()

    # output summary
    # convert to counter
    c.input = ninput
    c.unmapped = nunmapped
    c.mapped = ninput - nunmapped

    c.unspliced = nunspliced
    c.unspliced_nooverlap = nunspliced_nooverlap
    c.unspliced_nooverrun = nunspliced_overrun[0]
    c.unspliced_overlap = nunspliced_overlap
    c.unspliced_overrun = sum(nunspliced_overrun[1:])

    c.spliced = nspliced
    c.spliced_nooverlap = nspliced_nooverlap
    c.spliced_halfoverlap = nspliced_halfoverlap
    c.spliced_bothoverlap = nspliced_bothoverlap
    c.spliced_exact = nspliced_exact
    c.spliced_inexact = nspliced_inexact
    c.spliced_ignored = nspliced_ignored
    c.spliced_underrun = sum(_nspliced_underrun[1:])
    c.spliced_overrun = sum(_nspliced_overrun[1:])

    outfile = args.stdout
    outfile.write("category\tcounts\n")
    for k, v in sorted(c.items()):
        outfile.write("%s\t%i\n" % (k, v))

    # write footer and output benchmark information.
    E.stop()
Example #8
def writeMatricesForSortOrder(features_per_interval, bins, foreground_track,
                              control_tracks, shifted, sort_order):
    '''output one or more matrices for each sort order.

    For each sort order, output the foreground. If there
    are additional controls and a shifted section, output
    these as well.

    The files will be named:
    matrix_<track>_<sortorder>

    '''
    if "name" in features_per_interval[0].interval:
        names = [x.interval.name for x in features_per_interval]
    else:
        names = list(map(str, list(range(1, len(features_per_interval) + 1))))

    bins = ["%i" % x for x in bins]
    sort_order = re.sub("-", "_", sort_order)

    # write foreground
    iotools.write_matrix(E.open_output_file("matrix_%s_%s.gz" %
                                            (foreground_track, sort_order)),
                         [x.foreground.counts for x in features_per_interval],
                         row_headers=names,
                         col_headers=bins,
                         row_header="name")

    # write controls
    for idx, track in enumerate(control_tracks):
        iotools.write_matrix(
            E.open_output_file("matrix_%s_%s.gz" % (track, sort_order)),
            [x.controls[idx].counts for x in features_per_interval],
            row_headers=names,
            col_headers=bins,
            row_header="name")

    # write shifted matrix
    if shifted:
        iotools.write_matrix(E.open_output_file("matrix_shift_%s.gz" %
                                                (sort_order)),
                             [x.shifted.counts for x in features_per_interval],
                             row_headers=names,
                             col_headers=bins,
                             row_header="name")

    # output a combined matrix
    if len(control_tracks) > 0 or shifted:
        rows = []
        for row in features_per_interval:
            combined = [row.foreground.counts]
            combined.extend(
                [row.controls[x].counts for x in range(len(control_tracks))])
            if shifted:
                combined.append(row.shifted.counts)
            rows.append(numpy.concatenate(combined))

        n = 1 + len(control_tracks)
        if shifted:
            n += 1

        # make column names unique and make sure they can be sorted
        # lexicographically
        all_bins = []
        for x in range(n):
            all_bins.extend(["%i:%s" % (x, b) for b in bins])

        iotools.write_matrix(E.open_output_file("matrix_sidebyside_%s.gz" %
                                                (sort_order)),
                             rows,
                             row_headers=names,
                             col_headers=all_bins,
                             row_header="name")
Example #9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-m",
                        "--method",
                        dest="method",
                        type=str,
                        choices=('reconcile', 'filter-by-sequence'),
                        help="method to apply.")

    parser.add_argument(
        "-c",
        "--chop-identifier",
        dest="chop",
        action="store_true",
        help="whether or not to trim last character of the  "
        "sequence name. For example sometimes ids in the first "
        "file in the pair will end with \1 and the second "
        "with \2. If --chop-identifier is not specified "
        "then the results will be wrong.")

    parser.add_argument("-u",
                        "--unpaired",
                        dest="unpaired",
                        action="store_true",
                        help="whether or not to write out unpaired reads "
                        "to a separate file")

    parser.add_argument("--id-pattern-1",
                        dest="id_pattern_1",
                        help="If specified will use the first group from the"
                        "pattern to determine the ID for the first read")

    parser.add_argument("--id-pattern-2",
                        dest="id_pattern_2",
                        help="As above but for read 2")

    parser.add_argument("--input-filename-fasta",
                        dest="input_filename_fasta",
                        type=str,
                        help="input filename of FASTA formatted sequence "
                        "for method 'filter-by-sequence'.")

    parser.add_argument("--filtering-kmer-size",
                        dest="filtering_kmer_size",
                        type=int,
                        help="kmer size for method 'filter-by-sequence'.")

    parser.add_argument("--filtering-min-kmer-matches",
                        dest="filtering_min_kmer_matches",
                        type=int,
                        help="minimum number of matches 'filter-by-sequence'.")

    parser.set_defaults(method="reconcile",
                        chop=False,
                        unpaired=False,
                        input_filename_fasta=None,
                        filtering_kmer_size=10,
                        filtering_min_kmer_matches=20)

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) != 2:
        raise ValueError(
            "please supply at least two fastq files on the commandline")

    fn1, fn2 = unknown
    counter = E.Counter()

    if args.id_pattern_1:
        id1_getter = PatternGetter(args.id_pattern_1)
    else:
        id1_getter = plain_getter

    if args.id_pattern_2:
        id2_getter = PatternGetter(args.id_pattern_2)
    else:
        id2_getter = plain_getter

    if args.method == "reconcile":

        # IMS: switching to not storing the second set of read names, using it
        # only lazily. Since generators don't have a size, keep track of counts.
        id_lengths = {fn1: 0, fn2: 0}

        def getIds(infile, id_getter=plain_getter):
            '''return ids in infile.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = id_getter(l[0].split()[0])
                # decide whether to chop the read number off
                id_lengths[infile.name] += 1
                if args.chop:
                    yield r[:-1]
                else:
                    yield r

        def write(outfile,
                  infile,
                  take,
                  unpaired_file=None,
                  id_getter=plain_getter):
            '''filter fastq files with ids in take.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = id_getter(l[0].split()[0])
                if args.chop:
                    r = r[:-1]
                if r not in take:
                    if unpaired_file is None:
                        continue
                    else:
                        unpaired_file.write("\n".join(l) + "\n")
                else:
                    outfile.write("\n".join(l) + "\n")

        E.info("reading first in pair")
        inf1 = iotools.open_file(fn1)
        ids1 = set(getIds(inf1, id1_getter))

        E.info("reading second in pair")
        inf2 = iotools.open_file(fn2)
        # IMS: No longer keep as a set, but lazily evaluate into intersection
        # leads to large memory saving for large inf2, particularly if
        # inf1 is small.
        ids2 = getIds(inf2, id2_getter)
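        # set.intersection accepts any iterable, so ids2 is consumed lazily
        # here and never materialised as a second in-memory set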
        take = ids1.intersection(ids2)

        E.info("first pair: %i reads, second pair: %i reads, "
               "shared: %i reads" %
               (id_lengths[fn1], id_lengths[fn2], len(take)))

        if args.unpaired:
            unpaired_filename = E.open_output_file("unpaired.fastq.gz", "w")
        else:
            unpaired_filename = None

        with E.open_output_file("1", "w") as outf:
            inf = iotools.open_file(fn1)
            E.info("writing first in pair")
            write(outf, inf, take, unpaired_filename, id1_getter)

        with E.open_output_file("2", "w") as outf:
            inf = iotools.open_file(fn2)
            E.info("writing second in pair")
            write(outf, inf, take, unpaired_filename, id2_getter)

        counter.output = len(take)

        if args.unpaired:
            unpaired_filename.close()

    elif args.method == "filter-by-sequence":

        with pysam.FastxFile(args.input_filename_fasta) as inf:
            for record in inf:
                query_sequence = record.sequence
                break

        with pysam.FastxFile(fn1, persist=False) as inf1, \
                pysam.FastxFile(fn2, persist=False) as inf2, \
                E.open_output_file("matched.fastq.1.gz", "w") as outf_matched1, \
                E.open_output_file("matched.fastq.2.gz", "w") as outf_matched2, \
                E.open_output_file("unmatched.fastq.1.gz", "w") as outf_unmatched1, \
                E.open_output_file("unmatched.fastq.2.gz", "w") as outf_unmatched2:
            counter = fastqtools.filter_by_sequence(
                query_sequence,
                inf1,
                inf2,
                outf_matched1,
                outf_matched2,
                outf_unmatched1,
                outf_unmatched2,
                kmer_size=args.filtering_kmer_size,
                min_kmer_matches=args.filtering_min_kmer_matches)
        args.stdout.write("\t".join(("input", "matched", "unmatched",
                                     "percent_matched")) + "\n")

        args.stdout.write("\t".join(
            map(str, (counter.input, counter.matched, counter.unmatched,
                      100.0 * counter.matched / counter.input))) + "\n")

    E.info(str(counter))
    E.stop()
Example #10
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--reference-bed-file",
                      dest="reference_bed_file",
                      type="string",
                      help="reference bed file "
                      "[%default]")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("lvc-comparison", ),
                      help="methods to apply [%default]")

    parser.set_defaults(method="lvc-comparison",
                        reference_fasta_file=None,
                        input_bed_file=None,
                        size_bins=(1000, 10000, 100000),
                        output_sets=True,
                        region_string=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    reference_set = collections.defaultdict(quicksect.IntervalTree)

    E.info("reading reference bed file from {}".format(
        options.reference_bed_file))
    with iotools.open_file(options.reference_bed_file) as inf:
        for record in pysam.tabix_iterator(inf, pysam.asBed()):
            mm = reference_set[record.contig]
            mm.add(record.start, record.end)
    E.info("read reference intervals on {} contigs: {}".format(
        len(list(reference_set.keys())), ",".join(list(reference_set.keys()))))

    if options.output_sets:
        output_tp = E.open_output_file("tp")
        output_fp = E.open_output_file("fp")
        output_fn = E.open_output_file("fn")
    else:
        output_tp = None
        output_fp = None
        output_fn = None

    if options.method == "lvc-comparison":
        c = E.Counter()

        found = set()
        counts = {}
        names = set()
        nsize_bins = len(options.size_bins)
        for bin in range(len(options.size_bins) + 1):
            counts[bin] = dict([(x, collections.defaultdict(int))
                                for x in ("tp", "fn", "fp", "test", "truth")])

        for record in pysam.tabix_iterator(options.stdin, pysam.asBed()):
            if record.contig not in reference_set:
                c.ignored_no_contig += 1
                continue

            c.test += 1
            matches = reference_set[record.contig].search(
                record.start, record.end)
            size = record.end - record.start
            bin = get_size_bin(size, options.size_bins)

            if len(matches) == 0:
                c.fp += 1
                status = "fp"
                if output_fp:
                    output_fp.write(str(record) + "\n")
            elif len(matches) >= 1:
                c.tp += 1
                status = "tp"
                if output_tp:
                    output_tp.write(str(record) + "\n")
                # todo: overlap criteria

                # record found
                for match in matches:
                    found.add((record.contig, match.start, match.end))

            name = record.name.split(",")[0]
            names.add(name)
            counts[bin]["test"][name] += 1
            counts[bin][status][name] += 1

        outf = options.stdout

        with iotools.open_file(options.reference_bed_file) as inf:
            for record in pysam.tabix_iterator(inf, pysam.asBed()):
                c.truth += 1
                bin = get_size_bin(record.end - record.start,
                                   options.size_bins)
                counts[bin]["truth"]["all"] += 1

                key = (record.contig, record.start, record.end)
                if key not in found:
                    c.fn += 1
                    counts[bin]["fn"]["all"] += 1

        outf.write("\t".join(("category", "size", "test", "tp", "fp", "truth",
                              "fn")) + "\n")

        for name in sorted(names):
            for bin in range(len(options.size_bins) + 1):
                if bin == len(options.size_bins):
                    size_bin = ">={}".format(options.size_bins[-1])
                else:
                    size_bin = "<{}".format(options.size_bins[bin])
                outf.write("\t".join(
                    map(str, (
                        name,
                        size_bin,
                        counts[bin]["test"][name],
                        counts[bin]["tp"][name],
                        counts[bin]["fp"][name],
                        counts[bin]["truth"]["all"],
                        counts[bin]["fn"]["all"],
                    ))) + "\n")

    E.info(str(c))
    E.stop()
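Example #10 calls a get_size_bin helper that is not included in this
listing. Judging from how the returned index is used when the summary table
is written (an index equal to len(size_bins) labels the interval as at
least as large as the last boundary, smaller indices as below the
corresponding boundary), a minimal sketch could be the following; treat it
as an assumption, not the project's implementation.

import bisect

def get_size_bin(size, size_bins):
    """Hypothetical sketch: return the index of the first boundary in the
    sorted tuple size_bins that size falls below, or len(size_bins) if
    size is >= the largest boundary."""
    return bisect.bisect_right(size_bins, size)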
Example #11
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--input-bed-file",
        dest="input_bed_file",
        type="string",
        help="input file with intervals. Tab-delimited file of intervals "
        "in bed format to restrict analysis to. [%default]")

    parser.add_option(
        "-m",
        "--merge-intervals",
        dest="merge_intervals",
        action="store_true",
        help="merge intervals in bed file. Useful if you have a site bed-file "
        "[%default]")

    parser.add_option("-f",
                      "--reference-fasta-file",
                      dest="reference_fasta_file",
                      help="reference genomic sequence in fasta format. "
                      "[%default]")

    parser.add_option(
        "-c",
        "--barcode-fasta-file",
        dest="barcode_fasta_file",
        help="barcode sequence in fasta format. Variable positions "
        "should be marked by N "
        "[%default]")

    parser.set_defaults(
        reference_fasta_file=None,
        barcode_fasta_file=None,
        merge_intervals=False,
        input_bed_file=None,
        anchor=5,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.stdin != sys.stdin:
        bamfile = options.stdin.name
    elif args:
        if len(args) > 1:
            raise ValueError("multiple bam files provided in arguments")
        bamfile = args[0]
    else:
        bamfile = "-"

    if options.barcode_fasta_file:
        with pysam.FastxFile(options.barcode_fasta_file) as inf:
            barcode_sequence = next(inf).sequence
    else:
        barcode_sequence = None

    if not os.path.exists(options.reference_fasta_file):
        raise OSError("reference fasta file {} does not exist".format(
            options.reference_fasta_file))

    if not os.path.exists(options.input_bed_file):
        raise OSError("input bed file {} does not exist".format(
            options.input_bed_file))

    bed_in = pysam.TabixFile(options.input_bed_file)
    pysam_in = pysam.AlignmentFile(bamfile)
    anchor = options.anchor

    for region_idx, vals in enumerate(
            iterate_bed(bed_in, options.merge_intervals)):

        if region_idx > 0:
            raise NotImplementedError(
                "output for multiple regions not yet implemented")

        contig, region_start, region_end = vals
        upstream_anchors, downstream_anchors = [], []
        counter = E.Counter()

        unaligned_fn = E.get_output_file(
            "unaligned_{}.fasta".format(region_idx))
        with iotools.open_file(unaligned_fn, "w") as outf:
            for read in pysam_in.fetch(contig, region_start, region_end):
                counter.overlapping_reads += 1
                try:
                    pairs = read.get_aligned_pairs(with_seq=True)
                except ValueError:
                    counter.no_md_tag += 1
                    continue

                # map reference positions to read positions and to the
                # reference base reported by get_aligned_pairs (derived from
                # the MD tag; mismatching bases are given in lower case)
                map_ref2read_pos = dict(
                    (x[1], x[0]) for x in pairs if x[0] is not None)
                map_ref2ref_base = dict(
                    (x[1], x[2]) for x in pairs if x[0] is not None)

                upstream_anchor = "".join(
                    map_ref2ref_base.get(x, "")
                    for x in range(region_start - anchor, region_start))

                downstream_anchor = "".join(
                    map_ref2ref_base.get(x, "")
                    for x in range(region_end, region_end + anchor))

                # check if at least one anchor is aligned
                upstream_matches = sum([x.isupper() for x in upstream_anchor])
                downstream_matches = sum(
                    [x.isupper() for x in downstream_anchor])

                if upstream_matches < anchor and downstream_matches < anchor:
                    counter.no_anchor += 1
                    continue
                seq = read.query_alignment_sequence

                # collect full length anchors
                upstream_anchor_start, upstream_anchor_end = region_start - anchor, region_start
                downstream_anchor_start, downstream_anchor_end = region_end, region_end + anchor

                if upstream_anchor_start in map_ref2read_pos and upstream_anchor_end in map_ref2read_pos:
                    upstream_anchors.append(
                        seq[map_ref2read_pos[upstream_anchor_start]:
                            map_ref2read_pos[upstream_anchor_end]])
                if downstream_anchor_start in map_ref2read_pos and downstream_anchor_end in map_ref2read_pos:
                    downstream_anchors.append(
                        seq[map_ref2read_pos[downstream_anchor_start]:
                            map_ref2read_pos[downstream_anchor_end]])

                # get region to align
                read_start = min(
                    (map_ref2read_pos.get(x, len(seq))
                     for x in range(region_start - anchor, region_start)))
                if read_start == len(seq):
                    read_start = 0
                read_end = max(
                    (map_ref2read_pos.get(x, 0) + 1
                     for x in range(region_end, region_end + anchor)))
                if read_end == 1:
                    read_end = len(seq)
                counter.collected_reads += 1
                outf.write(">{}/{}-{}\n{}\n".format(read.query_name,
                                                    read_start, read_end,
                                                    seq[read_start:read_end]))
        counter.downstream_anchors = len(downstream_anchors)
        counter.upstream_anchors = len(upstream_anchors)

        E.info(counter)

        if counter.overlapping_reads == 0:
            E.warn("no sequences overlapping region")
            continue

        if counter.downstream_anchors == 0 or counter.upstream_anchors == 0:
            E.warn("at least one anchor undefined")
            continue

        if counter.collected_reads == 1:
            E.warn("only single sequence, multiple aligment skipped")
            with iotools.open_file(unaligned_fn) as inf:
                stdout = inf.read()
        else:
            # G-INS-i -> global alignment algorithm
            E.info("starting mafft multiple alignment")
            stdout = E.run(
                "mafft --globalpair --maxiterate 100 --quiet --op 2 --ep 0.5 {}"
                .format(unaligned_fn),
                return_stdout=True)

        aligned_fn = E.get_output_file("aligned_{}.fasta".format(region_idx))
        with iotools.open_file(aligned_fn, "w") as outf:
            outf.write(stdout)

        mali = stdout.splitlines()
        identifiers = [mali[x] for x in range(0, len(mali), 2)]
        sequences = [mali[x].upper() for x in range(1, len(mali), 2)]
        consensus = get_consensus(sequences)

        E.info("after alignment: consensus={}".format(consensus))

        # gap filtering -> remove highly gappy columns
        consensus = get_consensus(sequences, min_gap_proportion=0.9)

        E.info("after anchor trimming: consensus={}".format(consensus))

        take = [idx for idx, x in enumerate(consensus) if x != "-"]
        sequences = ["".join([s[x] for x in take]) for s in sequences]
        consensus = get_consensus(sequences, min_gap_proportion=0.9)

        E.info("after gap filtering: consensus={}".format(consensus))

        # get anchor consensus and chop it off
        consensus = get_consensus(sequences, ignore_gaps=True)
        upstream_anchor = get_anchor_consensus(upstream_anchors)
        downstream_anchor = get_anchor_consensus(downstream_anchors)

        upstream_anchor_start = consensus.find(upstream_anchor)
        downstream_anchor_start = consensus.rfind(downstream_anchor)

        E.info(
            "anchor consensus (no gaps)={}, upstream={}, downstream={}, upstream_idx={}, downstream_idx={}"
            .format(consensus, upstream_anchor, downstream_anchor,
                    upstream_anchor_start, downstream_anchor_start))

        if upstream_anchor_start < 0 or downstream_anchor_start < 0:
            E.warn("can't locate anchor, no output produced")
            continue

        upstream_anchor_end = upstream_anchor_start + len(upstream_anchor)
        if upstream_anchor_end >= downstream_anchor_start:
            E.warn("anchor not in correct order, no output produced")
            continue

        sequences = [
            x[upstream_anchor_end:downstream_anchor_start] for x in sequences
        ]
        consensus = get_consensus(sequences)

        E.info("after anchor trimming: consensus={}".format(consensus))

        truncated_fn = E.get_output_file(
            "aligned_truncated_{}.fasta".format(region_idx))
        with iotools.open_file(truncated_fn, "w") as outf:
            outf.write("\n".join("{}\n{}\n".format(x, y)
                                 for x, y in zip(identifiers, sequences)))

        # build a per-column pileup of the truncated alignment: count the
        # bases observed at each position and later call the majority base
        # as the consensus
        positions = list(zip(*sequences))
        bases = ["A", "C", "G", "T"]
        df = pandas.DataFrame([collections.Counter(x)
                               for x in positions]).fillna(0)
        for missing_base in [x for x in bases if x not in df.columns]:
            df[missing_base] = 0
        df["gapped_depth"] = df.sum(axis=1)
        df["depth"] = df[bases].sum(axis=1)
        df["consensus"] = df[bases].idxmax(axis=1)
        df["consensus_counts"] = df.lookup(df.index, df.consensus)
        df["consensus_support"] = df.consensus_counts / df.depth
        df["offconsensus_counts"] = df.depth - df.consensus_counts
        df.loc[df.consensus_counts == 0, "consensus"] = "N"
        df["region_id"] = region_idx

        # replace "gap" consensus positions with + character
        alignment = global_align(re.sub("-", "+", consensus), barcode_sequence)
        E.info("alignment: consensus {}".format(alignment[0]))
        E.info("alignment: barcode   {}".format(alignment[1]))

        barcode_idx = 0
        deleted_barcode_bases = []
        rows = []
        for c, b in zip(*alignment):
            if c == "-":
                deleted_barcode_bases.append(barcode_idx)
                barcode_idx += 1
            elif b == "N":
                rows.append((barcode_idx, "variable"))
                barcode_idx += 1
            elif b == "-":
                rows.append(("", "insertion"))
            elif b == c:
                rows.append((barcode_idx, "fixed-match"))
                barcode_idx += 1
            else:
                rows.append((barcode_idx, "fixed-mismatch"))
                barcode_idx += 1

        alignment_df = pandas.DataFrame.from_records(
            rows, columns=["barcode_pos", "barcode_class"])

        assert len(alignment_df) == len(df)
        df = pandas.concat([df, alignment_df], axis=1)
        with E.open_output_file("pileup") as outf:
            df.to_csv(outf, sep="\t", index=True, index_label="position")

        observed_barcode_sequence = "".join(
            df[df.barcode_class == "variable"].consensus)
        headers = df.consensus_support.describe().index
        eval_df = df.loc[df.barcode_class.isin(
            ("variable", "fixed-match", "fixed-mismatch")), ]
        median_consensus_depth = eval_df.consensus_counts.median()
        # zero stuff out if depth is low
        if median_consensus_depth <= 2:
            deleted_barcode_bases = []

        outf = options.stdout
        # modules to recover partial bar-codes
        outf.write("\t".join(
            map(str, [
                "barcode", "ndeleted_barcode_bases", "deleted_barcode_bases"
            ] + ["support_{}".format(x)
                 for x in headers] + ["counts_{}".format(x) for x in headers] +
                ["offcounts_{}".format(x) for x in headers])) + "\n")

        outf.write("\t".join(
            map(str, [
                observed_barcode_sequence,
                len(deleted_barcode_bases), ",".join(
                    map(str, deleted_barcode_bases))
            ] + eval_df.consensus_support.describe().tolist() +
                eval_df.consensus_counts.describe().tolist() +
                eval_df.offconsensus_counts.describe().tolist())) + "\n")

    E.stop()
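Example #11 relies on several helpers that are not part of this listing
(get_consensus, get_anchor_consensus, global_align). As an illustration of
the column-wise consensus step, here is a minimal sketch of get_consensus
under the assumption that min_gap_proportion is the gap fraction at which a
column is reported as a gap and that ignore_gaps drops gaps before taking
the majority base; the real implementation may differ.

import collections

def get_consensus(sequences, min_gap_proportion=0.5, ignore_gaps=False):
    """Hypothetical sketch of a column-wise majority consensus for
    equal-length aligned sequences."""
    consensus = []
    for column in zip(*sequences):
        counts = collections.Counter(column)
        gap_proportion = counts.get("-", 0) / float(len(column))
        # call the column a gap if it is dominated by gap characters
        if not ignore_gaps and gap_proportion >= min_gap_proportion:
            consensus.append("-")
            continue
        counts.pop("-", None)
        consensus.append(counts.most_common(1)[0][0] if counts else "-")
    return "".join(consensus)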