Example 1
def iterator_psl_intervals(options):
    """iterate over psl file yield an entry together with overlapping entries.

    returns tuples of (match, list(query_intervals), list(target_intervals))
    """

    if options.filename_filter_query:
        intervals_query = readIntervals(
            IOTools.openFile(options.filename_filter_query, "r"), options)
    else:
        intervals_query = None

    if options.filename_filter_target:
        intervals_target = readIntervals(
            IOTools.openFile(options.filename_filter_target, "r"), options)
    else:
        intervals_target = None

    iterator = Blat.BlatIterator(options.stdin)

    ninput = 0

    while 1:

        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 1 and ninput % options.report_step == 0:
            options.stdlog.write("# progress: ninput=%i\n" % (ninput))
            options.stdlog.flush()

        qx, tx = None, None
        if intervals_query:
            try:
                qx = list(
                    intervals_query.get(match.mQueryId, match.mQueryFrom,
                                        match.mQueryTo))
            except KeyError:
                qx = []

        if intervals_target:
            try:
                tx = list(
                    intervals_target.get(match.mSbjctId, match.mSbjctFrom,
                                         match.mSbjctTo))
            except KeyError:
                tx = []

        if options.loglevel >= 2:
            options.stdlog.write(
                "###################################################\n")
            options.stdlog.write("# testing match %s\n" % (str(match)))
            options.stdlog.write(
                "###################################################\n")

        yield match, qx, tx
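
A minimal sketch of how this generator might be driven; the loop below is not part of the original script and relies only on names visible above (the options namespace and the yielded (match, query_intervals, target_intervals) tuples):

# hypothetical driver: count matches that overlap at least one filtered query interval
nwith_query_overlap = 0
for match, query_intervals, target_intervals in iterator_psl_intervals(options):
    # query_intervals is None when no query filter file was supplied,
    # otherwise a (possibly empty) list of overlapping intervals
    if query_intervals:
        nwith_query_overlap += 1
options.stdlog.write("# matches overlapping the query filter: %i\n" % nwith_query_overlap)
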
Example 2
def pslAddSequence(query_fasta, sbjct_fasta, options):

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    while 1:

        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        new = Blat.MatchPSLX()
        new.fromPSL(match,
                    query_fasta.getSequence(
                        match.mQueryId, "+", match.mQueryFrom, match.mQueryTo),
                    sbjct_fasta.getSequence(
                        match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo))

        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
Example 3
def pslComplement(query_fasta, target_fasta, options):
    """complenment psl entries.
    """

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    border = options.complement_border
    min_length = options.complement_min_length

    while 1:

        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        if match.mNBlocks <= 1:
            nskipped += 1
            continue

        pairs = []
        for qstart, tstart, size in match.getBlocks():

            qend = qstart + size - border
            qstart += border

            if qend - qstart < options.complement_min_length:
                continue

            tend = tstart + size - border
            tstart += border

            if tend - tstart < options.complement_min_length:
                continue

            query_sequence = query_fasta.getSequence(match.mQueryId,
                                                     match.strand, qstart,
                                                     qend)
            sbjct_sequence = target_fasta.getSequence(match.mSbjctId, "+",
                                                      tstart, tend)

        ndiscarded += 1

        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
Example 4
def pslComplementQuery(options):
    """complement psl entries.

    Fill the regions from a second psl file.
    """

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    border = options.complement_border
    min_length = options.complement_min_length

    while 1:

        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        if match.mNBlocks <= 1:
            nskipped += 1
            continue

        pairs = []
        for qstart, tstart, size in match.getBlocks():

            qend = qstart + size - border
            qstart += border

            if qend - qstart < options.complement_min_length:
                continue

            tend = tstart + size - border
            tstart += border

            if tend - tstart < options.complement_min_length:
                continue

        ndiscarded += 1

        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
Example 5
def chunk_iterator_psl_overlap(infile, args, prefix, use_header=False):
    """iterate over overlapping entries in a psl file."""

    iterator = Blat.BlatIterator(infile)

    processed_contigs = set()

    merge_distance = args[0]
    last_sbjct_id = None
    sbjct_end = 0
    outfile = None
    filename = None
    nchunks = 0
    while 1:

        match = next(iterator)

        if match is None:
            break

        if match.mSbjctId != last_sbjct_id or \
           match.mSbjctFrom >= (sbjct_end + merge_distance):
            if last_sbjct_id:
                outfile.close()
                yield filename

            if last_sbjct_id != match.mSbjctId and \
               match.mSbjctId in processed_contigs:
                raise ValueError("input not sorted correctly (contig,start): "
                                 "already encountered %s\n%s" %
                                 (match.mSbjctId, str(match)))

            last_sbjct_id = match.mSbjctId
            processed_contigs.add(last_sbjct_id)

            # open the next chunk file; the naming scheme (prefix plus a
            # running chunk index) is assumed here, as the snippet never
            # shows where `outfile`/`filename` are created
            nchunks += 1
            filename = "%s%06i" % (prefix, nchunks)
            outfile = open(filename, "w")

            sbjct_start = match.mSbjctFrom
            sbjct_end = match.mSbjctTo

        if match.mSbjctFrom < sbjct_start:
            raise ValueError("input not sorted correctly (contig,start): "
                             "%i < %i\n%s" %
                             (match.mSbjctFrom, sbjct_start, str(match)))

        sbjct_end = max(match.mSbjctTo, sbjct_end)
        outfile.write(str(match) + "\n")

    if outfile:
        outfile.close()
        yield filename
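
The chunking rule above (start a new chunk whenever the contig changes or the next start lies more than merge_distance beyond the current chunk end) can be illustrated without the Blat machinery; a small self-contained sketch on plain (contig, start, end) tuples, assuming the input is sorted by (contig, start):

def chunk_intervals(intervals, merge_distance):
    # group sorted (contig, start, end) tuples into chunks of nearby intervals
    chunk, last_contig, chunk_end = [], None, 0
    for contig, start, end in intervals:
        if contig != last_contig or start >= chunk_end + merge_distance:
            if chunk:
                yield chunk
            chunk, last_contig, chunk_end = [], contig, end
        chunk.append((contig, start, end))
        chunk_end = max(chunk_end, end)
    if chunk:
        yield chunk

print(list(chunk_intervals(
    [("chr1", 0, 100), ("chr1", 150, 300), ("chr1", 5000, 5100), ("chr2", 0, 50)],
    merge_distance=1000)))
# -> [[('chr1', 0, 100), ('chr1', 150, 300)], [('chr1', 5000, 5100)], [('chr2', 0, 50)]]
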
Example 6
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle.py 2834 2009-11-24 16:11:23Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-b", "--output-filename-pattern", dest="output_filename", type="string",
                      help="filename for output [default=%default]")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig"),
                      help="output format [default=%default]")

    parser.set_defaults(genome_file=None,
                        typecode=numpy.int16,
                        output_filename=None,
                        output_format="wiggle",
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    typecode = options.typecode

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        counts = {}
        contig_sizes = fasta.getContigSizes(with_synonyms=False)
        E.info("allocating memory for %i contigs and %i bytes" %
               (len(contig_sizes), sum(contig_sizes.values()) * typecode().itemsize))
        for contig, size in contig_sizes.items():
            E.debug("allocating %s: %i bases" % (contig, size))
            counts[contig] = numpy.zeros(size, typecode)

        E.info("allocated memory for %i contigs" % len(fasta))

    else:
        fasta = None
        contig_sizes = {}

    if options.output_format in ("bigwig", "bigbed"):

        if not options.genome_file:
            raise ValueError(
                "please supply genome file for bigwig/bigbed computation.")

        if not options.output_filename:
            raise ValueError(
                "please output file for bigwig/bigbed computation.")

        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        elif options.output_format == "bigbed":
            executable_name = "bedToBigBed"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        executable = IOTools.which(executable_name)

        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        tmpdir = tempfile.mkdtemp()
        E.debug("temporary files are in %s" % tmpdir)

        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # write contig sizes
        outfile_size = open(tmpfile_sizes, "w")
        for contig, size in contig_sizes.items():
            outfile_size.write("%s\t%s\n" % (contig, size))
        outfile_size.close()

        outfile = open(tmpfile_wig, "w")

    else:
        outfile = options.stdout

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, ncontigs, nskipped = 0, 0, 0

    E.info("started counting")

    while 1:

        if options.test and ninput >= options.test:
            break

        match = next(iterator)

        if match is None:
            break

        ninput += 1

        contig = match.mSbjctId

        for start, length in zip(match.mSbjctBlockStarts, match.mBlockSizes):
            counts[contig][start:start + length] += 1

    E.info("finished counting")

    if options.output_format in ("wig", "bigwig"):
        E.info("starting wig output")

        for contig, vals in counts.items():

            E.debug("output for %s" % contig)
            for val, iter in itertools.groupby(enumerate(vals), lambda x: x[1]):
                l = list(iter)
                start, end = l[0][0], l[-1][0]
                val = vals[start]
                if val > 0:
                    outfile.write("variableStep chrom=%s span=%i\n" %
                                  (contig, end - start + 1))
                    outfile.write("%i\t%i\n" % (start, val))

            ncontigs += 1
    elif options.output_format in ("bedgraph", "bigbed"):

        E.info("starting bedgraph output")

        for contig, vals in counts.items():
            E.debug("output for %s" % contig)
            for val, iter in itertools.groupby(enumerate(vals), lambda x: x[1]):
                l = list(iter)
                start, end = l[0][0], l[-1][0]
                val = vals[start]
                if val > 0:
                    outfile.write("%s\t%i\t%i\t%i\n" %
                                  (contig, start, end + 1, val))

            ncontigs += 1

    E.info("finished output")

    if options.output_format in ("bigwig", "bigbed"):
        outfile.close()

        E.info("starting bigwig conversion")
        try:
            retcode = subprocess.call(" ".join((executable,
                                                tmpfile_wig,
                                                tmpfile_sizes,
                                                os.path.abspath(options.output_filename)), ),
                                      shell=True)
            if retcode < 0:
                warn("%s terminated with signal: %i" % (executable_name, -retcode))
                return -retcode
        except OSError as msg:
            warn("error while executing %s: %s" % (executable_name, msg))
            return 1

        shutil.rmtree(tmpdir)

        E.info("finished bigwig conversion")
Example 7
def pslMerge(options):
    """merge psl alignments.
    """

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    last_query = None
    last_target = None
    last_strand = None

    def process(matches):

        new = matches[0].copy()

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        graph = networkx.DiGraph()
        graph.add_nodes_from(range(len(matches) + 2))

        matches.sort(key=lambda x: x.mQueryFrom)

        if Genomics.IsPositiveStrand(matches[0].strand):
            f = lambda x, y: x.mSbjctTo < y.mSbjctFrom
        else:
            f = lambda x, y: x.mSbjctFrom > y.mSbjctTo

        for x in range(0, len(matches)):

            xx = matches[x]
            if options.loglevel >= 6:
                options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

            for y in range(x + 1, len(matches)):
                yy = matches[y]
                d = min(xx.mQueryTo, yy.mQueryTo) - \
                    max(xx.mQueryFrom, yy.mQueryFrom)
                if d > 0 or not f(xx, yy):
                    continue
                else:
                    graph.add_edge(x, y, weight=-d)

        source = len(matches)
        target = len(matches) + 1
        for x in range(len(matches)):
            xx = matches[x]
            graph.add_edge(source, x, weight=xx.mQueryFrom)
            graph.add_edge(
                x, target, weight=xx.mQueryLength - xx.mQueryTo)

        if options.loglevel >= 6:
            networkx.write_edgelist(graph, options.stdlog)

        path = networkx.dijkstra_path(graph, source, target)

        if options.loglevel >= 6:
            options.stdlog.write("# path: %s\n" % (str(path)))

        new_matches = [matches[x] for x in path[1:-1]]

        if len(matches) != len(new_matches):
            E.warn(("query=%s, target=%s, strand=%s: "
                    "removed overlapping/out-of-order segments: "
                    "before=%i, after=%i") %
                   (matches[0].mQueryId,
                    matches[0].mSbjctId,
                    matches[0].strand,
                    len(matches),
                    len(new_matches)))

        matches = new_matches

        for match in matches:
            m = match.getMapQuery2Target()
            alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

        new.fromMap(map_query2target, use_strand=True)

        options.stdout.write(str(new) + "\n")
        options.stdout.flush()
        return 1

    while 1:

        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 10:
            options.stdlog.write("# input: %s\n" % (str(match)))

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        if match.mQueryId != last_query or\
           match.strand != last_strand or\
           match.mSbjctId != last_target:
            if last_query:
                noutput += process(matches)
            matches = []
            last_query, last_target, last_strand = (
                match.mQueryId, match.mSbjctId, match.strand)

        matches.append(match)

    if last_query:
        noutput += process(matches)

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
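
The merge step in process() is essentially a shortest-path chaining problem: every local match is a node, edges connect query-colinear, non-overlapping matches with the intervening gap as the weight, and extra source/sink edges carry the unaligned query prefix and suffix, so Dijkstra picks the chain that leaves the least of the query uncovered. A toy sketch of the same construction on bare query intervals (keyword-style networkx edge attributes, query length 100 assumed):

import networkx

# toy matches as (query_from, query_to), already sorted by query_from
segments = [(0, 30), (25, 60), (35, 70), (75, 100)]
query_length = 100

graph = networkx.DiGraph()
source, sink = len(segments), len(segments) + 1

for x, (xfrom, xto) in enumerate(segments):
    graph.add_edge(source, x, weight=xfrom)             # unaligned query prefix
    graph.add_edge(x, sink, weight=query_length - xto)  # unaligned query suffix
    for y in range(x + 1, len(segments)):
        yfrom, yto = segments[y]
        gap = yfrom - xto
        if gap >= 0:                                    # only colinear, non-overlapping pairs
            graph.add_edge(x, y, weight=gap)

path = networkx.dijkstra_path(graph, source, sink)
print([segments[i] for i in path[1:-1]])
# -> [(0, 30), (35, 70), (75, 100)]
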
Example 8
def main():

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--as-gtf",
                      dest="as_gtf",
                      action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-s",
        "--filename-strand",
        dest="filename_strand",
        type="string",
        help="set strand information according to file [default=%DEFAULT].")

    parser.set_defaults(as_gtf=False, filename_strand=None, test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    ####################################
    if options.filename_strand:
        map_id2strand = IOTools.readMap(open(options.filename_strand, "r"))
    else:
        map_id2strand = {}

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    gff = GTF.Entry()

    gff.source = "psl"
    gff.feature = "exon"

    ids = {}

    while 1:

        if options.test and ninput >= options.test:
            break

        match = next(iterator)

        if match is None:
            break

        ninput += 1

        if match.mQueryId not in ids:
            ids[match.mQueryId] = 1
            id = match.mQueryId
        else:
            id = match.mQueryId + ":%i" % ids[match.mQueryId]
            ids[match.mQueryId] += 1

        if options.as_gtf:
            gff.contig = match.mSbjctId
            gff.gene_id = id
            gff.transcript_id = id
        else:
            gff.contig = match.mSbjctId
            gff.clearAttributes()
            gff.addAttribute("gene_id", id)

        if id in map_id2strand:
            gff.strand = map_id2strand[id]
        else:
            gff.strand = match.strand

        for qstart, sstart, size in match.getBlocks():

            gff.start = sstart
            gff.end = sstart + size
            options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
Example 9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2map.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("--queries-tsv-file", dest="input_filename_queries", type="string",
                      help="fasta filename with queries - required for polyA analysis [%default].")

    parser.add_option("--polyA", dest="polyA", action="store_true",
                      help="detect polyA tails [%default].")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on aggregate coverages [%default].")

    parser.add_option("--output-filename-empty", dest="output_filename_empty", type="string",
                      help="OUTPUT filename with queries for which all matches have been discarded [%default].")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("map", "psl"),
                      help="output format to choose [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped", action="store_true",
                      help="input is zipped.")

    parser.add_option("--threshold-min-pid", dest="threshold_min_pid", type="float",
                      help="minimum thresholds for pid [%default].")

    parser.add_option("--threshold-min-matches", dest="threshold_min_matches", type="int",
                      help="minimum threshold for number of matching residues [%default].")

    parser.add_option("--threshold-max-error-rate", dest="threshold_max_error_rate", type="float",
                      help="maximum threshold for error of aligned part [%default].")

    parser.add_option("--threshold-good-query-coverage", dest="threshold_good_query_coverage", type="float",
                      help="minimum query coverage for segments to be counted as good [%default].")

    parser.add_option("--threshold-min-query-coverage", dest="threshold_min_query_coverage", type="float",
                      help="minimum query coverage for segments to be accepted [%default].")

    parser.add_option("--threshold-max-query-gapchars", dest="threshold_max_query_gapchars", type="int",
                      help="maximum number of gap characters  in query[%default].")

    parser.add_option("--threshold-max-query-gaps", dest="threshold_max_query_gaps", type="int",
                      help="maximum number of gaps  in query[%default].")

    parser.add_option("--threshold-max-sbjct-gapchars", dest="threshold_max_sbjct_gapchars", type="int",
                      help="maximum number of gap characters  in sbjct[%default].")

    parser.add_option("--keep-unique-matches", dest="keep_unique_matches", action="store_true",
                      help="ignore filters for unique matches [%default].")

    parser.add_option("--keep-all-best", dest="keep_all_best", action="store_true",
                      help="when sorting matches, keep all matches within the collection threshold [%default].")

    parser.add_option("--output-best-per-subject", dest="best_per_sbjct", action="store_true",
                      help="keep only the best entry per sbjct (for transcript mapping) [%default].")

    parser.add_option("--threshold-max-sbjct-gaps", dest="threshold_max_sbjct_gaps", type="int",
                      help="maximum number of gaps  in sbjct[%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing[%default].")

    parser.add_option("-m", "--matching-mode", dest="matching_mode", type="choice",
                      choices=("best-coverage", "best-query-coverage", "best-sbjct-coverage",
                               "best-pid", "best-covpid", "best-query-covpid", "best-sbjct-covpid",
                               "best-min-covpid", "best-query-min-covpid", "best-sbjct-min-covpid",
                               "unique", "all"),
                      help="determines how to selecte the best match [%default].")

    parser.add_option("--subjctfilter-tsv-file", dest="filename_filter_sbjct", type="string",
                      help="gff file for filtering sbjct matches. Matches overlapping these regions are discarded, but see --keep-forbidden [%default].")

    parser.add_option("--keep-forbidden", dest="keep_forbidden", action="store_true",
                      help="if set, keep only matches that overlap the regions supplied with --subjctfilter-tsv-file [%default].")

    parser.add_option("--query-forward-coordinates", dest="query_forward_coordinates", action="store_true",
                      help="use forward coordinates for query, strand will refer to sbjct [%default].")

    parser.add_option("--ignore-all-random", dest="ignore_all_random", action="store_true",
                      help="if there are multiple best matches, ignore all those to chrUn and _random [%default].")

    parser.add_option("--collection-threshold", dest="collection_threshold", type="float",
                      help="threshold for collecting matches, percent of best score [%default].")

    parser.add_option("--collection-distance", dest="collection_distance", type="float",
                      help="threshold for collecting matches, difference to best score [%default].")

    parser.set_defaults(input_filename_domains=None,
                        input_filename_queries=None,
                        threshold_good_query_coverage=90.0,
                        threshold_min_pid=30.0,
                        threshold_min_matches=0,
                        threshold_max_error_rate=None,
                        output_filename_pattern="%s",
                        keep_unique_matches=False,
                        output_format="map",
                        print_matched=["full", "partial", "good"],
                        from_zipped=False,
                        combine_overlaps=True,
                        min_length_domain=30,
                        threshold_min_query_coverage=50,
                        min_length_singletons=30,
                        new_family_id=10000000,
                        add_singletons=False,
                        matching_mode="best-coverage",
                        best_per_sbjct=False,
                        threshold_max_query_gapchars=None,
                        threshold_max_query_gaps=None,
                        threshold_max_sbjct_gapchars=None,
                        threshold_max_sbjct_gaps=None,
                        filename_filter_sbjct=None,
                        keep_forbidden=False,
                        keep_all_best=False,
                        test=None,
                        query_forward_coordinates=False,
                        output_filename_empty=None,
                        collection_threshold=1.0,
                        collection_distance=0,
                        polyA=False,
                        # max residues missing from non polyA end
                        polyA_max_unaligned=3,
                        # min residues in tail
                        polyA_min_unaligned=10,
                        # min percent residues that are A/T in tail
                        polyA_min_percent=70.0,
                        # ignore duplicate matches if they are on Un or
                        # _random
                        ignore_all_random=False,
                        )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) == 1:
        if options.from_zipped or args[0][-3:] == ".gz":
            import gzip
            infile = gzip.open(args[0], "r")
        else:
            infile = IOTools.openFile(args[0], "r")
    else:
        infile = sys.stdin

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    if options.filename_filter_sbjct:

        try:
            import bx.intervals.intersection
        except ImportError:
            raise ValueError("filtering for intervals requires the bx tools")

        intervals = GTF.readGFFFromFileAsIntervals(
            IOTools.openFile(options.filename_filter_sbjct, "r"))

        intersectors = {}

        for contig, values in list(intervals.items()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in values:
                intersector.add_interval(bx.intervals.Interval(start, end))
            intersectors[contig] = intersector

        if options.loglevel >= 1:
            options.stdlog.write("# read %i intervals for %i contigs.\n" %
                                 (sum([len(x) for x in list(intervals.values())]),
                                  len(intersectors)))
    else:
        intersectors = None

    ################################################
    ################################################
    ################################################
    # processing of a chunk (matches of same query)
    ################################################
    ninput, noutput, nskipped = 0, 0, 0

    # number of sequences with full/partial/good matches
    nfull_matches, npartial_matches, ngood_matches = 0, 0, 0
    # number of sequences which are fully/good/partially matched
    # i.e., after combining all aligned regions
    nfully_matched, npartially_matched, nwell_matched = 0, 0, 0

    nremoved_pid, nremoved_query_coverage, nempty = 0, 0, 0
    nremoved_gaps, nremoved_nmatches = 0, 0
    nremoved_regions = 0
    nqueries_removed_region = 0

    aggregate_coverages = []
    mapped_coverages = []
    fully_matched = []
    well_matched = []
    partially_matched = []
    new_family_id = options.new_family_id

    if options.output_filename_empty:
        outfile_empty = IOTools.openFile(options.output_filename_empty, "w")
        outfile_empty.write("read_id\tcomment\n")
    else:
        outfile_empty = None

    if options.polyA:
        options.outfile_polyA = IOTools.openFile(
            options.output_filename_pattern % "polyA", "w")
        options.outfile_polyA.write("query_id\tstart\tend\tpA+N\tpT+N\ttail\n")

    def processChunk(query_id, matches):
        """process a set of matches from query_id"""

        nonlocal ninput, noutput, nskipped
        nonlocal nfull_matches, npartial_matches, ngood_matches
        nonlocal nremoved_pid, nremoved_query_coverage, nempty, nremoved_gaps, nremoved_nmatches
        nonlocal nremoved_regions, nqueries_removed_region
        nonlocal outfile_empty
        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []

        x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0
        nmatches = len(matches)

        new_matches = []

        # absolute filters applicable to non-fragmentory matches

        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue

            if match.mNMatches < options.threshold_min_matches:
                nremoved_nmatches += 1
                continue

            if options.threshold_max_error_rate:
                r = 100.0 * \
                    math.pow(
                        options.threshold_max_error_rate, match.mNMatches + match.mNMismatches)
                if match.mPid < r:
                    nremoved_pid += 1
                    x_nremoved_pid += 1
                    continue

            new_matches.append(match)

        matches = new_matches

        # filter matches
        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write("%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        if options.keep_unique_matches and len(matches) == 1:
            pass
        else:
            new_matches = []

            for match in matches:

                if match.mQueryCoverage < options.threshold_min_query_coverage:
                    nremoved_query_coverage += 1
                    x_nquery_coverage += 1
                    continue

                if options.threshold_max_query_gaps and match.mQueryNGapsCounts > options.threshold_max_query_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_query_gapchars and match.mQueryNGapsBases > options.threshold_max_query_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gaps and match.mSbjctNGapsCounts > options.threshold_max_sbjct_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gapchars and match.mSbjctNGapsBases > options.threshold_max_sbjct_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                new_matches.append(match)
            matches = new_matches

        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write("%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        # Remove queries matching to a forbidden region. This section
        # will remove the full query if any of its matches matches in a
        # forbidden region.
        keep = True
        for match in matches:
            if intersectors and match.mSbjctId in intersectors:
                found = intersectors[match.mSbjctId].find(
                    match.mSbjctFrom, match.mSbjctTo)
                if (found and not options.keep_forbidden) or \
                   (not found and options.keep_forbidden):
                    nremoved_regions += 1
                    keep = False
                    continue

        if not keep:
            nqueries_removed_region += 1
            if outfile_empty:
                outfile_empty.write(
                    "%s\toverlap with forbidden region\n" % query_id)
            return

        # check for full length matches
        for match in matches:
            if match.mQueryCoverage >= 99.9:
                full_matches.append(match)
            if match.mQueryCoverage > options.threshold_good_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1

        # compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append((match.mQueryFrom, match.mQueryTo))

        rest = Intervals.complement(intervals, 0, match.mQueryLength)

        query_coverage = 100.0 * \
            (match.mQueryLength -
             sum([x[1] - x[0] for x in rest])) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append(query_id)
        elif query_coverage > options.threshold_good_query_coverage:
            well_matched.append(query_id)
        else:
            partially_matched.append(query_id)

        aggregate_coverages.append(query_coverage)

        # select matches to output
        matches, msg = selectMatches(query_id, matches, options, queries_fasta)

        if len(matches) > 0:
            for match in matches:
                if options.query_forward_coordinates:
                    match.convertCoordinates()

                if options.output_format == "map":
                    options.stdout.write("%s\n" %
                                         "\t".join(map(str, (
                                             match.mQueryId, match.mSbjctId,
                                             match.strand,
                                             "%5.2f" % match.mQueryCoverage,
                                             "%5.2f" % match.mSbjctCoverage,
                                             "%5.2f" % match.mPid,
                                             match.mQueryLength,
                                             match.mSbjctLength,
                                             match.mQueryFrom, match.mQueryTo,
                                             match.mSbjctFrom, match.mSbjctTo,
                                             ",".join(
                                                 map(str, match.mBlockSizes)),
                                             ",".join(
                                                 map(str, match.mQueryBlockStarts)),
                                             ",".join(
                                                 map(str, match.mSbjctBlockStarts)),
                                         ))))
                elif options.output_format == "psl":
                    options.stdout.write(str(match) + "\n")

            noutput += 1
        else:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tno matches selected: %s\n" % (query_id, msg))
            nempty += 1

    if options.output_format == "map":
        options.stdout.write("\t".join(("query_id", "sbjct_id", "sstrand", "qcoverage", "scoverage",
                                        "pid", "qlen", "slen", "qfrom", "qto", "sfrom", "sto", "blocks", "qstarts", "sstarts")) + "\n")
    elif options.output_format == "psl":
        options.stdout.write(Blat.Match().getHeader() + "\n")

    ################################################
    ################################################
    ################################################
    # main loop
    ################################################
    nfully_covered = None
    matches = []
    last_query_id = None
    is_complete = True
    ninput_lines = 0

    skip = 0

    iterator = Blat.BlatIterator(infile)

    while 1:

        try:
            match = next(iterator)
        except Blat.ParsingError:
            iterator = Blat.BlatIterator(infile)
            continue

        if match is None:
            break

        ninput_lines += 1

        if options.test and ninput_lines > options.test:
            break

        if match.mQueryId != last_query_id:
            if last_query_id:
                processChunk(last_query_id, matches)
            matches = []
            last_query_id = match.mQueryId

        matches.append(match)

    if last_query_id:
        processChunk(last_query_id, matches)

    printHistogram(aggregate_coverages, "aggregate", options)

    printHistogram(mapped_coverages, "mapped", options)

    if "full" in options.print_matched:
        printMatched(fully_matched, "full", options)

    if "good" in options.print_matched:
        printMatched(well_matched, "good", options)

    if "partial" in options.print_matched:
        printMatched(partially_matched, "partial", options)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# alignments: ninput=%i, is_complete=%s\n" % (ninput_lines, str(is_complete)))
        options.stdlog.write(
            "# queries: ninput=%i, noutput=%i\n" % (ninput, noutput))
        options.stdlog.write("# individual coverage: full=%i, good=%i, partial=%i\n" % (
            nfull_matches, ngood_matches, npartial_matches))
        options.stdlog.write("# aggregate  coverage: full=%i, good=%i, partial=%i\n" % (
            len(fully_matched), len(well_matched), len(partially_matched)))
        options.stdlog.write("# omitted queries: total=%i, thresholds=%i, regions=%i, selection=%i\n" %
                             (nskipped + nqueries_removed_region + nempty,
                              nskipped, nqueries_removed_region, nempty))
        options.stdlog.write("# omitted matches: pid=%i, query_coverage=%i, gaps=%i, regions=%i, nmatches=%i\n" % (
            nremoved_pid, nremoved_query_coverage, nremoved_gaps, nremoved_regions, nremoved_nmatches))

    E.Stop()
Example 10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle_stats.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--wiggle-files", dest="wiggle_files", type="string",
                      help="glob expression for wiggle files [%default].")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix to add to contig names before lookup [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped", action="store_true",
                      help="input is zipped.")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing [%default].")

    parser.add_option("--with-values", dest="with_values", action="store_true",
                      help="output values in last column [%default].")

    parser.set_defaults(wiggle_files="*.data.bz2",
                        from_zipped=False,
                        prefix="",
                        with_values=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    # open indexed access to wiggles
    wiggle_files = glob.glob(options.wiggle_files)
    if not wiggle_files:
        raise IOError("could not find wiggle files with '%s'" %
                      options.wiggle_files)

    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                            keep_open=True,
                                            use_cache=False)

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    options.stdout.write(
        "query\tnali\t%s" % ("\t".join(Stats.DistributionalParameters().getHeaders())))
    if options.with_values:
        options.stdout.write("\tvalues")
    options.stdout.write("\n")

    while 1:

        if options.test and ninput >= options.test:
            break

        match = next(iterator)

        if match is None:
            break

        ninput += 1

        if options.loglevel >= 2:
            options.stdlog.write(str(match) + "\n")

        # psl always matches on the forward strand

        map_genome2query = alignlib_lite.py_makeAlignmentBlocks()
        f = alignlib_lite.py_AlignmentFormatBlat("%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % (
            match.mSbjctFrom,
            match.mSbjctTo,
            match.mQueryFrom,
            match.mQueryTo,
            match.mSbjctBlockStarts,
            match.mQueryBlockStarts,
            match.mBlockSizes))
        f.copy(map_genome2query)

        data = index.get(options.prefix + match.mSbjctId,
                         match.mSbjctFrom,
                         match.mSbjctTo)

        values = []
        for x, vv in data:
            for v in vv:
                if map_genome2query.mapRowToCol(x) >= 0:
                    values.append(v)
                x += 1
        if len(values) == 0:
            nskipped += 1
            continue

        noutput += 1

        if options.loglevel >= 2:
            options.stdlog.write(
                "# %s\n" % ",".join(["%5.3f" % v for v in values]))

        s = Stats.DistributionalParameters(values)
        options.stdout.write("%s\t%i\t%s" % (match.mQueryId,
                                             match.mNMismatches +
                                             match.mNMatches,
                                             str(s)))

        if options.with_values:
            options.stdout.write(
                "\t%s" % ",".join(["%5.3f" % v for v in values]))

        options.stdout.write("\n")

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()