Example #1
0
def pslAddSequence(query_fasta, sbjct_fasta, options):

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    while 1:

        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        new = Blat.MatchPSLX()
        new.fromPSL(match,
                    query_fasta.getSequence(
                        match.mQueryId, "+", match.mQueryFrom, match.mQueryTo),
                    sbjct_fasta.getSequence(
                        match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo))

        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
Example #2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--mask-lowercase",
        dest="mask_lowercase",
        action="store_true",
        help=
        "mask lowercase characters before computing properties [default=%default]"
    )

    parser.add_option("--with-match",
                      dest="with_match",
                      action="store_true",
                      help="echo the match in output [default=%default]")

    parser.add_option(
        "--without-match",
        dest="with_match",
        action="store_false",
        help="do not echo the match in output [default=%default]")

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("counts", "baseml", "match", "query-counts", "sbjct-counts"),
        help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    (options, args) = E.Start(parser)

    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write(
        "\t".join([
            header,
        ] + ["\t".join(x.getHeaders()) for x in counters] +
                  ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        if options.with_match:
            options.stdout.write(str(match))
        else:
            options.stdout.write(match.mQueryId)

        if counters:

            qseq = match.mQuerySequence
            sseq = match.mSbjctSequence

            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qseq = [re.sub("[^a-zA-Z]", "N", x) for x in qseq]
            sseq = [re.sub("[^a-zA-Z]", "N", x) for x in sseq]

            if options.mask_lowercase:
                qseq = [re.sub("[a-z]", "N", x) for x in qseq]
                sseq = [re.sub("[a-z]", "N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n"
                        % str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            options.stdout.write(
                "\t" + "\t".join([str(counter) for counter in counters]))

        if counters_plain:

            for counter in counters_plain:
                counter(match)

            options.stdout.write(
                "\t" + "\t".join([str(counter) for counter in counters_plain]))

        options.stdout.write("\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()