Example #1
0
def main(argv=None):
    """Report how well the links on stdin cover one or more vertex sets.

    Every file given with ``--vertices`` is read as one vertex set: lines
    starting with ``#`` are skipped and the first tab-separated column of
    each remaining line is taken as a vertex name.  Links are then read from
    stdin with ``BlastAlignments.Link.Read``.  For every set one table row is
    printed (columns: set, name, tvertex, nmissed, pmissed, nquery, pquery,
    nsbjct, psbjct), followed by the number of tokens that belong to no set.

    Tokens outside all sets are written to ``--extra`` and vertices never
    seen as a query are written to ``--missed`` when those options are given.

    :param argv: command line arguments; defaults to ``sys.argv``.
        NOTE(review): the value is not forwarded to ``E.Start`` here --
        confirm that ``E.Start`` picks up ``sys.argv`` on its own.
    :raises ValueError: if no vertex file was supplied.
    """

    if argv is None:
        argv = sys.argv

    # Build the option parser locally; it was previously used without ever
    # being created inside this function.
    parser = E.OptionParser(usage=globals()["__doc__"])

    parser.add_option("-n",
                      "--vertices",
                      dest="vertices",
                      action="append",
                      help="filename with vertices.")
    parser.add_option("-e",
                      "--extra",
                      dest="filename_extra",
                      type="string",
                      help="filename to store extra vertices in.")
    parser.add_option("-m",
                      "--missed",
                      dest="filename_missed",
                      type="string",
                      help="filename to store missed vertices in.")

    parser.set_defaults(
        vertices=[],
        filename_extra=None,
        filename_missed=None,
    )

    (options, args) = E.Start(parser)

    # The old test compared len() with "" and therefore never triggered;
    # string exceptions are also not raisable, so use a real exception.
    if not options.vertices:
        raise ValueError("please specify one set of vertices.")

    # NOTE: print(...) with a single argument produces the same output as a
    # Python 2 print statement and as the Python 3 print function.

    # vertex name -> [index of its set, times seen as query, times as sbjct]
    # A name listed in several files ends up in the last set that lists it.
    vertices = {}
    nvertices = []
    for set_index, filename in enumerate(options.vertices):
        with open(filename, "r") as infile:
            # rstrip("\n") instead of [:-1] so that a final line without a
            # trailing newline does not lose its last character.
            names = [line.rstrip("\n").split("\t")[0]
                     for line in infile
                     if not line.startswith("#")]
        nvertices.append(len(names))
        for name in names:
            vertices[name] = [set_index, 0, 0]
        if options.loglevel >= 1:
            print("# read %i vertices from %s" % (len(names), filename))
            sys.stdout.flush()

    nsets = len(options.vertices)
    missed_queries = [[] for _ in range(nsets)]

    link = BlastAlignments.Link()
    extra_vertices = set()
    for line in sys.stdin:

        if line[0] == "#":
            continue

        link.Read(line)

        # slot 1 counts appearances as query, slot 2 appearances as sbjct
        for token, slot in ((link.mQueryToken, 1), (link.mSbjctToken, 2)):
            if token in vertices:
                vertices[token][slot] += 1
            else:
                extra_vertices.add(token)

    found_queries = [0] * nsets
    found_sbjcts = [0] * nsets

    for name, (set_index, nquery, nsbjct) in vertices.items():
        if nquery:
            found_queries[set_index] += 1
        else:
            missed_queries[set_index].append(name)

        if nsbjct:
            found_sbjcts[set_index] += 1

    def percent(count, total):
        # An empty vertex file would otherwise cause a ZeroDivisionError.
        return 100.0 * count / total if total else 0.0

    headers = ("set", "name", "tvertex", "nmissed", "pmissed", "nquery",
               "pquery", "nsbjct", "psbjct")

    print("\t".join(headers))

    for set_index, filename in enumerate(options.vertices):
        total = nvertices[set_index]
        nmissed = len(missed_queries[set_index])
        print("%i\t%s\t%i\t%i\t%5.2f\t%i\t%5.2f\t%i\t%5.2f" % (
            set_index, filename, total,
            nmissed, percent(nmissed, total),
            found_queries[set_index],
            percent(found_queries[set_index], total),
            found_sbjcts[set_index],
            percent(found_sbjcts[set_index], total)))

    print("//")
    print("%i vertices not in set" % len(extra_vertices))

    if options.filename_extra and extra_vertices:
        with open(options.filename_extra, "w") as outfile:
            for token in extra_vertices:
                outfile.write("%s\n" % token)

    if options.filename_missed:
        # "with" guarantees the file is flushed and closed; it used to be
        # left open.
        with open(options.filename_missed, "w") as outfile:
            for set_index, filename in enumerate(options.vertices):
                for name in missed_queries[set_index]:
                    outfile.write("%i\t%s\t%s\n" %
                                  (set_index, filename, name))

    E.Stop()
    if param_loglevel >= 1:
        print "# read %i cds" % (len(cds))
        sys.stdout.flush()

    ninput, npairs, nskipped = 0, 0, 0

    for line in sys.stdin:
        if line[0] == "#":
            continue
        if line[0] == ">":
            print line[:-1]
            continue

        ninput += 1
        link = BlastAlignments.Link()

        link.Read(line)

        if link.mQueryToken == link.mSbjctToken:
            continue

        keep = 1
        if link.mQueryToken in cds and link.mSbjctToken in cds:
            is_paralog, reason = IsParalogLink(link, cds[link.mQueryToken],
                                               cds[link.mSbjctToken])
            if is_paralog:
                keep = 0
                if param_loglevel >= 2:
                    print "# DISCARDED because %s: %s" % (reason, str(link))
        else:
Example #3
0
def main(argv=None):
    """Write the aligned sequence pair for every link read from stdin.

    Sequences are loaded from the file given with ``--sequences`` via
    ``Genomics.ReadPeptideSequences`` and converted with
    ``alignlib_lite.py_makeSequence``.  Each non-comment line on stdin is
    parsed as a ``BlastAlignments.Link``.  Links whose query or sbjct token
    has no sequence are skipped.  For the others the compressed alignment is
    expanded and, for ``--format=fasta``, printed as two FASTA-style records.
    A summary of the counters is logged through ``E.info`` at the end.

    :param argv: command line arguments; defaults to ``sys.argv``.
        NOTE(review): the value is not forwarded to ``E.Start`` here --
        confirm that ``E.Start`` picks up ``sys.argv`` on its own.
    :raises ValueError: if no sequence file was supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: blast2fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--sequences",
                      dest="filename_sequences",
                      type="string",
                      help="filename with sequences.")
    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output format.")

    parser.set_defaults(
        filename_sequences=None,
        format="fasta",
    )

    (options, args) = E.Start(parser)

    if not options.filename_sequences:
        # String exceptions cannot be raised; use a real exception type.
        raise ValueError("please supply filename with sequences.")

    # NOTE(review): assumes ReadPeptideSequences consumes the file before it
    # returns (its result is used like a dict below) -- confirm.
    with open(options.filename_sequences, "r") as infile:
        sequences = Genomics.ReadPeptideSequences(infile)

    # NOTE: print(...) with a single argument produces the same output as a
    # Python 2 print statement and as the Python 3 print function.
    if options.loglevel >= 1:
        print("# read %i sequences" % len(sequences))

    # Iterate over a snapshot of the keys while the values are replaced.
    for token in list(sequences):
        sequences[token] = alignlib_lite.py_makeSequence(sequences[token])

    if options.loglevel >= 2:
        print("# converted %i sequences" % len(sequences))

    ninput, noutput, nskipped, nfailed = 0, 0, 0, 0
    link = BlastAlignments.Link()

    # One alignment container is reused for all links and cleared each time.
    ali = alignlib_lite.py_makeAlignataVector()

    for line in sys.stdin:

        if line.startswith("#"):
            continue

        link.Read(line)
        ninput += 1

        if (link.mQueryToken not in sequences
                or link.mSbjctToken not in sequences):
            nskipped += 1
            continue

        ali.Clear()
        alignlib_lite.py_fillAlignataCompressed(ali, link.mQueryFrom,
                                                link.mQueryAli,
                                                link.mSbjctFrom,
                                                link.mSbjctAli)

        result = alignlib_lite.py_writePairAlignment(
            sequences[link.mQueryToken], sequences[link.mSbjctToken],
            ali).split("\n")

        if len(result) != 3:
            # Unexpected output: count the failure and move on rather than
            # indexing into a malformed result and also counting the link
            # as written.
            nfailed += 1
            continue

        if options.format == "fasta":
            # The second tab-separated field of each result row is printed
            # as the sequence line of the record.
            print(">%s %i-%i\n%s\n>%s %i-%i\n%s\n" %
                  (link.mQueryToken, link.mQueryFrom, link.mQueryTo,
                   result[0].split("\t")[1],
                   link.mSbjctToken, link.mSbjctFrom, link.mSbjctTo,
                   result[1].split("\t")[1]))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, nfailed=%i" %
           (ninput, noutput, nskipped, nfailed))
    E.Stop()