def _read_vertex_names(filename):
    """Return the first tab-separated field of each non-comment line.

    Lines starting with ``#`` are skipped.  Only the line terminator is
    removed, so a final line without a trailing newline keeps its last
    character (slicing with ``[:-1]`` would have truncated it).
    """
    with open(filename, "r") as infile:
        return [line.rstrip("\n").split("\t")[0]
                for line in infile
                if not line.startswith("#")]


def _percent(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 if ``total`` is 0."""
    if not total:
        return 0.0
    return 100 * float(count) / total


def main(argv=None):
    """Report how often the vertices of each given set occur in links.

    Vertex names are read from every file given with ``-n/--vertices``
    (first tab-separated column, ``#`` lines skipped).  Links are then read
    from stdin with ``BlastAlignments.Link.Read`` and, per vertex, the
    number of occurrences as query token and as sbjct token is counted.
    A tab-separated summary table (one row per vertex file) is printed to
    stdout.  Tokens seen on stdin that belong to no set are written to
    ``--extra`` and vertices never seen as query are written to
    ``--missed``, if those options are given.

    :param argv: defaults to ``sys.argv``.
        NOTE(review): the value is not forwarded to ``E.Start`` here --
        confirm that ``E.Start`` picks up the command line by itself.
    :raises ValueError: if no vertex file was given.
    """
    if argv is None:
        argv = sys.argv

    # NOTE(review): `parser` is not created anywhere in this function; it
    # is presumably a module-level object or a construction line was lost
    # -- confirm, otherwise this raises NameError.
    parser.add_option("-n", "--vertices", dest="vertices", action="append",
                      help="filename with vertices.")
    parser.add_option("-e", "--extra", dest="filename_extra", type="string",
                      help="filename to store extra vertices in.")
    parser.add_option("-m", "--missed", dest="filename_missed", type="string",
                      help="filename to store missed vertices in.")

    parser.set_defaults(
        vertices=[],
        filename_extra=None,
        filename_missed=None,
    )

    (options, args) = E.Start(parser)

    # The old check compared len() with "" and raised a plain string, so it
    # could never trigger (and would have been a TypeError if it had).
    if not options.vertices:
        raise ValueError("please specify one set of vertices.")

    # vertex name -> [index of its set, times seen as query, times as sbjct]
    vertices = {}
    missed_queries = []
    nvertices = []

    for set_index, filename in enumerate(options.vertices):
        names = _read_vertex_names(filename)
        # Counts lines, i.e. duplicated names within a file count twice.
        nvertices.append(len(names))
        missed_queries.append([])
        for name in names:
            # A name present in several files is attributed to the last one.
            vertices[name] = [set_index, 0, 0]

        if options.loglevel >= 1:
            print("# read %i vertices from %s" % (len(names), filename))
            sys.stdout.flush()

    link = BlastAlignments.Link()
    extra_vertices = {}

    for line in sys.stdin:
        if line[0] == "#":
            continue

        link.Read(line)

        # Slot 1 counts query occurrences, slot 2 sbjct occurrences.
        for token, slot in ((link.mQueryToken, 1), (link.mSbjctToken, 2)):
            if token in vertices:
                vertices[token][slot] += 1
            else:
                extra_vertices[token] = 1

    nsets = len(options.vertices)
    found_queries = [0] * nsets
    found_sbjcts = [0] * nsets

    for name, (set_index, nquery, nsbjct) in vertices.items():
        if nquery:
            found_queries[set_index] += 1
        else:
            missed_queries[set_index].append(name)

        if nsbjct:
            found_sbjcts[set_index] += 1

    headers = ("set", "name", "tvertex", "nmissed", "pmissed",
               "nquery", "pquery", "nsbjct", "psbjct")
    print("\t".join(headers))

    for set_index, filename in enumerate(options.vertices):
        total = nvertices[set_index]
        nmissed = len(missed_queries[set_index])
        # _percent guards against empty vertex files (total == 0).
        print("%i\t%s\t%i\t%i\t%5.2f\t%i\t%5.2f\t%i\t%5.2f" % (
            set_index, filename, total,
            nmissed, _percent(nmissed, total),
            found_queries[set_index],
            _percent(found_queries[set_index], total),
            found_sbjcts[set_index],
            _percent(found_sbjcts[set_index], total)))

    print("//")
    print("%i vertices not in set" % len(extra_vertices))

    if options.filename_extra and len(extra_vertices) > 0:
        with open(options.filename_extra, "w") as outfile:
            for token in extra_vertices:
                outfile.write("%s\n" % token)

    if options.filename_missed:
        # Context manager so the file is flushed and closed before E.Stop().
        with open(options.filename_missed, "w") as outfile:
            for set_index, filename in enumerate(options.vertices):
                for name in missed_queries[set_index]:
                    outfile.write("%i\t%s\t%s\n" % (set_index, filename, name))

    E.Stop()
if param_loglevel >= 1: print "# read %i cds" % (len(cds)) sys.stdout.flush() ninput, npairs, nskipped = 0, 0, 0 for line in sys.stdin: if line[0] == "#": continue if line[0] == ">": print line[:-1] continue ninput += 1 link = BlastAlignments.Link() link.Read(line) if link.mQueryToken == link.mSbjctToken: continue keep = 1 if link.mQueryToken in cds and link.mSbjctToken in cds: is_paralog, reason = IsParalogLink(link, cds[link.mQueryToken], cds[link.mSbjctToken]) if is_paralog: keep = 0 if param_loglevel >= 2: print "# DISCARDED because %s: %s" % (reason, str(link)) else:
def main(argv=None):
    """Write the pairwise alignments of links read from stdin.

    Sequences are loaded from the file given with ``-s/--sequences`` via
    ``Genomics.ReadPeptideSequences`` and converted with
    ``alignlib_lite.py_makeSequence``.  Each non-comment line on stdin is
    parsed as a ``BlastAlignments.Link``; links whose query or sbjct token
    has no sequence are skipped.  For the remaining links the alignment is
    rebuilt and, for ``--format=fasta``, printed as two fasta records.
    Counters are reported through ``E.info`` at the end.

    :param argv: defaults to ``sys.argv``.
        NOTE(review): the value is not forwarded to ``E.Start`` here --
        confirm that ``E.Start`` picks up the command line by itself.
    :raises ValueError: if no sequence file was given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: blast2fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences",
                      type="string",
                      help="filename with sequences.")
    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output format.")

    parser.set_defaults(
        filename_sequences=None,
        format="fasta",
    )

    (options, args) = E.Start(parser)

    # String exceptions are not valid; raise a real exception type instead.
    if not options.filename_sequences:
        raise ValueError("please supply filename with sequences.")

    # Assumes ReadPeptideSequences consumes the file completely before
    # returning (its result is used with len()/keys() right away).
    with open(options.filename_sequences, "r") as infile:
        sequences = Genomics.ReadPeptideSequences(infile)

    if options.loglevel >= 1:
        print("# read %i sequences" % len(sequences))

    # Snapshot the keys so values can be replaced while looping.
    for key in list(sequences.keys()):
        sequences[key] = alignlib_lite.py_makeSequence(sequences[key])

    if options.loglevel >= 2:
        print("# converted %i sequences" % len(sequences))

    ninput, noutput, nskipped, nfailed = 0, 0, 0, 0
    link = BlastAlignments.Link()

    ali = alignlib_lite.py_makeAlignataVector()

    for line in sys.stdin:

        if line[0] == "#":
            continue

        link.Read(line)
        ninput += 1

        if (link.mQueryToken not in sequences or
                link.mSbjctToken not in sequences):
            nskipped += 1
            continue

        ali.Clear()
        alignlib_lite.py_fillAlignataCompressed(
            ali, link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli)

        result = alignlib_lite.py_writePairAlignment(
            sequences[link.mQueryToken],
            sequences[link.mSbjctToken],
            ali).split("\n")

        if len(result) != 3:
            # Unexpected writer output: count it as failed and move on
            # instead of indexing into it and also counting it as output.
            nfailed += 1
            continue

        if options.format == "fasta":
            # The trailing "\n" plus print's own newline leaves a blank
            # line after each pair, as before.
            print(">%s %i-%i\n%s\n>%s %i-%i\n%s\n" % (
                link.mQueryToken, link.mQueryFrom, link.mQueryTo,
                result[0].split("\t")[1],
                link.mSbjctToken, link.mSbjctFrom, link.mSbjctTo,
                result[1].split("\t")[1]))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, nfailed=%i" %
           (ninput, noutput, nskipped, nfailed))

    E.Stop()