def main():
    '''command line entry point: inspect neighbours, malis and profiles.

    The action is selected with ``--method``:

    view
        print the neighbours of every nid given on the command line.
    pileup
        write the multiple alignment built from the neighbours of the
        first argument.
    profile
        write the profile built from that multiple alignment. If the
        argument contains ``_`` it is parsed with ``AddaIO.toTuple`` and
        the profile is restricted to the resulting segment.
    align
        align the profiles of the two domains given as arguments
        (``--mode``, ``--gop``, ``--gep`` configure the aligner) and write
        summary statistics followed by the explicit alignment.

    Raises ValueError if fewer positional arguments are given than the
    selected method needs.
    '''
    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option(
        "--method", dest="method", type="choice",
        choices=("view", "align", "pileup", "profile"),
        help="method to perform [default=%default].")
    parser.add_option(
        "--mode", dest="mode", type="choice",
        choices=("global", "local"),
        help="alignment mode [default=%default].")
    parser.add_option(
        "--gop", dest="gop", type="float",
        help="gap opening penalty [default=%default].")
    parser.add_option(
        "--gep", dest="gep", type="float",
        help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    # fail early with a readable message instead of an IndexError on args[n]
    required = {"pileup": 1, "profile": 1, "align": 2}.get(options.method, 0)
    if len(args) < required:
        raise ValueError(
            "method `%s` requires %i argument(s), got %i"
            % (options.method, required, len(args)))

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(
        alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parseTarget(arg):
        '''return (nid, start, end); start and end are None for a bare nid.'''
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    def _buildProfile(nid, start, end):
        '''build and prepare the profile of nid, optionally cut to a segment.'''
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        return prof

    if options.method == "view":
        # convert the loop variable - previously args[0] was converted on
        # every iteration, so only the first nid was ever shown
        for arg in args:
            nid = int(arg)
            for n in index.getNeighbours(nid):
                print(str(n))

    elif options.method == "pileup":
        # the segment coordinates are parsed but not applied to the mali
        nid, start, end = _parseTarget(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        nid, start, end = _parseTarget(args[0])
        prof = _buildProfile(nid, start, end)
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        prof1 = _buildProfile(nid1, start1, end1)
        seq1 = alignlib.makeSequence(fasta.getSequence(nid1))
        prof2 = _buildProfile(nid2, start2, end2)
        seq2 = alignlib.makeSequence(fasta.getSequence(nid2))

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n"
            % (
                nid1,
                nid2,
                result.getScore(),
                result.getLength(),
                result.getNumGaps(),
                result.getRowFrom(),
                result.getRowTo(),
                result.getColFrom(),
                result.getColTo(),
            )
        )

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
def annotateAlignmentGraph(infile, outfiles):
    '''input the alignment graph and output a translated version of it and
    adding reference domain information.

    infile
        tested links, read with ``AddaIO.iterate_tested_links``; opened
        with gzip if the name ends in ``.gz``.
    outfiles
        pair ``(outfile, outfile_stats)``. ``outfile`` is written
        gzip-compressed and receives every link with its nids replaced by
        pids plus the overlapping benchmark families and domains.
        ``outfile_stats`` receives cumulative tp/fp/fn/tn/unknown counts,
        one row each time the rounded link weight changes and one final row.

    Rates whose denominator is zero are reported as 0.0 instead of raising
    ZeroDivisionError. If the input contains no links, no statistics row
    is written.
    '''
    outfile, outfile_stats = outfiles

    # collect benchmark domains
    E.info("reading benchmark domains")
    benchmark_file = gzip.open(PARAMS["eval_filename_benchmark_domains"])
    try:
        benchmark_domains = AddaIO.readMapNid2Domains(benchmark_file)
    finally:
        benchmark_file.close()

    totuple = AddaIO.toTuple
    toDomain = AddaIO.toDomain

    # build map of id to nid
    E.info("reading map between pid and nid")
    nids_file = open(PARAMS["eval_filename_adda_nids"], "r")
    try:
        map_nid2pid = AddaIO.readMapPid2Nid(nids_file)
    finally:
        nids_file.close()

    def getOverlappingDomains(pid, start, end):
        '''get domains overlapping pid:start..end'''
        if pid not in benchmark_domains:
            return ()
        # greedy overlap testing
        r = []
        for family, domains in benchmark_domains[pid].iteritems():
            for other_start, other_end in domains:
                if start >= other_end or end <= other_start:
                    continue
                r.append((family, other_start, other_end))
        return r

    def _rate(numerator, denominator):
        '''return numerator / denominator or 0.0 for an empty denominator.'''
        if denominator == 0:
            return 0.0
        return float(numerator) / denominator

    def _statsRow(weight, n, tp, fp, fn, tn, uk):
        '''format one line of the statistics table.'''
        fields = (weight, n, tp, fp, fn, tn, uk,
                  _rate(tp, tp + fp), _rate(fn, fn + tn))
        return "\t".join(map(str, fields)) + "\n"

    counts = E.Counter()

    inf = outf = outf_stats = None
    try:
        if infile.endswith(".gz"):
            inf = gzip.open(infile)
        else:
            inf = open(infile)

        outf = gzip.open(outfile, "w")
        outf.write("%s\n" % "\t".join(
            ("passed", "qdomain", "sdomain", "weight",
             "qstart", "qend", "qali",
             "sstart", "send", "sali",
             "score", "naligned", "ngaps", "zscore",
             "rfamilies", "sfamilies", "rdomains", "sdomains")))

        # counts for true positives, false positives and unknown
        n, tp, fp, fn, tn, uk = 0, 0, 0, 0, 0, 0

        outf_stats = open(outfile_stats, "w")
        # NOTE(review): the header names a `proportion` column, but rows
        # carry no value for it, so rows are one field shorter - confirm
        # the intended layout before changing either side.
        outf_stats.write("weight\tn\tproportion\ttp\tfp\tfn\ttn\tuk\ttpr\tfnr\n")

        last_weight = None

        for link in AddaIO.iterate_tested_links(inf):
            qnid, qstart, qend = totuple(link.qdomain)
            snid, sstart, send = totuple(link.sdomain)
            qpid = map_nid2pid[qnid]
            spid = map_nid2pid[snid]
            qfamily = sorted(getOverlappingDomains(qpid, qstart, qend))
            sfamily = sorted(getOverlappingDomains(spid, sstart, send))
            qf = set([x[0] for x in qfamily])
            sf = set([x[0] for x in sfamily])

            passed = link.passed == "+"
            n += 1

            if not qfamily and not sfamily:
                uk += 1
            elif qf.intersection(sf):
                # both domains share a benchmark family
                if passed:
                    tp += 1
                else:
                    fn += 1
            else:
                if passed:
                    fp += 1
                else:
                    tn += 1

            weight = round(float(link.weight))
            if weight != last_weight:
                # NOTE(review): the current link has already been counted,
                # so the row for the previous weight includes it.
                if last_weight is not None:
                    outf_stats.write(
                        _statsRow(last_weight, n, tp, fp, fn, tn, uk))
                last_weight = weight

            if passed:
                counts.passed += 1
            else:
                counts.failed += 1

            link = link._replace(qdomain=toDomain((qpid, qstart, qend)),
                                 sdomain=toDomain((spid, sstart, send)))

            outf.write("%s\t%s\t%s\t%s\t%s\n" % (
                "\t".join(map(str, link)),
                ",".join(sorted(qf)),
                ",".join(sorted(sf)),
                ",".join("%s_%i_%i" % x for x in qfamily),
                ",".join("%s_%i_%i" % x for x in sfamily)))

        # final row; nothing to report if there was no link at all
        if last_weight is not None:
            outf_stats.write(_statsRow(last_weight, n, tp, fp, fn, tn, uk))
    finally:
        # the gzip stream in particular must be closed to be complete
        for handle in (inf, outf, outf_stats):
            if handle is not None:
                handle.close()

    E.info("%s" % str(counts))
def main(argv=sys.argv):
    '''command line entry point: operate on an alignment graph read from stdin.

    The action is selected with ``--method``:

    translate
        replace the nids in the second and third column of each
        tab-separated line with the ids from ``--filename-map``. Comment
        lines and the header line (starting with ``passed``) are copied.
    add-family
        append the families of the query and sbjct domain (``na`` if
        unknown) as read from ``--filename-families``.
    shortest-path
        write the edges on a shortest path between ``--node1`` and
        ``--node2``.
    components
        write a table assigning each node to a connected component.

    Raises ValueError if ``translate`` is requested without
    ``--filename-map`` or ``add-family`` without ``--filename-families``.
    '''
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    # dest must match the name used in set_defaults() and read below,
    # otherwise the option is parsed but never has any effect
    parser.add_option("-o", "--format", dest="graph_format", type="choice",
                      choices=("alignments",),
                      help="graph format [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("shortest-path", "translate",
                               "components", "add-family"),
                      help="methods to apply [default=%default].")

    parser.add_option("-a", "--filename-map", dest="filename_map", type="string",
                      help="filename mapping ids to nids (used for translation) [default=%default].")

    parser.add_option("-1", "--node1", dest="node1", type="string",
                      help="first node for path calculation [default=%default].")

    parser.add_option("-2", "--node2", dest="node2", type="string",
                      help="second node for path calculation [default=%default].")

    parser.add_option("-f", "--filename-families", dest="filename_families", type="string",
                      help="filename with domain families [default=%default].")

    parser.set_defaults(
        method=None,
        graph_format="alignments",
        filename_map=None,
        node1=None,
        node2=None,
        filename_families=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    map_domain2family = None
    if options.filename_families is not None:
        E.info("reading families from %s" % options.filename_families)
        map_domain2family = {}
        families_file = open(options.filename_families, "r")
        try:
            for line in families_file:
                # skip comments, the header line and blank lines
                if line.startswith("#") or line.startswith("nid"):
                    continue
                if not line.strip():
                    continue
                # strip only the newline so that a final line without one
                # does not lose the last character of the family name
                nid, start, end, family = line.rstrip("\n").split("\t")
                pid = bytes("%s_%s_%s" % (nid, start, end))
                map_domain2family[pid] = bytes(family)
        finally:
            families_file.close()
        E.info("read %i domains" % len(map_domain2family))

    if options.method == "translate":

        if not options.filename_map:
            raise ValueError("method `translate` requires --filename-map")

        E.info("reading map from %s" % options.filename_map)
        map_file = open(options.filename_map, "r")
        try:
            map_id2nid = AddaIO.readMapId2Nid(map_file)
        finally:
            map_file.close()
        map_nid2id = dict([[v, k] for k, v in map_id2nid.iteritems()])

        def translate_domain(parts):
            '''return the domain id for (nid, start, end) with nid mapped to id.'''
            try:
                return "%s_%s_%s" % (map_nid2id[int(parts[0])], parts[1], parts[2])
            except KeyError:
                sys.stderr.write("could not map: %s\n" % str(parts))
                raise

        def translate_alignments(line):
            '''translate query and sbjct domain of one line; keep the header.'''
            if line.startswith("passed"):
                return line
            data = line.split("\t")
            x = data[1].split("_")
            y = data[2].split("_")
            data[1] = translate_domain(x)
            data[2] = translate_domain(y)
            return "\t".join(data)

        if options.graph_format == "alignments":
            translator = translate_alignments
        else:
            raise ValueError("unknown graph format `%s`" % options.graph_format)

        for line in options.stdin:
            if not line.startswith("#"):
                line = translator(line)
            options.stdout.write(line)

        E.Stop()
        return

    elif options.method == "add-family":

        if map_domain2family is None:
            raise ValueError(
                "method `add-family` requires --filename-families")

        options.stdout.write("%s\tqfamily\tsfamily\n" %
                             ("\t".join(AddaIO.TestedLink._fields)))
        for link in AddaIO.iterate_tested_links(options.stdin):
            qfamily = map_domain2family.get(link.qdomain, "na")
            sfamily = map_domain2family.get(link.sdomain, "na")
            options.stdout.write("%s\t%s\t%s\n" %
                                 ("\t".join(map(str, link)), qfamily, sfamily))
        E.Stop()
        return

    t = time.time()
    if options.graph_format == "alignments":
        map_vertex2id, map_id2vertex, G = readAlignmentGraph(options.stdin)

    E.info("graph read in %i seconds" % (time.time() - t))
    t = time.time()

    if options.method == "shortest-path":
        source = map_vertex2id[options.node1]
        target = map_vertex2id[options.node2]

        E.debug("shortest path between %s:%i and %s:%i" %
                (options.node1, source, options.node2, target))

        paths = G.get_shortest_paths(source, to=(target,))
        # a single target was requested, so the result holds one path;
        # it must not be indexed by the target's vertex id
        p = paths[0]

        if len(p) == 0:
            E.info("no path between %s:%i and %s:%i" %
                   (options.node1, source, options.node2, target))
        else:
            l, last_node = p[0], map_id2vertex[p[0]]
            for x in p[1:]:
                node = map_id2vertex[x]
                ei = G.get_eid(x, l)
                options.stdout.write("%s\t%s\t%s\n" %
                                     (last_node, node, G.es[ei]["info"]))
                l, last_node = x, node

    elif options.method == "components":
        # NOTE(review): this branch hands G to networkx while the
        # shortest-path branch uses igraph-style methods on it - confirm
        # which graph type readAlignmentGraph returns.
        print("component\tnode")
        for component_id, component in enumerate(nx.connected_components(G)):
            for c in component:
                print("%i\t%s" % (component_id, c))

    E.info("%s: %i seconds" % (options.method, time.time() - t))
    E.Stop()
def main():
    '''command line entry point: translate nids in a tab-separated table.

    Reads lines from stdin and replaces the values in the selected columns
    (``--column``, 1-based, default: first column) using the map read from
    ``--nids``. With ``--is-domains`` the columns hold domain ids, of which
    only the leading nid is translated. ``--invert`` swaps keys and values
    of the map before use.

    Comment lines (starting with ``#``) and, unless ``--no-header`` is
    given, the first remaining line are copied unchanged. Lines with a
    value that can not be parsed or mapped are reported with a warning,
    counted as skipped and not written.
    '''
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-n", "--nids", dest="filename_nids", type="string",
                      help="filename with nids[default=%default].")

    parser.add_option("-c", "--column", dest="columns", type="int", action="append",
                      help="columns with nids to translate (1-based) [default=%default].")

    parser.add_option("-d", "--is-domains", dest="is_domains", action="store_true",
                      help="translate domain ids [default=%default].")

    parser.add_option("-i", "--invert", dest="invert", action="store_true",
                      help="invert mapping [default=%default].")

    parser.add_option("-e", "--no-header", dest="no_header", action="store_true",
                      help="file has no header [default=%default].")

    # the default key has to match dest="no_header" of the option above
    parser.set_defaults(
        filename_nids="adda.nids",
        columns=[],
        is_domains=False,
        invert=False,
        no_header=False,
    )

    (options, args) = E.Start(parser)

    nids_file = open(options.filename_nids, "r")
    try:
        map_nid2pid = AddaIO.readMapPid2Nid(nids_file)
    finally:
        nids_file.close()

    if options.invert:
        E.info("inverting mapping")
        map_nid2pid = dict(
            [(int(x[1]), str(x[0])) for x in map_nid2pid.iteritems()])

    if len(options.columns) == 0:
        options.columns = [1]
    columns = [x - 1 for x in options.columns]

    toTuple, toDomain = AddaIO.toTuple, AddaIO.toDomain
    first = not options.no_header
    is_domains = options.is_domains

    ninput, noutput, nskipped = 0, 0, 0

    for line in options.stdin:
        if line.startswith("#"):
            options.stdout.write(line)
            continue

        if first:
            options.stdout.write(line)
            first = False
            continue

        ninput += 1

        data = line.rstrip("\n").split("\t")
        for x in columns:
            # fetch the value once so that the warnings below can not fail
            # themselves on a line with too few columns
            try:
                value = data[x]
            except IndexError:
                E.warn("line has no column %i: `%s`" %
                       (x + 1, line.rstrip("\n")))
                nskipped += 1
                break

            if is_domains:
                try:
                    d = toTuple(value)
                except ValueError:
                    E.warn("could not parse domain `%s`" % value)
                    nskipped += 1
                    break

                try:
                    data[x] = toDomain((str(map_nid2pid[d[0]]), d[1], d[2]))
                except (IndexError, KeyError):
                    E.warn("could not map domain `%s`" % value)
                    nskipped += 1
                    break
            else:
                # an unknown nid raises KeyError, a non-numeric one ValueError
                try:
                    data[x] = str(map_nid2pid[int(value)])
                except (KeyError, ValueError):
                    E.warn("could not map nid `%s`" % value)
                    nskipped += 1
                    break
        else:
            # only reached if every column was translated
            options.stdout.write("%s\n" % "\t".join(data))
            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))
    E.Stop()
def annotateAlignmentGraph(infile, outfiles):
    '''input the alignment graph and output a translated version of it and
    adding reference domain information.

    infile
        tested links, read with ``AddaIO.iterate_tested_links``; opened
        with gzip if the name ends in ``.gz``.
    outfiles
        pair ``(outfile, outfile_stats)``. ``outfile`` is written
        gzip-compressed and receives every link with its nids replaced by
        pids plus the overlapping benchmark families and domains.
        ``outfile_stats`` receives cumulative tp/fp/fn/tn/unknown counts,
        one row each time the rounded link weight changes and one final row.

    Rates whose denominator is zero are reported as 0.0 instead of raising
    ZeroDivisionError. If the input contains no links, no statistics row
    is written.
    '''
    outfile, outfile_stats = outfiles

    # collect benchmark domains
    E.info("reading benchmark domains")
    benchmark_file = gzip.open(PARAMS["eval_filename_benchmark_domains"])
    try:
        benchmark_domains = AddaIO.readMapNid2Domains(benchmark_file)
    finally:
        benchmark_file.close()

    totuple = AddaIO.toTuple
    toDomain = AddaIO.toDomain

    # build map of id to nid
    E.info("reading map between pid and nid")
    nids_file = open(PARAMS["eval_filename_adda_nids"], "r")
    try:
        map_nid2pid = AddaIO.readMapPid2Nid(nids_file)
    finally:
        nids_file.close()

    def getOverlappingDomains(pid, start, end):
        '''get domains overlapping pid:start..end'''
        if pid not in benchmark_domains:
            return ()
        # greedy overlap testing
        r = []
        for family, domains in benchmark_domains[pid].iteritems():
            for other_start, other_end in domains:
                if start >= other_end or end <= other_start:
                    continue
                r.append((family, other_start, other_end))
        return r

    def _rate(numerator, denominator):
        '''return numerator / denominator or 0.0 for an empty denominator.'''
        if denominator == 0:
            return 0.0
        return float(numerator) / denominator

    def _statsRow(weight, n, tp, fp, fn, tn, uk):
        '''format one line of the statistics table.'''
        fields = (weight, n, tp, fp, fn, tn, uk,
                  _rate(tp, tp + fp), _rate(fn, fn + tn))
        return "\t".join(map(str, fields)) + "\n"

    counts = E.Counter()

    inf = outf = outf_stats = None
    try:
        if infile.endswith(".gz"):
            inf = gzip.open(infile)
        else:
            inf = open(infile)

        outf = gzip.open(outfile, "w")
        outf.write("%s\n" % "\t".join(
            ("passed", "qdomain", "sdomain", "weight",
             "qstart", "qend", "qali",
             "sstart", "send", "sali",
             "score", "naligned", "ngaps", "zscore",
             "rfamilies", "sfamilies", "rdomains", "sdomains")))

        # counts for true positives, false positives and unknown
        n, tp, fp, fn, tn, uk = 0, 0, 0, 0, 0, 0

        outf_stats = open(outfile_stats, "w")
        # NOTE(review): the header names a `proportion` column, but rows
        # carry no value for it, so rows are one field shorter - confirm
        # the intended layout before changing either side.
        outf_stats.write("weight\tn\tproportion\ttp\tfp\tfn\ttn\tuk\ttpr\tfnr\n")

        last_weight = None

        for link in AddaIO.iterate_tested_links(inf):
            qnid, qstart, qend = totuple(link.qdomain)
            snid, sstart, send = totuple(link.sdomain)
            qpid = map_nid2pid[qnid]
            spid = map_nid2pid[snid]
            qfamily = sorted(getOverlappingDomains(qpid, qstart, qend))
            sfamily = sorted(getOverlappingDomains(spid, sstart, send))
            qf = set([x[0] for x in qfamily])
            sf = set([x[0] for x in sfamily])

            passed = link.passed == "+"
            n += 1

            if not qfamily and not sfamily:
                uk += 1
            elif qf.intersection(sf):
                # both domains share a benchmark family
                if passed:
                    tp += 1
                else:
                    fn += 1
            else:
                if passed:
                    fp += 1
                else:
                    tn += 1

            weight = round(float(link.weight))
            if weight != last_weight:
                # NOTE(review): the current link has already been counted,
                # so the row for the previous weight includes it.
                if last_weight is not None:
                    outf_stats.write(
                        _statsRow(last_weight, n, tp, fp, fn, tn, uk))
                last_weight = weight

            if passed:
                counts.passed += 1
            else:
                counts.failed += 1

            link = link._replace(qdomain=toDomain((qpid, qstart, qend)),
                                 sdomain=toDomain((spid, sstart, send)))

            outf.write("%s\t%s\t%s\t%s\t%s\n" % (
                "\t".join(map(str, link)),
                ",".join(sorted(qf)),
                ",".join(sorted(sf)),
                ",".join("%s_%i_%i" % x for x in qfamily),
                ",".join("%s_%i_%i" % x for x in sfamily)))

        # final row; nothing to report if there was no link at all
        if last_weight is not None:
            outf_stats.write(_statsRow(last_weight, n, tp, fp, fn, tn, uk))
    finally:
        # the gzip stream in particular must be closed to be complete
        for handle in (inf, outf, outf_stats):
            if handle is not None:
                handle.close()

    E.info("%s" % str(counts))
def main():
    '''command line entry point: inspect neighbours, malis and profiles.

    The action is selected with ``--method``:

    view
        print the neighbours of every nid given on the command line.
    pileup
        write the multiple alignment built from the neighbours of the
        first argument.
    profile
        write the profile built from that multiple alignment. If the
        argument contains ``_`` it is parsed with ``AddaIO.toTuple`` and
        the profile is restricted to the resulting segment.
    align
        align the profiles of the two domains given as arguments
        (``--mode``, ``--gop``, ``--gep`` configure the aligner) and write
        summary statistics followed by the explicit alignment.

    Raises ValueError if fewer positional arguments are given than the
    selected method needs.
    '''
    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option(
        "--method", dest="method", type="choice",
        choices=("view", "align", "pileup", "profile"),
        help="method to perform [default=%default].")
    parser.add_option(
        "--mode", dest="mode", type="choice",
        choices=("global", "local"),
        help="alignment mode [default=%default].")
    parser.add_option(
        "--gop", dest="gop", type="float",
        help="gap opening penalty [default=%default].")
    parser.add_option(
        "--gep", dest="gep", type="float",
        help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    # fail early with a readable message instead of an IndexError on args[n]
    required = {"pileup": 1, "profile": 1, "align": 2}.get(options.method, 0)
    if len(args) < required:
        raise ValueError(
            "method `%s` requires %i argument(s), got %i"
            % (options.method, required, len(args)))

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(
        alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parseTarget(arg):
        '''return (nid, start, end); start and end are None for a bare nid.'''
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    def _buildProfile(nid, start, end):
        '''build and prepare the profile of nid, optionally cut to a segment.'''
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        return prof

    if options.method == "view":
        # convert the loop variable - previously args[0] was converted on
        # every iteration, so only the first nid was ever shown
        for arg in args:
            nid = int(arg)
            for n in index.getNeighbours(nid):
                print(str(n))

    elif options.method == "pileup":
        # the segment coordinates are parsed but not applied to the mali
        nid, start, end = _parseTarget(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        nid, start, end = _parseTarget(args[0])
        prof = _buildProfile(nid, start, end)
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        prof1 = _buildProfile(nid1, start1, end1)
        seq1 = alignlib.makeSequence(fasta.getSequence(nid1))
        prof2 = _buildProfile(nid2, start2, end2)
        seq2 = alignlib.makeSequence(fasta.getSequence(nid2))

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n"
            % (
                nid1,
                nid2,
                result.getScore(),
                result.getLength(),
                result.getNumGaps(),
                result.getRowFrom(),
                result.getRowTo(),
                result.getColFrom(),
                result.getColTo(),
            )
        )

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()