def main():

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--no-swissprot-version", dest="no_swissprot_version",
                      action="store_true",
                      help="remove swissprot version information [%default]")

    parser.add_option("--no-pfam-version", dest="no_pfam_version",
                      action="store_true",
                      help="remove pfam version information [%default]")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="add prefix to id [%default]")

    parser.set_defaults(no_swissprot_version=False,
                        no_pfam_version=False,
                        prefix="")

    (options, args) = E.Start(parser)

    rx_head = re.compile(r">(\S+)\s+\S+\| (\S+) (\d+) a.a.")
    rx_domain = re.compile(r"(\S+) .* (PF\d+\.\d+) (.*) (.*)")

    options.stdout.write("nid\tstart\tend\tfamily\n")

    ninput, noutput, ndomains, nskipped = 0, 0, 0, 0

    for record in record_iterator(sys.stdin):
        ninput += 1
        try:
            id, acc, length = rx_head.match(record[0]).groups()
        except AttributeError, msg:
            E.warn("parsing error in line `%s`" % record[0])
            nskipped += 1
            continue

        if options.no_swissprot_version:
            acc = acc.split(".")[0]

        for line in record[1:]:
            # skip Pfam-B entries
            if line.startswith("Pfam-B"):
                continue

            name, family, description, coordinates = rx_domain.match(line).groups()

            if options.no_pfam_version:
                # strip the version suffix from the family accession
                family = family.split(".")[0]

            for c in coordinates.split(" "):
                start, end = [int(x) for x in c.split("-")]
                # convert to 0-based coordinates
                start -= 1
                options.stdout.write(
                    options.prefix +
                    "\t".join(map(str, (acc, start, end, family))) + "\n")
                ndomains += 1

        noutput += 1

    E.info("ninput=%i, noutput=%i, ndomains=%i, nskipped=%i" %
           (ninput, noutput, ndomains, nskipped))
    E.Stop()
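# A minimal usage sketch (assumptions: the input is swisspfam-style text as
# implied by rx_head/rx_domain above; the script name is hypothetical; USAGE,
# E and record_iterator are module-level helpers of the original file):
#
#   >ZY11B_HUMAN  |================|  Q9C0D3.1  744 a.a.
#   Zyg-11_interact  1 copy  PF04192.9  Zyg-11 interacting protein  34-331
#
#   zcat swisspfam.gz | python swisspfam2table.py --no-swissprot-version > domains.tsv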
def main(argv=sys.argv):

    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-o", "--format", dest="graph_format", type="choice",
                      choices=("alignments",),
                      help="graph format [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("shortest-path", "translate",
                               "components", "add-family"),
                      help="methods to apply [default=%default].")

    parser.add_option("-a", "--filename-map", dest="filename_map", type="string",
                      help="filename mapping ids to nids (used for translation) [default=%default].")

    parser.add_option("-1", "--node1", dest="node1", type="string",
                      help="first node for path calculation [default=%default].")

    parser.add_option("-2", "--node2", dest="node2", type="string",
                      help="second node for path calculation [default=%default].")

    parser.add_option("-f", "--filename-families", dest="filename_families", type="string",
                      help="filename with domain families [default=%default].")

    parser.set_defaults(
        method=None,
        graph_format="alignments",
        filename_map=None,
        node1=None,
        node2=None,
        filename_families=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    map_domain2family = {}
    if options.filename_families is not None:
        E.info("reading families from %s" % options.filename_families)
        for line in open(options.filename_families, "r"):
            if line[0] == "#":
                continue
            if line.startswith("nid"):
                continue
            nid, start, end, family = line[:-1].split("\t")
            # store as plain byte strings to save memory
            pid = bytes("%s_%s_%s" % (nid, start, end))
            map_domain2family[pid] = bytes(family)
        E.info("read %i domains" % len(map_domain2family))

    if options.method == "translate":

        if options.filename_map:
            E.info("reading map from %s" % options.filename_map)
            map_id2nid = AddaIO.readMapId2Nid(open(options.filename_map, "r"))
            map_nid2id = dict([[v, k] for k, v in map_id2nid.iteritems()])

            def translate_alignments(line):
                if line.startswith("passed"):
                    return line
                data = line.split("\t")
                x = data[1].split("_")
                y = data[2].split("_")
                try:
                    data[1] = "%s_%s_%s" % (map_nid2id[int(x[0])], x[1], x[2])
                except KeyError:
                    sys.stderr.write("could not map: %s\n" % str(x))
                    raise
                try:
                    data[2] = "%s_%s_%s" % (map_nid2id[int(y[0])], y[1], y[2])
                except KeyError:
                    sys.stderr.write("could not map: %s\n" % str(y))
                    raise
                return "\t".join(data)

            if options.graph_format == "alignments":
                translator = translate_alignments

            for line in options.stdin:
                if not line.startswith("#"):
                    line = translator(line)
                options.stdout.write(line)

        E.Stop()
        return

    elif options.method == "add-family":

        options.stdout.write("%s\tqfamily\tsfamily\n" %
                             ("\t".join(AddaIO.TestedLink._fields)))
        for link in AddaIO.iterate_tested_links(options.stdin):
            qfamily = map_domain2family.get(link.qdomain, "na")
            sfamily = map_domain2family.get(link.sdomain, "na")
            options.stdout.write("%s\t%s\t%s\n" %
                                 ("\t".join(map(str, link)), qfamily, sfamily))
        E.Stop()
        return

    t = time.time()

    if options.graph_format == "alignments":
        map_vertex2id, map_id2vertex, G = readAlignmentGraph(options.stdin)

    E.info("graph read in %i seconds" % (time.time() - t))
    t = time.time()

    if options.method == "shortest-path":

        E.debug("shortest path between %s:%i and %s:%i" %
                (options.node1, map_vertex2id[options.node1],
                 options.node2, map_vertex2id[options.node2]))

        paths = G.get_shortest_paths(map_vertex2id[options.node1],
                                     to=(map_vertex2id[options.node2],))

        # get_shortest_paths returns one path per vertex in ``to``
        p = paths[0]

        if len(p) == 0:
            E.info("no path between %s:%i and %s:%i" %
                   (options.node1, map_vertex2id[options.node1],
                    options.node2, map_vertex2id[options.node2]))
            E.Stop()
            return

        l, last_node = p[0], map_id2vertex[p[0]]
        for x in p[1:]:
            node = map_id2vertex[x]
            ei = G.get_eid(x, l)
            options.stdout.write("%s\t%s\t%s\n" %
                                 (last_node, node, G.es[ei]["info"]))
            l, last_node = x, node

    elif options.method == "components":

        options.stdout.write("component\tnode\n")
        # G is an igraph Graph - clusters() enumerates its connected components
        for id, component in enumerate(G.clusters()):
            for c in component:
                options.stdout.write("%i\t%s\n" % (id, map_id2vertex[c]))

    E.info("%s: %i seconds" % (options.method, time.time() - t))
    E.Stop()
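# Example invocations (a sketch; the script and file names are placeholders,
# the option names come from the parser above):
#
#   # shortest path between two domain nodes in an alignment graph
#   python graph_tool.py --method=shortest-path -1 10_1_100 -2 20_5_80 < adda.align
#
#   # translate numeric nids in an alignment graph back to sequence ids
#   python graph_tool.py --method=translate --filename-map=adda.nids < adda.align
#
#   # annotate tested links with their domain families
#   python graph_tool.py --method=add-family -f adda.families < links.tsv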
def main():

    global L

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--config", dest="filename_config", type="string",
                      help="configuration file [default=%default].")

    parser.add_option("--force", dest="force", action="store_true",
                      help="overwrite existing files [default=%default].")

    parser.add_option("--continue", dest="append", action="store_true",
                      help="continue from an aborted run and append to existing files [default=%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="run a test with first # sequences [default=%default]")

    parser.add_option("--num-jobs", dest="num_jobs", type="int",
                      help="use # processes. If not set, the number of CPUs/cores is taken [default=%default]")

    parser.add_option("--chunks", dest="chunks", type="string",
                      help="work on one or more chunks only. Provide a comma-separated list. [default=%default]")

    parser.add_option("--command", dest="command", type="choice",
                      choices=("sequences", "blast", "fit", "graph",
                               "index", "check-index", "profiles",
                               "segment", "optimise", "convert",
                               "mst", "mst-components", "align",
                               "cluster", "realign", "families",
                               "stats", "summary"),
                      help="perform a command [default=%default]")

    parser.add_option("--start-at", dest="start_at", type="string",
                      help="start at sequence [default=%default]")

    parser.add_option("--stop-at", dest="stop_at", type="string",
                      help="stop at sequence [default=%default]")

    parser.set_defaults(
        filename_config="adda.ini",
        command=None,
        start_at=None,
        stop_at=None,
        force=False,
        append=False,
        test=None,
        num_jobs=None,
        chunks="all",
    )

    (options, args) = E.Start(parser)

    # setup logging
    if options.loglevel == 0:
        lvl = logging.ERROR
    elif options.loglevel == 1:
        lvl = logging.INFO
    else:
        lvl = logging.DEBUG

    logQueue = multiprocessing.Queue(100)
    handler = Logger.MultiProcessingLogHandler(
        logging.FileHandler("adda.log", "a"), logQueue)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s pid=%(process)-8d %(name)-12s %(levelname)-8s %(message)s',
            datefmt='%m-%d %H:%M'))
    logging.getLogger('adda').addHandler(handler)
    logging.getLogger('adda').setLevel(lvl)
    E.setLogger(logging.getLogger("adda"))
    L = logging.getLogger("adda")

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    if len(args) == 0:
        if not options.command:
            raise ValueError("specify at least one command")
    elif len(args) == 1:
        options.command = args[0]
    else:
        raise ValueError("one command line argument is sufficient.")

    ## collect modules and initialise them
    map_module = {
        'fit': AddaFit.AddaFit,
        'segment': AddaSegment.AddaSegment,
        'blast': AddaBlast.AddaBlast,
        'graph': AddaGraph.AddaGraph,
        'stats': AddaStats.AddaStats,
        'profiles': AddaProfiles.AddaProfiles,
        'realign': AddaAlign.AddaRealign,
        'index': AddaIndex.AddaIndexBuild,
        'check-index': AddaIndex.AddaIndexCheck,
        'optimise': AddaOptimise.AddaOptimise,
        'sequences': AddaSequences.AddaSequences,
        'convert': AddaConvert.AddaConvert,
        'mst': AddaMst.AddaMst,
        'mst-components': AddaComponentsMst.AddaComponentsMst,
        'align': AddaAlign.AddaAlign,
        'cluster': AddaCluster.AddaCluster,
        'families': AddaFamilies.AddaFamilies,
        'summary': AddaSummary.AddaSummary,
    }

    try:
        fasta = IndexedFasta.IndexedFasta(
            config.get("files", "output_fasta", "adda"))
    except KeyError:
        fasta = None

    if options.num_jobs == 1:
        run_parallel = runSequentially
    else:
        run_parallel = runParallel

    kwargs = {
        "loglevel": options.loglevel,
        "append": options.append,
        "force": options.force,
    }

    if options.command == "index":
        module = map_module[options.command](config, fasta=fasta, **kwargs)

        if module.isComplete():
            E.info("output of command `%s` present and complete" % options.command)
        else:
            filename_graph = config.get("files", "input_graph",
                                        "pairsdb_40x40.links.gz")

            if "," in filename_graph:
                # permit parallel processing of multiple files
                filename_graph = filename_graph.split(",")

                run_parallel(
                    run_on_files,
                    filename=filename_graph,
                    options=options,
                    module=map_module[options.command],
                    config=config,
                    kwargs=kwargs,
                )

                nchunks = len(filename_graph)
                module = map_module[options.command](config,
                                                     chunk=0,
                                                     num_chunks=nchunks,
                                                     **kwargs)
                if not module.isComplete():
                    L.info("merging")
                    if not module.merge():
                        raise ValueError("error while merging for `%s`" % options.command)
            else:
                # process single file - no hassle
                module.startUp()
                module.run()
                module.finish()

    elif options.command in ("sequences", "stats", "optimise",
                             "convert", "mst", "mst-components",
                             "cluster", "families", "summary"):
        module = map_module[options.command](config, fasta=fasta, **kwargs)

        if module.isComplete():
            E.info("output of command `%s` present and complete" % options.command)
        else:
            module.startUp()
            module.run()
            module.finish()

    elif options.command in ("fit", "segment"):
        run_on_graph = RunOnGraph(config, options.command)

        run_parallel(run_on_graph,
                     filename=config.get("files", "input_graph", "adda.graph"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        if not merge(options,
                     module=map_module[options.command],
                     config=config,
                     fasta=fasta):
            E.Stop()
            return

    elif options.command in ("align",):
        run_parallel(run_on_file,
                     filename=config.get("files", "output_mst", "adda.mst"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        merge(options,
              module=map_module[options.command],
              config=config,
              fasta=fasta)

    elif options.command in ("realign",):
        run_parallel(run_on_file,
                     filename=config.get("files", "output_align", "adda.align"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        merge(options,
              module=map_module[options.command],
              config=config,
              fasta=fasta)

    E.Stop()
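# Sketch of a full run (the stage order is inferred from the command choices
# above, not prescribed by this file; the parser accepts exactly one command
# per invocation):
#
#   python adda.py sequences
#   python adda.py index
#   python adda.py --num-jobs=4 fit
#   python adda.py --num-jobs=4 segment
#   python adda.py optimise
#   python adda.py convert
#   python adda.py mst
#   python adda.py --num-jobs=4 align
#   python adda.py cluster
#   python adda.py families
#   python adda.py summary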
dest="format", type="choice", choices=("graph", "nodelist"), help="input format [default=%default]") parser.set_defaults( multi_labels=None, legend=None, label1=3, label2=4, attributes=[], format="graph", label="info1", ) (options, args) = E.Start(parser) take = (0, 1, options.label1 - 1, options.label2 - 1) if len(options.attributes) == 0: raise ValueError("please provide at least one attribute") options.stdout.write("node\t%s\t%s\n" % ("\t".join(options.attributes), options.label)) # build attributes attributes, default = [], [] for attribute in options.attributes: if attribute in ("colour", "color"): attributes.append(colors) default.append("white")
def main():

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--dialect", dest="dialect", type="string",
                      help="csv dialect to use [default=%default].")

    parser.add_option("-m", "--map", dest="map", type="string", action="append",
                      help="explicit mapping function for columns. "
                           "The format is column:type (e.g.: length:int) [default=%default].")

    parser.add_option("-t", "--table", dest="tablename", type="string",
                      help="table name for all backends [default=%default].")

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="database name for sqlite3 [default=%default].")

    parser.add_option("-l", "--lowercase", dest="lowercase", action="store_true",
                      help="force lower case column names [default=%default].")

    parser.add_option("-u", "--ignore-duplicates", dest="ignore_duplicates",
                      action="store_true",
                      help="ignore columns with duplicate names [default=%default].")

    parser.add_option("-s", "--ignore-same", dest="ignore_same",
                      action="store_true",
                      help="ignore columns with identical values [default=%default].")

    parser.add_option("-e", "--ignore-empty", dest="ignore_empty",
                      action="store_true",
                      help="ignore columns which are all empty [default=%default].")

    parser.add_option("-q", "--quick", dest="insert_quick", action="store_true",
                      help="try quick file based import - needs to be supported by the backend [default=%default].")

    parser.add_option("-b", "--backend", dest="backend", type="choice",
                      choices=("pg", "sqlite", "mysql"),
                      help="database backend to choose [default=%default].")

    parser.add_option("-i", "--index", dest="indices", type="string",
                      action="append",
                      help="create an index for the named column [default=%default].")

    parser.add_option("-a", "--allow-empty", dest="allow_empty",
                      action="store_true",
                      help="allow empty table [default=%default].")

    parser.add_option("--force-single", dest="force_single", action="store_true",
                      help="force upload line by line [default=%default].")

    parser.set_defaults(
        map=[],
        dialect="excel-tab",
        database="csvdb",
        lowercase=False,
        tablename="csv",
        from_file=False,
        ignore_duplicates=False,
        ignore_same=False,
        ignore_empty=False,
        insert_many=False,
        force_single=False,
        guess_size=1000,
        report_step=10000,
        backend="pg",
        indices=[],
        missing_values=("na", "NA",),
        insert_quick=False,
        allow_empty=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_mysql_options=True)

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.backend)

    if options.map:
        m = {}
        for x in options.map:
            f, t = x.split(":")
            m[f] = t
        options.map = m
    else:
        options.map = {}

    index_mangle = str

    if options.backend == "pg":
        import pgdb
        dbhandle = pgdb.connect(options.psql_connection)
        error = pgdb.DatabaseError
        options.null = "NULL"
        options.string_value = "'%s'"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database)
        error = sqlite3.OperationalError
        options.insert_many = not options.force_single
        options.null = None  # "NULL"
        options.string_value = "%s"  # "'%s'"

    elif options.backend == "mysql":
        import MySQLdb, _mysql
        error = (_mysql.OperationalError, _mysql.ProgrammingError)
        if options.port:
            dbhandle = MySQLdb.connect(host=options.host,
                                       user=options.user,
                                       passwd=options.password,
                                       db=options.database,
                                       port=options.port)
        else:
            dbhandle = MySQLdb.connect(host=options.host,
                                       user=options.user,
                                       passwd=options.password,
                                       db=options.database,
                                       unix_socket=options.socket)
        # "not options.force_single" fails with an error
        options.insert_many = False
        options.null = "NULL"
        options.string_value = "'%s'"
        index_mangle = lambda x: re.sub("[.]", "_", x)

    reader = CSV.DictReader(sys.stdin, dialect=options.dialect)

    rows = []
    for row in reader:
        try:
            rows.append(CSV.ConvertDictionary(row, map=options.map))
        except TypeError, msg:
            E.warn("incomplete line? Type error in conversion: '%s' with data: %s" %
                   (msg, str(row)))
        if len(rows) >= options.guess_size:
            break
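# Example: load a tab-separated file into an sqlite database and index one
# column (script name assumed; the options are those defined above):
#
#   cat data.tsv | python csv2db.py --backend=sqlite --database=csvdb \
#       --table=mytable --index=gene_id
#
# The loop above buffers the first guess_size rows (default 1000) so that
# column types can be guessed before the table is created.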
def main():

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method", dest="method", type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(
        alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    if options.method == "view":
        for nid in args:
            nid = int(nid)
            neighbours = index.getNeighbours(nid)
            for n in neighbours:
                print str(n)

    elif options.method == "pileup":
        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighbours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        align = AddaProfiles.AddaProfiles(config, fasta=fasta)

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        def _buildProfile(nid, start, end):
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighbours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))
        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %
            (nid1, nid2, result.getScore(), result.getLength(),
             result.getNumGaps(), result.getRowFrom(), result.getRowTo(),
             result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
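# Example invocations (script name assumed; domains are given as
# nid_start_end tokens, as parsed by AddaIO.toTuple):
#
#   python adda_view.py --method=view 123             # list neighbours of nid 123
#   python adda_view.py --method=pileup 123           # multiple alignment of neighbours
#   python adda_view.py --method=profile 123_1_100    # profile restricted to a segment
#   python adda_view.py --method=align 123_1_100 456_20_180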
def main(argv=sys.argv):

    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-D", "--database", dest="database", type="string",
                      help="database to use [default=%default].")

    parser.add_option("-t", "--trees", dest="table_name_trees", type="string",
                      help="tablename with trees [default=%default].")

    parser.add_option("-r", "--parts", dest="table_name_parts", type="string",
                      help="tablename with parts [default=%default].")

    parser.add_option("-b", "--bench", dest="table_name_bench", type="string",
                      help="domain table to be benchmarked (for example: nrdb40_domo_domains_nr) [default=%default].")

    parser.add_option("-f", "--reference", dest="table_name_reference", type="string",
                      help="reference domain table (for example: nrdb40_scop_domains_nr) [default=%default].")

    parser.add_option("--bin-size", dest="bin_size", type="int",
                      help="bin size [default=%default].")

    parser.add_option("-o", "--resolution", dest="resolution", type="float",
                      help="resolution for scaling of domains [default=%default].")

    parser.add_option("-s", "--switch", dest="switch", action="store_true",
                      help="switch between coverage of reference and size ratio if coverage is 1 [default=%default].")

    parser.add_option("-k", "--skip-repeats", dest="skip_repeats", action="store_true",
                      help="skip repeat domains [default=%default].")

    parser.add_option("-m", "--skip-tms", dest="skip_tms", action="store_true",
                      help="discard domains which contain transmembrane regions [default=%default].")

    parser.add_option("-e", "--check-selection", dest="check_selection", action="store_true",
                      help="check whether the selection took a domain lower or further up in the tree [default=%default].")

    parser.add_option("-q", "--quality", dest="quality", action="store_true",
                      help="take only sequences which are curated [default=%default].")

    parser.add_option("--no-full-length", dest="no_full_length", action="store_true",
                      help="exclude full-length domains [default=%default].")

    parser.add_option("--only-full-length", dest="only_full_length", action="store_true",
                      help="use only full-length domains [default=%default].")

    parser.add_option("--check-if-comparable", dest="check_if_comparable", action="store_true",
                      help="perform comparable check according to Islam95 (default level 85%) [default=%default].")

    parser.add_option("--subset", dest="subset", type="string",
                      help="use only a subset of nids [default=%default]")

    parser.set_defaults(
        database="pairsdb",
        table_name_reference=None,
        table_name_trees=None,
        table_name_parts=None,
        table_name_bench=None,
        resolution=None,
        loglevel=1,
        min_overlap=1,
        switch=0,
        combine_repeats=1,
        skip_repeats=0,
        skip_tms=0,
        discard_full_length=0,
        check_selection=0,
        selection_threshold=0.9,
        quality=None,
        no_full_length=None,
        only_full_length=None,
        ## a full-length domain should cover at least 90% of a sequence
        min_length_ratio=0.9,
        check_comparable=None,
        check_comparable_level=0.85,
        bin_size=1,
        subset=None)

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    dbhandle = Pairsdb()
    dbhandle.Connect(dbname=options.database)

    tbl_reference = TableDomains(dbhandle, "generic")
    tbl_reference.SetName(options.table_name_reference)

    # tbl_masks = Table_nrdb90_masks(dbhandle)
    tbl_nrdb = Table_nrdb(dbhandle)
    # todo: encapsulate this with a parameter
    tbl_nrdb.name = "nrdb40"

    if options.table_name_trees:

        nids_statement = '''SELECT DISTINCT t.nid
                            FROM %s AS t, %s AS s %%s
                            WHERE t.nid = s.nid %%s''' % \
            (options.table_name_trees, options.table_name_reference)

        if options.quality:
            nids_statement = nids_statement % (
                ", nrdb_quality AS q",
                "AND q.nid = s.nid AND q.is_curated = 'T'")
        else:
            nids_statement = nids_statement % ("", "")

        statement = """
        SELECT t.node, t.parent, t.level, t.start, t.end,
               ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
                (GREATEST(t.end, %(end)i) - LEAST(t.start, %(start)i))) AS ovl,
               ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
                (t.end - t.start)) AS cov_dom,
               ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
                (%(end)i - %(start)i)) AS cov_ref,
               ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
          AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """

        tablename = options.table_name_trees

    elif options.table_name_parts or options.table_name_bench:

        if options.table_name_parts:
            table_name = options.table_name_parts
        else:
            table_name = options.table_name_bench

        if options.subset:
            nids_statement = '''SELECT DISTINCT s.nid
                                FROM %s AS s, %s AS t
                                WHERE t.nid = s.nid''' % (options.subset, table_name)
        else:
            nids_statement = '''SELECT DISTINCT s.nid
                                FROM %s AS s, %s AS r %%s
                                WHERE r.nid = s.nid %%s''' % \
                (table_name, options.table_name_reference)

            if options.quality:
                nids_statement = nids_statement % (
                    ", nrdb_quality AS q",
                    "AND q.nid = s.nid AND q.is_curated = 'T'")
            else:
                nids_statement = nids_statement % ("", "")

        statement = """
        SELECT 1, 0, 0, t.start, t.end,
               ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
                (GREATEST(t.end, %(end)i) - LEAST(t.start, %(start)i))) AS ovl,
               ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
                (t.end - t.start)) AS cov_dom,
               ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
                (%(end)i - %(start)i)) AS cov_ref,
               ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
          AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """

        tablename = table_name

    else:
        print "what shall I compare?"
        sys.exit(1)

    if options.check_selection:
        selection_statement = """
        SELECT t.domain_from, t.domain_to,
               ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
                (GREATEST(t.domain_to, %(end)i) - LEAST(t.domain_from, %(start)i))) AS ovl,
               ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
                (t.domain_to - t.domain_from)) AS cov_dom,
               ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
                (%(end)i - %(start)i)) AS cov_ref,
               ((t.domain_to - t.domain_from) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(selection_tablename)s AS t
        WHERE t.domain_nid = %(nid)i
          AND (LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """

        selection_tablename = options.table_name_parts
        options.table_name_parts = None

        parts_same_as_trees, parts_larger_than_trees, \
            parts_smaller_than_trees, parts_much_smaller_than_trees = 0, 0, 0, 0

    min_overlap = options.min_overlap

    nids = map(lambda x: x[0], dbhandle.Execute(nids_statement).fetchall())

    overlaps = []
    cov_doms = []
    cov_refs = []

    touched = {}

    options.stdout.write(
        "NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\t"
        "OVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n")

    E.info("--> processing %i nids" % len(nids))

    nskipped_no_assignments = 0
    nskipped_no_overlap = 0
    nskipped_wrong_domaintype = 0
    nfound = 0

    it = 0
    for nid in nids:

        it += 1
        E.debug("--> processing %i" % nid)

        domains = tbl_reference.GetDomainBoundariesForNid(nid)
        length = tbl_nrdb.GetLength(nid)

        if not domains:
            nskipped_no_assignments += 1
            continue

        if options.no_full_length and len(domains) == 1:
            ## check if domain is actually full-length, otherwise keep
            id, domain_from, domain_to = domains[0]
            if float(domain_to - domain_from) / float(length) >= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue

        if options.only_full_length:
            if len(domains) == 1:
                id, domain_from, domain_to = domains[0]
                if float(domain_to - domain_from) / float(length) <= options.min_length_ratio:
                    nskipped_wrong_domaintype += 1
                    continue
            else:
                nskipped_wrong_domaintype += 1
                continue

        nfound += 1

        last_id = None

        x = 0
        # iteration over domains in reference
        while x < len(domains):

            id, domain_from, domain_to = domains[x]

            ##########################################################
            # process repeats: collapse consecutive domains with the
            # same identifier into one region
            is_repeat = -1
            while x < len(domains) and domains[x][0] == id:
                domain_to = domains[x][2]
                x += 1
                is_repeat += 1

            if options.skip_repeats and is_repeat:
                continue

            # if options.skip_tms and tbl_masks.HasMask(nid, 2, domain_from, domain_to):
            #     continue

            ##########################################################
            ## apply resolution
            if options.resolution:
                start = int(float(domain_from - 1) / options.resolution)
                end = int(float(domain_to - 1) / options.resolution) + 1
            else:
                start = domain_from
                end = domain_to

            E.debug("processing domain %s_%i_%i (scaled: %i-%i)" %
                    (id, domain_from, domain_to, start, end))

            ##########################################################
            ## get best matching domain
            s = statement % locals()
            if options.loglevel >= 4:
                print s

            result = dbhandle.Execute(s).fetchone()

            if not result:
                continue

            node, parent, level, start, end, overlap, cov_dom, cov_ref, rat_ref = result

            key = "%i-%s-%i-%i" % (nid, id, start, end)
            if touched.has_key(key):
                continue
            else:
                touched[key] = 1

            # discard full-length domains
            if options.discard_full_length:
                if options.table_name_trees:
                    if node == 0:
                        continue
                else:
                    if length == end - start:
                        continue

            if options.switch and cov_ref == 1.0:
                xcov_ref = rat_ref
            else:
                xcov_ref = cov_ref

            # check, if selection did take a domain lower or further up
            if options.check_selection:

                start = (start * 10) + 1
                end = min(end * 10 + 1, length)

                s = selection_statement % locals()
                result = dbhandle.Execute(s).fetchone()

                if result:
                    parts_from, parts_to, ovl_parts, cov_parts, cov_tree, rat_parts = result

                    if rat_parts > 1.0:
                        parts_larger_than_trees += 1
                        token = ">"
                    elif rat_parts == 1.0:
                        parts_same_as_trees += 1
                        token = "="
                    else:
                        parts_smaller_than_trees += 1
                        token = "<"

                    if rat_parts < options.selection_threshold:
                        parts_much_smaller_than_trees += 1

                    # start/end assumed here - the original referenced
                    # undefined yfrom/yto
                    options.stdout.write(
                        string.join(
                            map(str, (nid, id, domain_from, domain_to, level,
                                      start, end,
                                      parts_from, parts_to,
                                      overlap, cov_dom, cov_ref, rat_ref, xcov_ref,
                                      ovl_parts, cov_parts, cov_tree, rat_parts,
                                      token)), "\t") + "\n")
            else:
                options.stdout.write(
                    string.join(
                        map(str, (nid, node, parent, level, start, end,
                                  id, start, end,
                                  overlap, cov_dom, cov_ref, rat_ref, xcov_ref)),
                        "\t") + "\n")

            overlaps.append(int(overlap * 100))
            cov_doms.append(int(cov_dom * 100))
            cov_refs.append(int(xcov_ref * 100))

    E.info("skipped nids because of no overlap with reference: %i" % nskipped_no_overlap)
    E.info("skipped nids because of no assignments: %i" % nskipped_no_assignments)
    E.info("skipped nids because of wrong domain type: %i" % nskipped_wrong_domaintype)
    E.info("nids in comparison: %i" % nfound)

    if options.check_selection:
        E.info("parts larger than trees=%i" % parts_larger_than_trees)
        E.info("parts like trees=%i" % parts_same_as_trees)
        E.info("parts smaller than trees=%i" % parts_smaller_than_trees)
        E.info("parts much smaller than trees (<%f)=%i" %
               (options.selection_threshold, parts_much_smaller_than_trees))
    else:
        outfile_stats = E.openOutputFile("stats")
        outfile_stats.write("section\t%s\n" % Stats.Summary().getHeader())
        outfile_stats.write("overlaps\t%s\n" % str(Stats.Summary(overlaps)))
        outfile_stats.write("domain_coverage\t%s\n" % str(Stats.Summary(cov_doms)))
        outfile_stats.write("reference_coverage\t%s\n" % str(Stats.Summary(cov_refs)))
        outfile_stats.close()

        outfile = E.openOutputFile("overlaps.histogram")
        outfile.write("bin\tcounts\n")
        Histogram.Write(outfile,
                        Histogram.Calculate(overlaps,
                                            min_value=0,
                                            increment=1,
                                            no_empty_bins=True))
        outfile.close()

        outfile = E.openOutputFile("domain_coverage.histogram")
        outfile.write("bin\tcounts\tfreq\tcumul_counts\tcumul_freq\t"
                      "reverse_counts\treverse_freq\n")
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                            Histogram.Calculate(cov_doms,
                                                min_value=0,
                                                increment=options.bin_size,
                                                no_empty_bins=True)))
        outfile.close()

        outfile = E.openOutputFile("reference_coverage.histogram")
        outfile.write("bin\tcounts\tfreq\tcumul_counts\tcumul_freq\t"
                      "reverse_counts\treverse_freq\n")
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                            Histogram.Calculate(cov_refs,
                                                min_value=0,
                                                increment=options.bin_size,
                                                no_empty_bins=True)))
        outfile.close()

    E.Stop()
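# Example: benchmark a domain table against a SCOP reference (table names
# taken from the option help above; --output-filename-pattern is the output
# option added by E.Start(..., add_output_options=True) and controls where
# the stats and histogram files are written):
#
#   python benchmark_domains.py --bench=nrdb40_domo_domains_nr \
#       --reference=nrdb40_scop_domains_nr \
#       --output-filename-pattern=bench_%s.out > comparison.tsv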