예제 #1
0
            id, acc, len = rx_head.match(record[0]).groups()
        except AttributeError, msg:
            E.warn("parsing error in line `%s`" % record[0])
            nskipped += 1
            continue

        if options.no_swissprot_version: acc = acc.split(".")[0]
        for line in record[1:]:
            # no Pfam-B
            if line.startswith("Pfam-B"): continue
            name, family, description, coordinates = rx_domain.match(
                line).groups()

            for c in coordinates.split(" "):
                start, end = [int(x) for x in c.split("-")]
                start -= 1
                options.stdout.write(
                    options.prefix +
                    "\t".join(map(str, (acc, start, end, family))) + "\n")
                ndomains += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, ndomains=%i, nerrors=%i" %
           (ninput, noutput, ndomains, nskipped))

    E.Stop()


# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
예제 #2
0
def main( argv = sys.argv ):

    parser = optparse.OptionParser( version = "%prog version: $Id$", usage = globals()["__doc__"] )

    parser.add_option( "-o", "--format", dest="graph-format", type="choice",
                       choices=("alignments",),
                       help="graph format [default=%default].")

    parser.add_option( "-m", "--method", dest="method", type="choice",
                       choices=("shortest-path", "translate", "components", "add-family" ),
                       help="methods to apply [default=%default].")

    parser.add_option( "-a", "--filename-map", dest="filename_map", type="string",
                       help="filename mapping ids to nids (used for translation) [default=%default].")

    parser.add_option( "-1", "--node1", dest="node1", type="string",
                       help="first node for path calculation [default=%default].")

    parser.add_option( "-2", "--node2", dest="node2", type="string",
                       help="second node for path calculation [default=%default].")

    parser.add_option( "-f", "--filename-families", dest="filename_families", type="string",
                       help="filename with domain families [default=%default].")



    parser.set_defaults( 
        method = None,
        graph_format = "alignments",
        filename_map = None,
        node1 = None,
        node2 = None,
        filename_families = None,
        )

    (options, args) = E.Start( parser, 
                               argv = argv )
            
    if options.filename_families != None:
        E.info( "reading families from %s" % options.filename_families )
        map_domain2family = {}
        for line in open( options.filename_families, "r"):
            if line[0] == "#": continue
            if line.startswith( "nid"): continue
            nid, start, end, family = line[:-1].split("\t")
            pid = bytes("%s_%s_%s" % (nid,start,end))
            map_domain2family[pid] = bytes(family)
        E.info( "read %i domains" % len(map_domain2family))

    if options.method == "translate":
        
        if options.filename_map:
            E.info("reading map from %s" % options.filename_map)
            map_id2nid = AddaIO.readMapId2Nid( open( options.filename_map, "r") )
            map_nid2id = dict([[v,k] for k,v in map_id2nid.iteritems()])

        def translate_alignments( line ):        
            if line.startswith("passed"): return line
            data = line.split( "\t" )
            
            x = data[1].split("_")
            y = data[2].split("_")
            try:
                data[1] = "%s_%s_%s" % (map_nid2id[int(x[0])],x[1],x[2])
            except KeyError:
                sys.stderr.write("could not map: %s\n" % str(x) )
                raise
            try:
                data[2] = "%s_%s_%s" % (map_nid2id[int(y[0])],y[1],y[2])
            except KeyError:
                sys.stderr.write("could not map: %s\n" % str(y) )
                raise

            return "\t".join(data)

        if options.graph_format == "alignments":
            translator = translate_alignments
            
        for line in options.stdin:
            if not line.startswith("#"): 
                line = translator( line )
            options.stdout.write(line)
            
        E.Stop()
        return

    elif options.method == "add-family":
        options.stdout.write( "%s\tqfamily\tsfamily\n" % ("\t".join( AddaIO.TestedLink._fields)))
        for link in AddaIO.iterate_tested_links( options.stdin ):
            qfamily = map_domain2family.get(link.qdomain,"na")
            sfamily = map_domain2family.get(link.sdomain,"na")
            options.stdout.write( "%s\t%s\t%s\n" % ("\t".join(map(str,link)), 
                                                    qfamily,
                                                    sfamily))
        E.Stop()
        return

    t = time.time()
    if options.graph_format == "alignments":
        map_vertex2id, map_id2vertex, G = readAlignmentGraph( options.stdin )
        
    E.info( "graph read in %i seconds" % (time.time() - t ))
    t = time.time()

    if options.method == "shortest-path":
        E.debug( "shortest path between %s:%i and %s:%i" % \
                     (options.node1,
                      map_vertex2id[options.node1],
                      options.node2,
                      map_vertex2id[options.node2] ) )

        paths = G.get_shortest_paths( map_vertex2id[options.node1],
                                      to = (map_vertex2id[options.node2],)
                                      )
             
        p = paths[map_vertex2id[options.node2]]
        if len(p) == 0: 
            E.info( "no path between %s:%i and %s:%i" % \
                        (options.node1,
                         map_vertex2id[options.node1],
                         options.node2,
                         map_vertex2id[options.node2] ) )

        
        l, last_node = p[0], map_id2vertex[p[0]]
        
        for x in p[1:]:
            node = map_id2vertex[x]
            ei = G.get_eid(x, l)
            
            options.stdout.write( "%s\t%s\t%s\n" %\
                                  (last_node, node, 
                                   G.es[ei]["info"]) ) 
            l, last_node = x, node

    elif options.method == "components":
        print "component\tnode"
        for id, component in enumerate(nx.connected_components( G )):
            for c in component:
                print "%i\t%s" % (id,c)

    E.info( "%s: %i seconds" % (options.method, time.time() - t ))
    E.Stop()
예제 #3
0
def main():
    """Top-level driver for the ADDA pipeline.

    Parses the command line, sets up multiprocessing-safe logging and
    dispatches to the module implementing the requested command, running
    it in parallel chunks where appropriate.
    """
    global L

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--config",
                      dest="filename_config",
                      type="string",
                      help="configuration file [default=%default].")

    parser.add_option("--force",
                      dest="force",
                      action="store_true",
                      help="overwrite existing files [default=%default].")

    parser.add_option(
        "--continue",
        dest="append",
        action="store_true",
        help=
        "continue from an aborted run and append to existing files [default=%default]."
    )

    parser.add_option(
        "--test",
        dest="test",
        type="int",
        help="run a test with first # sequences [default=%default]")

    parser.add_option(
        "--num-jobs",
        dest="num_jobs",
        type="int",
        help=
        "use # processes. If not set, the number of CPUs/cores is taken [default=%default]"
    )

    parser.add_option(
        "--chunks",
        dest="chunks",
        type="string",
        help=
        "work on one or more chunks only. Provide a comma-separated list. [default=%default]"
    )

    parser.add_option("--command",
                      dest="command",
                      type="choice",
                      choices=("sequences", "blast", "fit", "graph", "index",
                               "check-index", "profiles", "segment",
                               "optimise", "convert", "mst", "mst-components",
                               "align", "cluster", "realign", "families",
                               "stats", "summary"),
                      help="perform a command [default=%default]")

    parser.add_option("--start-at",
                      dest="start_at",
                      type="string",
                      help="start at sequence [default=%default]")

    parser.add_option("--stop-at",
                      dest="stop_at",
                      type="string",
                      help="stop at sequence [default=%default]")

    parser.set_defaults(
        filename_config="adda.ini",
        command=None,
        start_at=None,
        stop_at=None,
        force=False,
        append=False,
        test=None,
        num_jobs=None,
        chunks="all",
    )

    (options, args) = E.Start(parser)

    # map the generic verbosity level onto logging module levels
    if options.loglevel == 0:
        lvl = logging.ERROR
    elif options.loglevel == 1:
        lvl = logging.INFO
    else:
        lvl = logging.DEBUG

    # funnel log records from all worker processes through one queue so
    # that writes to adda.log do not interleave
    logQueue = multiprocessing.Queue(100)
    handler = Logger.MultiProcessingLogHandler(
        logging.FileHandler("adda.log", "a"), logQueue)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s pid=%(process)-8d %(name)-12s %(levelname)-8s %(message)s',
            datefmt='%m-%d %H:%M'))
    logging.getLogger('adda').addHandler(handler)
    logging.getLogger('adda').setLevel(lvl)

    E.setLogger(logging.getLogger("adda"))
    L = logging.getLogger("adda")

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    # the command is given either via --command or as the single
    # positional argument
    if len(args) == 0:
        if not options.command:
            raise ValueError("specify at least one command")
    elif len(args) == 1:
        options.command = args[0]
    else:
        raise ValueError("one command line argument is sufficient.")

    # collect modules and initialise them
    map_module = {
        'fit': AddaFit.AddaFit,
        'segment': AddaSegment.AddaSegment,
        'blast': AddaBlast.AddaBlast,
        'graph': AddaGraph.AddaGraph,
        'stats': AddaStats.AddaStats,
        'profiles': AddaProfiles.AddaProfiles,
        'realign': AddaAlign.AddaRealign,
        'index': AddaIndex.AddaIndexBuild,
        'check-index': AddaIndex.AddaIndexCheck,
        'optimise': AddaOptimise.AddaOptimise,
        'sequences': AddaSequences.AddaSequences,
        'convert': AddaConvert.AddaConvert,
        'mst': AddaMst.AddaMst,
        'mst-components': AddaComponentsMst.AddaComponentsMst,
        'align': AddaAlign.AddaAlign,
        'cluster': AddaCluster.AddaCluster,
        'families': AddaFamilies.AddaFamilies,
        'summary': AddaSummary.AddaSummary,
    }

    # the fasta file may not exist yet for early pipeline stages
    try:
        fasta = IndexedFasta.IndexedFasta(
            config.get("files", "output_fasta", "adda"))
    except KeyError:
        fasta = None

    if options.num_jobs == 1:
        run_parallel = runSequentially
    else:
        run_parallel = runParallel

    kwargs = {
        "loglevel": options.loglevel,
        "append": options.append,
        "force": options.force
    }

    if options.command == "index":
        module = map_module[options.command](config, fasta=fasta, **kwargs)
        if module.isComplete():
            E.info("output of command `%s` present and complete" %
                   options.command)
        else:
            filename_graph = config.get("files", "input_graph",
                                        "pairsdb_40x40.links.gz")
            if "," in filename_graph:
                filename_graph = filename_graph.split(",")
                # permit parallel processing of multiple files
                run_parallel(
                    run_on_files,
                    filename=filename_graph,
                    options=options,
                    module=map_module[options.command],
                    config=config,
                    kwargs=kwargs,
                )

                nchunks = len(filename_graph)

                module = map_module[options.command](config,
                                                     chunk=0,
                                                     num_chunks=nchunks,
                                                     **kwargs)

                if not module.isComplete():
                    L.info("merging")

                    if not module.merge():
                        raise ValueError("error while merging for `%s`" %
                                         options.command)

            else:
                # process single file - no hazzle.
                module.startUp()
                module.run()
                module.finish()

    if options.command in ("sequences", "stats", "optimise", "convert", "mst",
                           "mst-components", "cluster", "families", "summary"):
        module = map_module[options.command](config, fasta=fasta, **kwargs)
        if module.isComplete():
            E.info("output of command `%s` present and complete" %
                   options.command)
        else:
            module.startUp()
            module.run()
            module.finish()

    elif options.command in ("fit", "segment"):

        run_on_graph = RunOnGraph(config, options.command)

        run_parallel(run_on_graph,
                     filename=config.get("files", "input_graph", "adda.graph"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        if not merge(options,
                     module=map_module[options.command],
                     config=config,
                     fasta=fasta):
            E.Stop()
            return

    elif options.command in ("align",):
        # BUGFIX: the original wrote `in ("align")`, i.e. a substring test
        # against the string "align" rather than tuple membership.

        run_parallel(run_on_file,
                     filename=config.get("files", "output_mst", "adda.mst"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        merge(options,
              module=map_module[options.command],
              config=config,
              fasta=fasta)

    elif options.command in ("realign",):
        # BUGFIX: same substring-test defect as "align" above.

        run_parallel(run_on_file,
                     filename=config.get("files", "output_align",
                                         "adda.align"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        merge(options,
              module=map_module[options.command],
              config=config,
              fasta=fasta)
예제 #4
0
            E.warn(
                "incomplete line? Type error in conversion: '%s' with data: %s"
                % (msg, str(row)))

        if len(rows) >= options.guess_size:
            break

    if len(rows) == 0:
        if not options.allow_empty or not reader.fieldnames:
            raise ValueError("empty table")
        else:
            # create empty table and exit
            take, map_column2type, ignored = createTable(
                dbhandle, error, headers=reader.fieldnames, options=options)
            E.info("empty table created")
            E.Stop()
            return
    else:
        take, map_column2type, ignored = createTable(dbhandle,
                                                     error,
                                                     rows=rows,
                                                     options=options)

    E.info("read %i rows for type guessing" % len(rows))

    def row_iter(rows, reader):
        for row in rows:
            yield quoteRow(row,
                           take,
                           map_column2type,
                           options.missing_values,
예제 #5
0
def main():

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode",
                      dest="mode",
                      type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop",
                      dest="gop",
                      type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep",
                      dest="gep",
                      type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(
        alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    if options.method == "view":
        for nid in args:
            nid = int(args[0])

            neighbours = index.getNeighbours(nid)

            for n in neighbours:
                print str(n)

    elif options.method == "pileup":

        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start != None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        align = AddaProfiles.AddaProfiles(config, fasta=fasta)

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop,
                                                 options.gep)

        def _buildProfile(nid, start, end):
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write( "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %\
                                  (nid1, nid2,
                                   result.getScore(),
                                   result.getLength(),
                                   result.getNumGaps(),
                                   result.getRowFrom(), result.getRowTo(),
                                   result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
def main(argv=sys.argv):

    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-D",
                      "--database",
                      dest="database",
                      type="string",
                      help="tablename to use [default=%default].")

    parser.add_option("-t",
                      "--trees",
                      dest="table_name_trees",
                      type="string",
                      help="tablename with trees [default=%default].")

    parser.add_option("-r",
                      "--parts",
                      dest="table_name_parts",
                      type="string",
                      help="tablename with trees [default=%default].")

    parser.add_option(
        "-b",
        "--bench",
        dest="table_name_bench",
        type="string",
        help=
        "domain table to be benchmarked (for example: nrdb40_domo_domains_nr) [default=%default]."
    )

    parser.add_option(
        "-f",
        "--reference",
        dest="table_name_reference",
        type="string",
        help=
        "table of reference table (for example: nrdb40_scop_domains_nr) [default=%default]."
    )

    parser.add_option("--bin-size",
                      dest="bin_size",
                      type="int",
                      help="bin size [default=%default].")

    parser.add_option(
        "-o",
        "--resolution",
        dest="resolution",
        type="float",
        help="resolution for scaling of domains [default=%default].")

    parser.add_option(
        "-s",
        "--switch",
        dest="switch",
        action="store_true",
        help=
        "switch between coverage of reference and size ratio if coverage is 1 [default=%default]."
    )

    parser.add_option("-k",
                      "--skip-repeats",
                      dest="skip_repeats",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option(
        "-m",
        "--skip-tms",
        dest="skip_tms",
        action="store_true",
        help=
        "discard domains which contain transmembrane regions [default=%default]."
    )

    parser.add_option("-e",
                      "--check-selection",
                      dest="check_selection",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option(
        "-q",
        "--quality",
        dest="quality",
        action="store_true",
        help="take only sequences which are curated [default=%default].")

    parser.add_option("--no-full-length",
                      dest="no_full_length",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option("--only-full-length",
                      dest="only_full_length",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option(
        "--check-if-comparable",
        dest="check_if_comparable",
        action="store_true",
        help=
        "perform comparable check according to Islam95 (default level 85%) [default=%default]."
    )

    parser.add_option("--subset",
                      dest="subset",
                      type="string",
                      help="use only a subset of nids [default=%default]")

    parser.set_defaults(
        database="pairsdb",
        table_name_reference=None,
        table_name_trees=None,
        table_name_parts=None,
        table_name_bench=None,
        resolution=None,
        loglevel=1,
        min_overlap=1,
        switch=0,
        combine_repeats=1,
        skip_repeats=0,
        skip_tms=0,
        discard_full_length=0,
        check_selection=0,
        selection_threshold=0.9,
        quality=None,
        no_full_length=None,
        only_full_length=None,
        ## a full length domain should cover at least 90% of a sequence
        min_length_ratio=0.9,
        check_comparable=None,
        check_comparable_level=0.85,
        bin_size=1,
        subset=None)

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    dbhandle = Pairsdb()
    dbhandle.Connect(dbname=options.database)

    tbl_reference = TableDomains(dbhandle, "generic")
    tbl_reference.SetName(options.table_name_reference)

    # tbl_masks = Table_nrdb90_masks(dbhandle)
    tbl_nrdb = Table_nrdb(dbhandle)

    # todo: encapsulate this with a parameter
    tbl_nrdb.name = "nrdb40"

    if options.table_name_trees:

        nids_statement = '''SELECT DISTINCT t.nid 
                            FROM %s AS t, %s AS s %%s WHERE t.nid = s.nid %%s''' %\
                         (options.table_name_trees,
                          options.table_name_reference)

        if options.quality:
            nids_statement = nids_statement % (
                ", nrdb_quality AS q",
                "AND q.nid = s.nid AND q.is_curated = 'T'")
        else:
            nids_statement = nids_statement % ("", "")

        statement = """
        SELECT t.node, t.parent, t.level, t.start, t.end,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (t.end - t.start)) AS cov_dom,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (%(end)i - %(start)i)) AS cov_ref,
        ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
        AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """

        tablename = options.table_name_trees

    elif options.table_name_parts or options.table_name_bench:

        if options.table_name_parts:
            table_name = options.table_name_parts
        else:
            table_name = options.table_name_bench

        if options.subset:
            nids_statement = '''SELECT DISTINCT s.nid 
                                FROM %s AS s, %s AS t 
                                WHERE t.nid = s.nid''' % (options.subset,
                                                          table_name)
        else:
            nids_statement = '''SELECT DISTINCT s.nid 
                                FROM %s AS s, 
                                     %s AS r %%s 
                                 WHERE r.nid = s.nid %%s''' %\
                             (table_name, options.table_name_reference)

            if options.quality:
                nids_statement = nids_statement % (
                    ", nrdb_quality AS q",
                    "AND q.nid = s.nid AND q.is_curated = 'T'")
            else:
                nids_statement = nids_statement % ("", "")

        statement = """
        SELECT 1, 0, 0, t.start, t.end,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (t.end - t.start)) AS cov_dom,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (%(end)i - %(start)i)) AS cov_ref,
        ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
        AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """

        tablename = table_name

    else:
        print "what shall I compare?"
        sys.exit(1)

    if options.check_selection:
        selection_statement = """
        SELECT t.domain_from, t.domain_to,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (GREATEST( t.domain_to, %(end)i) - LEAST( t.domain_from, %(start)i))) AS ovl,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (t.domain_to - t.domain_from)i) AS cov_dom,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (%(end)i - %(start)i)) AS cov_ref,
        ((t.domain_to - t.domain_from) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(selection_tablename)s AS t
        WHERE t.domain_nid = %(nid)i
        AND (LEAST(t.domain_to, %(start)i) - GREATEST(t.domain_from, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """
        selection_tablename = options.table_name_parts

        options.table_name_parts = None

        parts_same_as_trees, parts_larger_than_trees, parts_smaller_than_trees, parts_much_smaller_than_trees = 0, 0, 0, 0

    min_overlap = options.min_overlap

    nids = map(lambda x: x[0], dbhandle.Execute(nids_statement).fetchall())

    overlaps = []
    cov_doms = []
    cov_refs = []
    touched = {}

    if options.check_selection:
        options.stdout.write(
            "NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n"
        )
    else:
        # Column header for the per-domain comparison table:
        # reference nid, tree node/parent/level, domain and reference
        # coordinates, overlap, and the various coverage ratios.
        options.stdout.write(
            "NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n"
        )

    E.info("--> processing %i nids" % len(nids))

    # Bookkeeping counters for the skip/keep decisions made per nid below.
    nskipped_no_assignments = 0
    nskipped_no_overlap = 0
    nskipped_wrong_domaintype = 0
    nfound = 0

    it = 0
    for nid in nids:

        it += 1

        E.debug("--> processing %i" % nid)

        # Reference domain assignments for this sequence; a list of
        # (id, from, to) tuples (presumably sorted by position — confirm
        # against GetDomainBoundariesForNid).
        domains = tbl_reference.GetDomainBoundariesForNid(nid)

        length = tbl_nrdb.GetLength(nid)

        if not domains:
            nskipped_no_assignments += 1
            continue

        # Optionally exclude sequences whose single domain effectively
        # spans the whole sequence (ratio >= min_length_ratio).
        if options.no_full_length and len(domains) == 1:
            ## check if domain is actually full length, otherwise keep
            id, domain_from, domain_to = domains[0]
            if float(domain_to -
                     domain_from) / float(length) >= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue

        # Converse filter: keep ONLY sequences that consist of a single
        # (near) full-length domain.
        if options.only_full_length:
            if len(domains) == 1:
                id, domain_from, domain_to = domains[0]
                if float(domain_to - domain_from) / float(
                        length) <= options.min_length_ratio:
                    nskipped_wrong_domaintype += 1
                    continue
            else:
                nskipped_wrong_domaintype += 1
                continue

        nfound += 1

        last_id = None
        x = 0

        # iteration over domains in reference
        while x < len(domains):

            id, domain_from, domain_to = domains[x]

            ##########################################################
            # process repeats
            # Collapse consecutive entries sharing the same domain id
            # into one span (domain_from of the first, domain_to of the
            # last). is_repeat starts at -1 so a single occurrence
            # leaves it at 0 (falsy) and repeats make it >= 1.
            is_repeat = -1

            while x < len(domains) and domains[x][0] == id:
                domain_to = domains[x][2]
                x += 1
                is_repeat += 1

            if options.skip_repeats and is_repeat:
                continue

            # if options.skip_tms and tbl_masks.HasMask( nid, 2, domain_from, domain_to):
            #    continue

            ##########################################################
            ## apply resolution
            # Rescale 1-based residue coordinates into coarse bins of
            # size options.resolution (0 disables scaling).
            if options.resolution:
                start = int(float(domain_from - 1) / options.resolution)
                end = int(float(domain_to - 1) / options.resolution) + 1
            else:
                start = domain_from
                end = domain_to

            E.debug( "processing domain %s_%i_%i (scaled: %i-%i)" % \
                         ( id, domain_from, domain_to, start, end))

            ##########################################################
            ## get best matching domain
            # The SQL template is filled from local variable names
            # (nid, start, end, ...) via %-interpolation of locals().
            # NOTE(review): this pattern is fragile — any rename of the
            # locals above silently breaks the query.
            s = statement % locals()

            if options.loglevel >= 4: print s

            result = dbhandle.Execute(s).fetchone()

            if not result: continue

            # NOTE: start/end are re-bound here to the matched domain's
            # coordinates, shadowing the query coordinates computed above.
            node, parent, level, start, end, overlap, cov_dom, cov_ref, rat_ref = result

            # Deduplicate: emit each (nid, id, start, end) match only once.
            key = "%i-%s-%i-%i" % (nid, id, start, end)
            if touched.has_key(key):
                continue
            else:
                touched[key] = 1

            # discard full length domains
            if options.discard_full_length:
                if options.table_name_trees:
                    # in tree tables node 0 is the (full-length) root
                    if node == 0: continue
                else:
                    if length == end - start: continue

            # If the match covers the whole reference (cov_ref == 1.0),
            # optionally substitute the alternative ratio rat_ref.
            if options.switch and cov_ref == 1.0:
                xcov_ref = rat_ref
            else:
                xcov_ref = cov_ref

            # check, if selection did take a domain lower or further up
            if options.check_selection:
                # Rescale the matched coordinates back to residue space;
                # the factor 10 presumably assumes options.resolution == 10
                # — TODO confirm.
                start = (start * 10) + 1
                end = min(end * 10 + 1, length)

                s = selection_statement % locals()
                result = dbhandle.Execute(s).fetchone()

                if result:
                    parts_from, parts_to, ovl_parts, cov_parts, cov_tree, rat_parts = result

                    # Classify whether the parts-based selection is larger,
                    # equal or smaller than the tree-based one.
                    if rat_parts > 1.0:
                        parts_larger_than_trees += 1
                        token = ">"
                    elif rat_parts == 1.0:
                        parts_same_as_trees += 1
                        token = "="
                    else:
                        parts_smaller_than_trees += 1
                        token = "<"
                        if rat_parts < options.selection_threshold:
                            parts_much_smaller_than_trees += 1

                    # NOTE(review): yfrom/yto are not defined anywhere in
                    # this scope — this branch likely raises NameError when
                    # exercised; they probably should be start/end. Confirm
                    # against the original intent before fixing.
                    options.stdout.write(
                        string.join(
                            map(str, (nid, id, domain_from, domain_to, level,
                                      yfrom, yto, parts_from, parts_to,
                                      overlap, cov_dom, cov_ref, rat_ref,
                                      xcov_ref, ovl_parts, cov_parts, cov_tree,
                                      rat_parts, token)), "\t") + "\n")

            else:
                # Standard output row matching the header written above.
                options.stdout.write(
                    string.join(
                        map(str, (nid, node, parent, level, start, end, id,
                                  start, end, overlap, cov_dom, cov_ref,
                                  rat_ref, xcov_ref)), "\t") + "\n")

                # Collect percentages (x100, truncated to int) for the
                # summary statistics and histograms written below.
                overlaps.append(int(overlap * 100))
                cov_doms.append(int(cov_dom * 100))
                cov_refs.append(int(xcov_ref * 100))

    E.info("skipped nids because of no overlap with reference: %i" %
           nskipped_no_overlap)
    E.info("skipped nids because of no assignments: %i" %
           nskipped_no_assignments)
    E.info("skipped nids because of wrong domain type: %i" %
           nskipped_wrong_domaintype)
    E.info("nids in comparison: %i" % nfound)

    if options.check_selection:
        # NOTE(review): E.info is called here with two positional
        # arguments (print-style); if E.info takes a single message
        # string the counts are likely dropped — verify and use
        # %-formatting as in the calls above.
        E.info(" parts larger than trees=", parts_larger_than_trees)
        E.info(" parts like trees=", parts_same_as_trees)
        E.info(" parts smaller than trees=", parts_smaller_than_trees)
        E.info(
            " parts much smaller than trees (<%f)=" %
            options.selection_threshold, parts_much_smaller_than_trees)
    else:
        # Summary statistics over the collected percentage lists.
        outfile_stats = E.openOutputFile("stats")
        outfile_stats.write("section\t%s\n" % Stats.Summary().getHeader())
        outfile_stats.write("overlaps\t%s\n" % str(Stats.Summary(overlaps)))
        outfile_stats.write("domain_coverage\t%s\n" %
                            str(Stats.Summary(cov_doms)))
        outfile_stats.write("reference_coverage\t%s\n" %
                            str(Stats.Summary(cov_refs)))
        outfile_stats.close()

        # Plain histogram of overlap percentages (bin width 1).
        outfile = E.openOutputFile("overlaps.histogram")
        outfile.write("bin\tcounts\n")
        Histogram.Write(
            outfile,
            Histogram.Calculate(overlaps,
                                min_value=0,
                                increment=1,
                                no_empty_bins=True))
        outfile.close()

        # Histogram of domain coverage with relative/cumulative columns.
        outfile = E.openOutputFile("domain_coverage.histogram")
        outfile.write(
            "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n"
        )
        Histogram.Write(
            outfile,
            Histogram.AddRelativeAndCumulativeDistributions(
                Histogram.Calculate(cov_doms,
                                    min_value=0,
                                    increment=options.bin_size,
                                    no_empty_bins=True)))
        outfile.close()

        # Same histogram layout for reference coverage.
        outfile = E.openOutputFile("reference_coverage.histogram")
        outfile.write(
            "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n"
        )
        Histogram.Write(
            outfile,
            Histogram.AddRelativeAndCumulativeDistributions(
                Histogram.Calculate(cov_refs,
                                    min_value=0,
                                    increment=options.bin_size,
                                    no_empty_bins=True)))

        outfile.close()

    E.Stop()