Example #1
def _buildProfile(nid, start, end):
    neighbours = index.getNeighbours(nid)
    mali = align.buildMali(nid, neighbours)
    prof = alignlib.makeProfile(mali)
    E.info("nid: %i, neighbours=%i" % (nid, len(neighbours)))
    prof.useSegment(start, end)
    prof.prepare()
    seq = fasta.getSequence(nid)
    return alignlib.makeSequence(seq), prof
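This helper is lifted from the profile-alignment script shown in full in Example #9; index, align, fasta and alignlib are module-level objects initialised there. A minimal usage sketch mirroring that script's align branch (the nids and coordinates are hypothetical):

# assumes the setup from Example #9: cadda.IndexedNeighbours as index,
# AddaProfiles.AddaProfiles as align, IndexedFasta.IndexedFasta as fasta
seq1, prof1 = _buildProfile(42, 0, 120)   # hypothetical nid/start/end
seq2, prof2 = _buildProfile(99, 10, 130)
alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL, -10.0, -1.0)
result = alignlib.makeAlignmentVector()
alignator.align(result, prof1, prof2)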
Example #2
def main():

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--no-swissprot-version",
                      dest="no_swissprot_version",
                      action="store_true",
                      help="remove swissprot version information [%default]")

    parser.add_option("--no-pfam-version",
                      dest="no_pfam_version",
                      action="store_true",
                      help="remove pfam version information [%default]")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="add prefix to id [%default]")

    parser.set_defaults(no_swissprot_version=False,
                        no_pfam_version=False,
                        prefix="")

    (options, args) = E.Start(parser)

    rx_head = re.compile(r">(\S+)\s+\S+\| (\S+) (\d+) a.a.")
    rx_domain = re.compile(r"(\S+) .* (PF\d+.\d+) (.*)  (.*)")
    options.stdout.write("nid\tstart\tend\tfamily\n")

    ninput, noutput, ndomains, nskipped = 0, 0, 0, 0
    for record in record_iterator(sys.stdin):
        ninput += 1
        try:
            id, acc, len = rx_head.match(record[0]).groups()
        except AttributeError, msg:
            E.warn("parsing error in line `%s`" % record[0])
            nskipped += 1
            continue

        if options.no_swissprot_version: acc = acc.split(".")[0]
        for line in record[1:]:
            # no Pfam-B
            if line.startswith("Pfam-B"): continue
            name, family, description, coordinates = rx_domain.match(
                line).groups()

            for c in coordinates.split(" "):
                start, end = [int(x) for x in c.split("-")]
                start -= 1
                options.stdout.write(
                    options.prefix +
                    "\t".join(map(str, (acc, start, end, family))) + "\n")
                ndomains += 1
            noutput += 1
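record_iterator is not shown in this snippet. Judging from the header regex (each record starts with a '>' line, followed by one line per domain hit), a plausible sketch, offered only as an assumption about the input format:

def record_iterator(infile):
    # assumed format: a record begins at a '>' header line and
    # extends to the line before the next '>' header
    record = []
    for line in infile:
        line = line.rstrip("\n")
        if line.startswith(">"):
            if record:
                yield record
            record = [line]
        elif record:
            record.append(line)
    if record:
        yield record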
Example #3
def main():
    
    parser = optparse.OptionParser( version = "%prog version: $Id$", usage = USAGE )

    parser.add_option( "--no-swissprot-version", dest="no_swissprot_version", action="store_true",
                       help="remove swissprot version information [%default]" )

    parser.add_option( "--no-pfam-version", dest="no_pfam_version", action="store_true",
                       help="remove pfam version information [%default]" )

    parser.add_option( "--prefix", dest="prefix", type="string",
                       help="add prefix to id [%default]" )

    parser.set_defaults( 
        no_swissprot_version = False,
        no_pfam_version = False,
        prefix = ""
        )

    (options,args) = E.Start( parser )

    rx_head = re.compile( r">(\S+)\s+\S+\| (\S+) (\d+) a.a.")
    rx_domain = re.compile( r"(\S+) .* (PF\d+.\d+) (.*)  (.*)")
    options.stdout.write( "nid\tstart\tend\tfamily\n")
    
    ninput, noutput, ndomains, nskipped = 0,0,0,0
    for record in record_iterator( sys.stdin ):
        ninput += 1
        try:
            id, acc, len = rx_head.match( record[0] ).groups()
        except AttributeError, msg:
            E.warn( "parsing error in line `%s`" % record[0])
            nskipped += 1
            continue

        if options.no_swissprot_version: acc = acc.split(".")[0]
        for line in record[1:]:
            # no Pfam-B
            if line.startswith( "Pfam-B"): continue
            name, family, description, coordinates = rx_domain.match( line ).groups()
                
            for c in coordinates.split( " "):
                start,end = [ int(x) for x in c.split("-") ]
                start -= 1
                options.stdout.write( options.prefix + "\t".join( map(str, (acc, start, end, family) ) ) + "\n" )
                ndomains += 1
            noutput += 1
Example #4
def readAlignmentGraph( infile ):

    result = [ x[:-1].split("\t") for x in infile.readlines() if not (x.startswith("#") or x.startswith( "passed")) ]
    E.info( "collected %i edges" % len(result) )

    # collect vertices
    vertices = set( [x[1] for x in result ] )
    vertices.update( [x[2] for x in result ] )
    vertices = list( vertices )
    map_vertex2id = dict( [ (x[1],x[0]) for x in enumerate(vertices) ] )

    E.info( "collected %i vertices" % len(vertices ) )

    G = igraph.Graph( len(vertices) )
    G.add_edges( [ (map_vertex2id[x[1]], \
                    map_vertex2id[x[2]]) for x in result ] )

    G.es[ "info" ] = [ "%s\t%s\t%s\t%s\t%s" % \
                           (x[0], x[10], x[11],
                            x[16], x[17] ) for x in result ]

    return map_vertex2id, vertices, G
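The enumerate-based vertex-to-id mapping is the usual way to feed string-labelled edges to igraph, which only accepts integer vertex ids. A self-contained sketch of the same pattern (assuming python-igraph is installed):

import igraph

edges = [("A_1_100", "B_5_80"), ("B_5_80", "C_2_90")]
vertices = sorted(set(v for e in edges for v in e))
map_vertex2id = dict((v, i) for i, v in enumerate(vertices))

G = igraph.Graph(len(vertices))
G.add_edges([(map_vertex2id[a], map_vertex2id[b]) for a, b in edges])
print(G.summary())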
Example #5
def main( argv = sys.argv ):

    parser = optparse.OptionParser( version = "%prog version: $Id$", usage = globals()["__doc__"] )

    parser.add_option( "-o", "--format", dest="graph-format", type="choice",
                       choices=("alignments",),
                       help="graph format [default=%default].")

    parser.add_option( "-m", "--method", dest="method", type="choice",
                       choices=("shortest-path", "translate", "components", "add-family" ),
                       help="methods to apply [default=%default].")

    parser.add_option( "-a", "--filename-map", dest="filename_map", type="string",
                       help="filename mapping ids to nids (used for translation) [default=%default].")

    parser.add_option( "-1", "--node1", dest="node1", type="string",
                       help="first node for path calculation [default=%default].")

    parser.add_option( "-2", "--node2", dest="node2", type="string",
                       help="second node for path calculation [default=%default].")

    parser.add_option( "-f", "--filename-families", dest="filename_families", type="string",
                       help="filename with domain families [default=%default].")



    parser.set_defaults( 
        method = None,
        graph_format = "alignments",
        filename_map = None,
        node1 = None,
        node2 = None,
        filename_families = None,
        )

    (options, args) = E.Start( parser, 
                               argv = argv )
            
    if options.filename_families != None:
        E.info( "reading families from %s" % options.filename_families )
        map_domain2family = {}
        for line in open( options.filename_families, "r"):
            if line[0] == "#": continue
            if line.startswith( "nid"): continue
            nid, start, end, family = line[:-1].split("\t")
            pid = bytes("%s_%s_%s" % (nid,start,end))
            map_domain2family[pid] = bytes(family)
        E.info( "read %i domains" % len(map_domain2family))

    if options.method == "translate":
        
        if options.filename_map:
            E.info("reading map from %s" % options.filename_map)
            map_id2nid = AddaIO.readMapId2Nid( open( options.filename_map, "r") )
            map_nid2id = dict([[v,k] for k,v in map_id2nid.iteritems()])

        def translate_alignments( line ):        
            if line.startswith("passed"): return line
            data = line.split( "\t" )
            
            x = data[1].split("_")
            y = data[2].split("_")
            try:
                data[1] = "%s_%s_%s" % (map_nid2id[int(x[0])],x[1],x[2])
            except KeyError:
                sys.stderr.write("could not map: %s\n" % str(x) )
                raise
            try:
                data[2] = "%s_%s_%s" % (map_nid2id[int(y[0])],y[1],y[2])
            except KeyError:
                sys.stderr.write("could not map: %s\n" % str(y) )
                raise

            return "\t".join(data)

        if options.graph_format == "alignments":
            translator = translate_alignments
            
        for line in options.stdin:
            if not line.startswith("#"): 
                line = translator( line )
            options.stdout.write(line)
            
        E.Stop()
        return

    elif options.method == "add-family":
        options.stdout.write( "%s\tqfamily\tsfamily\n" % ("\t".join( AddaIO.TestedLink._fields)))
        for link in AddaIO.iterate_tested_links( options.stdin ):
            qfamily = map_domain2family.get(link.qdomain,"na")
            sfamily = map_domain2family.get(link.sdomain,"na")
            options.stdout.write( "%s\t%s\t%s\n" % ("\t".join(map(str,link)), 
                                                    qfamily,
                                                    sfamily))
        E.Stop()
        return

    t = time.time()
    if options.graph_format == "alignments":
        map_vertex2id, map_id2vertex, G = readAlignmentGraph( options.stdin )
        
    E.info( "graph read in %i seconds" % (time.time() - t ))
    t = time.time()

    if options.method == "shortest-path":
        E.debug( "shortest path between %s:%i and %s:%i" % \
                     (options.node1,
                      map_vertex2id[options.node1],
                      options.node2,
                      map_vertex2id[options.node2] ) )

        paths = G.get_shortest_paths( map_vertex2id[options.node1],
                                      to = (map_vertex2id[options.node2],)
                                      )
             
        p = paths[map_vertex2id[options.node2]]
        if len(p) == 0: 
            E.info( "no path between %s:%i and %s:%i" % \
                        (options.node1,
                         map_vertex2id[options.node1],
                         options.node2,
                         map_vertex2id[options.node2] ) )
            E.Stop()
            return

        l, last_node = p[0], map_id2vertex[p[0]]
        
        for x in p[1:]:
            node = map_id2vertex[x]
            ei = G.get_eid(x, l)
            
            options.stdout.write( "%s\t%s\t%s\n" %\
                                  (last_node, node, 
                                   G.es[ei]["info"]) ) 
            l, last_node = x, node

    elif options.method == "components":
        print "component\tnode"
        for id, component in enumerate(nx.connected_components( G )):
            for c in component:
                print "%i\t%s" % (id,c)

    E.info( "%s: %i seconds" % (options.method, time.time() - t ))
    E.Stop()
Example #6
def annotateAlignmentGraph( infile, outfiles ):
    '''read the alignment graph and output a translated
    version of it, adding reference domain information.
    '''

    outfile, outfile_stats = outfiles

    # collect benchmark domains 
    E.info( "reading benchmark domains" )
    benchmark_domains = AddaIO.readMapNid2Domains( 
        gzip.open( PARAMS["eval_filename_benchmark_domains"] ) )

    totuple = AddaIO.toTuple
    toDomain = AddaIO.toDomain
    # build map of id to nid
    E.info( "reading map between pid and nid" )
    map_nid2pid = AddaIO.readMapPid2Nid( open(PARAMS["eval_filename_adda_nids"], "r") )

    def getOverlappingDomains( pid, start, end ):
        '''get domains overlapping pid:start..end'''
        if pid not in benchmark_domains: return ()
        # greedy overlap testing
        r = []
        for family, domains in benchmark_domains[pid].iteritems():
            for other_start, other_end in domains:
                if start >= other_end or end <= other_start: continue
                r.append( (family, other_start, other_end) )
        return r

    counts = E.Counter()
    
    if infile.endswith(".gz"):
        inf = gzip.open( infile )
    else:
        inf = open(infile)

    outf = gzip.open( outfile, "w" )
    
    outf.write( "%s\n" % "\t".join( ( "passed",
                                      "qdomain",
                                      "sdomain",
                                      "weight",
                                      "qstart",
                                      "qend",
                                      "qali",
                                      "sstart",
                                      "send",
                                      "sali",
                                      "score",
                                      "naligned",
                                      "ngaps",
                                      "zscore",
                                      "rfamilies",
                                      "sfamilies",
                                      "rdomains",
                                      "sdomains")) )

    
    # counts for true positives, false positives and unknown
    n, tp, fp, fn, tn, uk = 0, 0, 0, 0, 0, 0
            
    outf_stats = open( outfile_stats, "w" )
    outf_stats.write("weight\tn\ttp\tfp\tfn\ttn\tuk\ttpr\tfnr\n" )
    last_weight = None

    for link in AddaIO.iterate_tested_links( inf ):
        qnid, qstart, qend = totuple(link.qdomain)
        snid, sstart, send = totuple(link.sdomain)
        qpid = map_nid2pid[qnid]
        spid = map_nid2pid[snid]
        qfamily = sorted(getOverlappingDomains( qpid, qstart, qend ))
        sfamily = sorted(getOverlappingDomains( spid, sstart, send ))

        passed = link.passed == "+"
        n += 1

        if not qfamily and not sfamily:
            uk += 1
        else:
            qf = set( [x[0] for x in qfamily] )
            sf = set( [x[0] for x in sfamily] )
            if qf.intersection( sf ):
                if passed: tp += 1
                else: fn += 1
            else:
                if passed: fp += 1
                else: tn += 1
        
        weight = round(float(link.weight))
        if weight != last_weight:
            if last_weight != None:
                outf_stats.write( "\t".join( map(str, (last_weight,
                                                       n,
                                                       tp, fp, fn, tn, uk,
                                                       float(tp) / (tp+fp+0.00001),
                                                       float(fn) / (fn+tn+0.00001),
                                                       ) ) ) + "\n" )
                                                   
            last_weight = weight

        if passed: counts.passed += 1
        else: counts.failed += 1

        link = link._replace( qdomain=toDomain( (qpid, qstart, qend) ),
                              sdomain=toDomain( (spid, sstart, send) ))

        outf.write( "%s\t%s\t%s\t%s\t%s\n" % \
                        ("\t".join( map(str,link) ), 
                         ",".join( sorted(set([x[0] for x in qfamily])) ),
                         ",".join( sorted(set([x[0] for x in sfamily])) ),
                         ",".join("%s_%i_%i" % x for x in qfamily ),
                         ",".join("%s_%i_%i" % x for x in sfamily )))
    inf.close()
    outf_stats.write( "\t".join( map(str, (last_weight,
                                           n,
                                           tp, fp, fn, tn, uk,
                                           float(tp) / (tp+fp+0.00001),
                                           float(fn) / (fn+tn+0.00001) ) ) ) + "\n" )
    
    outf_stats.close()
    E.info( "%s" % str( counts ) )
Example #7
                      dest="format",
                      type="choice",
                      choices=("graph", "nodelist"),
                      help="input format [default=%default]")

    parser.set_defaults(
        multi_labels=None,
        legend=None,
        label1=3,
        label2=4,
        attributes=[],
        format="graph",
        label="info1",
    )

    (options, args) = E.Start(parser)

    take = (0, 1, options.label1 - 1, options.label2 - 1)

    if len(options.attributes) == 0:
        raise ValueError("please provide at least one attribute")

    options.stdout.write("node\t%s\t%s\n" %
                         ("\t".join(options.attributes), options.label))

    # build attributes
    attributes, default = [], []
    for attribute in options.attributes:
        if attribute in ("colour", "color"):
            attributes.append(colors)
            default.append("white")
Example #8
def main():
    global L

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--config",
                      dest="filename_config",
                      type="string",
                      help="configuration file [default=%default].")

    parser.add_option("--force",
                      dest="force",
                      action="store_true",
                      help="overwrite existing files [default=%default].")

    parser.add_option(
        "--continue",
        dest="append",
        action="store_true",
        help=
        "continue from an aborted run and append to existing files [default=%default]."
    )

    parser.add_option(
        "--test",
        dest="test",
        type="int",
        help="run a test with first # sequences [default=%default]")

    parser.add_option(
        "--num-jobs",
        dest="num_jobs",
        type="int",
        help=
        "use # processes. If not set, the number of CPUs/cores is taken [default=%default]"
    )

    parser.add_option(
        "--chunks",
        dest="chunks",
        type="string",
        help=
        "work on one or more chunks only. Provide a comma-separated list. [default=%default]"
    )

    parser.add_option("--command",
                      dest="command",
                      type="choice",
                      choices=("sequences", "blast", "fit", "graph", "index",
                               "check-index", "profiles", "segment",
                               "optimise", "convert", "mst", "mst-components",
                               "align", "cluster", "realign", "families",
                               "stats", "summary"),
                      help="perform a command [default=%default]")

    parser.add_option("--start-at",
                      dest="start_at",
                      type="string",
                      help="start at sequence [default=%default]")

    parser.add_option("--stop-at",
                      dest="stop_at",
                      type="string",
                      help="stop at sequenec [default=%default]")

    parser.set_defaults(
        filename_config="adda.ini",
        command=None,
        start_at=None,
        stop_at=None,
        force=False,
        append=False,
        test=None,
        num_jobs=None,
        chunks="all",
    )

    (options, args) = E.Start(parser)

    # setup logging
    if options.loglevel == 0:
        lvl = logging.ERROR
    elif options.loglevel == 1:
        lvl = logging.INFO
    else:
        lvl = logging.DEBUG

    logQueue = multiprocessing.Queue(100)
    handler = Logger.MultiProcessingLogHandler(
        logging.FileHandler("adda.log", "a"), logQueue)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s pid=%(process)-8d %(name)-12s %(levelname)-8s %(message)s',
            datefmt='%m-%d %H:%M'))
    logging.getLogger('adda').addHandler(handler)
    logging.getLogger('adda').setLevel(lvl)

    E.setLogger(logging.getLogger("adda"))
    L = logging.getLogger("adda")

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    if len(args) == 0:
        if not options.command:
            raise ValueError("specify at least one command")
    elif len(args) == 1:
        options.command = args[0]
    else:
        raise ValueError("one command line argument is sufficient.")

    ## collect modules and initialise them
    map_module = {
        'fit': AddaFit.AddaFit,
        'segment': AddaSegment.AddaSegment,
        'blast': AddaBlast.AddaBlast,
        'graph': AddaGraph.AddaGraph,
        'stats': AddaStats.AddaStats,
        'profiles': AddaProfiles.AddaProfiles,
        'realign': AddaAlign.AddaRealign,
        'index': AddaIndex.AddaIndexBuild,
        'check-index': AddaIndex.AddaIndexCheck,
        'optimise': AddaOptimise.AddaOptimise,
        'sequences': AddaSequences.AddaSequences,
        'convert': AddaConvert.AddaConvert,
        'mst': AddaMst.AddaMst,
        'mst-components': AddaComponentsMst.AddaComponentsMst,
        'align': AddaAlign.AddaAlign,
        'cluster': AddaCluster.AddaCluster,
        'families': AddaFamilies.AddaFamilies,
        'summary': AddaSummary.AddaSummary,
    }

    try:
        fasta = IndexedFasta.IndexedFasta(
            config.get("files", "output_fasta", "adda"))
    except KeyError:
        fasta = None

    if options.num_jobs == 1:
        run_parallel = runSequentially
    else:
        run_parallel = runParallel

    kwargs = {
        "loglevel": options.loglevel,
        "append": options.append,
        "force": options.force
    }

    if options.command == "index":
        module = map_module[options.command](config, fasta=fasta, **kwargs)
        if module.isComplete():
            E.info("output of command `%s` present and complete" %
                   options.command)
        else:
            filename_graph = config.get("files", "input_graph",
                                        "pairsdb_40x40.links.gz")
            if "," in filename_graph:
                filename_graph = filename_graph.split(",")
                # permit parallel processing of multiple files
                run_parallel(
                    run_on_files,
                    filename=filename_graph,
                    options=options,
                    module=map_module[options.command],
                    config=config,
                    kwargs=kwargs,
                )

                nchunks = len(filename_graph)

                module = map_module[options.command](config,
                                                     chunk=0,
                                                     num_chunks=nchunks,
                                                     **kwargs)

                if not module.isComplete():
                    L.info("merging")

                    if not module.merge():
                        raise ValueError("error while merging for `%s`" %
                                         options.command)

            else:
                # process single file - no hassle.
                module.startUp()
                module.run()
                module.finish()

    if options.command in ("sequences", "stats", "optimise", "convert", "mst",
                           "mst-components", "cluster", "families", "summary"):
        module = map_module[options.command](config, fasta=fasta, **kwargs)
        if module.isComplete():
            E.info("output of command `%s` present and complete" %
                   options.command)
        else:
            module.startUp()
            module.run()
            module.finish()

    elif options.command in ("fit", "segment"):

        run_on_graph = RunOnGraph(config, options.command)

        run_parallel(run_on_graph,
                     filename=config.get("files", "input_graph", "adda.graph"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        if not merge(options,
                     module=map_module[options.command],
                     config=config,
                     fasta=fasta):
            E.Stop()
            return

    elif options.command in ("align"):

        run_parallel(run_on_file,
                     filename=config.get("files", "output_mst", "adda.mst"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        merge(options,
              module=map_module[options.command],
              config=config,
              fasta=fasta)

    elif options.command in ("realign"):

        run_parallel(run_on_file,
                     filename=config.get("files", "output_align",
                                         "adda.align"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        merge(options,
              module=map_module[options.command],
              config=config,
              fasta=fasta)
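The Logger.MultiProcessingLogHandler set up above funnels records from worker processes through a multiprocessing.Queue into a single FileHandler, so concurrent writes do not interleave. On Python 3.2+ the standard library covers the same idea with logging.handlers.QueueHandler and QueueListener; a minimal sketch:

import logging, logging.handlers, multiprocessing

queue = multiprocessing.Queue(100)
# any process that attaches a QueueHandler logs through the queue ...
logger = logging.getLogger("adda")
logger.addHandler(logging.handlers.QueueHandler(queue))
logger.setLevel(logging.INFO)
# ... while a single listener in the parent writes to the file
listener = logging.handlers.QueueListener(
    queue, logging.FileHandler("adda.log", "a"))
listener.start()
logger.info("safe to call from worker processes")
listener.stop()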
Example #9
def main():

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode",
                      dest="mode",
                      type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop",
                      dest="gop",
                      type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep",
                      dest="gep",
                      type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(
        alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    if options.method == "view":
        for nid in args:
            nid = int(nid)

            neighbours = index.getNeighbours(nid)

            for n in neighbours:
                print str(n)

    elif options.method == "pileup":

        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start != None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        align = AddaProfiles.AddaProfiles(config, fasta=fasta)

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop,
                                                 options.gep)

        def _buildProfile(nid, start, end):
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write( "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %\
                                  (nid1, nid2,
                                   result.getScore(),
                                   result.getLength(),
                                   result.getNumGaps(),
                                   result.getRowFrom(), result.getRowTo(),
                                   result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
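AddaIO.toTuple and AddaIO.toDomain are not shown. From the way domain identifiers are assembled elsewhere ("%s_%s_%s" % (nid, start, end) in Example #5), a plausible pair of sketches, offered as assumptions only:

def toTuple(domain_id):
    # assumed inverse of toDomain: "nid_start_end" -> (nid, start, end)
    nid, start, end = domain_id.split("_")
    return int(nid), int(start), int(end)

def toDomain(t):
    # assumed formatting used throughout: (nid, start, end) -> "nid_start_end"
    return "%s_%s_%s" % t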
Example #10
        weights=False,
        edge_colour=None,
        default_edge_colour="18",
        filename_format_graph=None,
        filename_format_edge=None,
        filename_format_node=None,
        filename_format_bipartite=None,
        titles=True,
        edge_labels=True,
        column_edge_weight=3,
        column_edge_colour=3,
        weight_range="auto",
        add_edge_labels=False,
    )

    (options, args) = E.Start(parser)

    options.column_edge_weight -= 1
    options.column_edge_colour -= 1

    components = None
    ## read components
    if options.filename_components:
        lines = open(options.filename_components, "r").readlines()
        components = {}
        for line in lines:
            id, cid = string.split(line[:-1], "\t")
            components[id] = cid

    if options.filename_subset:
        lines = open(options.filename_subset, "r").readlines()
Example #11
def main():

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option(
        "--method",
        dest="method",
        type="choice",
        choices=("view", "align", "pileup", "profile"),
        help="method to perform [default=%default].",
    )

    parser.add_option(
        "--mode", dest="mode", type="choice", choices=("global", "local"), help="alignment mode [default=%default]."
    )

    parser.add_option("--gop", dest="gop", type="float", help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float", help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph, options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    if options.method == "view":
        for nid in args:
            nid = int(nid)

            neighbours = index.getNeighbours(nid)

            for n in neighbours:
                print str(n)

    elif options.method == "pileup":

        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start != None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        align = AddaProfiles.AddaProfiles(config, fasta=fasta)

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        def _buildProfile(nid, start, end):
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n"
            % (
                nid1,
                nid2,
                result.getScore(),
                result.getLength(),
                result.getNumGaps(),
                result.getRowFrom(),
                result.getRowTo(),
                result.getColFrom(),
                result.getColTo(),
            )
        )

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
Example #12
        cc = dbhandle.cursor()
        cc.execute("DROP TABLE %s" % options.tablename)
        cc.close()
        if options.loglevel >= 1:
            options.stdlog.write("# existing table %s deleted\n" %
                                 options.tablename)
    except error, msg:
        dbhandle.rollback()

    ## create new table
    statement = "CREATE TABLE %s ( %s );" % (options.tablename,
                                             ", ".join(columns))

    E.debug("table create:\n# %s" % (statement))

    try:
        cc = dbhandle.cursor()
        cc.execute(statement)
        cc.close()
    except error, msg:
        options.stderr.write("table creation failed: statement=\n  %s\n" %
                             (statement))
        raise error, msg

    E.info("table %s created successfully." % options.tablename)

    return take, map_column2type, ignored
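quoteTableName (called in Examples #14 and #16) is not shown. Since the table name is interpolated directly into the DROP and CREATE statements above, it has to be restricted to a safe identifier; a minimal sketch under that assumption:

import re

def quoteTableName(tablename, backend="pg"):
    # hypothetical sketch: the scripts splice the name into SQL with
    # string formatting, so reject anything but a plain identifier
    if not re.match(r"^[A-Za-z_][A-Za-z0-9_.]*$", tablename):
        raise ValueError("unsafe table name: %r" % tablename)
    return tablename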

Example #13
def annotateAlignmentGraph(infile, outfiles):
    '''read the alignment graph and output a translated
    version of it, adding reference domain information.
    '''

    outfile, outfile_stats = outfiles

    # collect benchmark domains
    E.info("reading benchmark domains")
    benchmark_domains = AddaIO.readMapNid2Domains(
        gzip.open(PARAMS["eval_filename_benchmark_domains"]))

    totuple = AddaIO.toTuple
    toDomain = AddaIO.toDomain
    # build map of id to nid
    E.info("reading map between pid and nid")
    map_nid2pid = AddaIO.readMapPid2Nid(
        open(PARAMS["eval_filename_adda_nids"], "r"))

    def getOverlappingDomains(pid, start, end):
        '''get domains overlapping pid:start..end'''
        if pid not in benchmark_domains: return ()
        # greedy overlap testing
        r = []
        for family, domains in benchmark_domains[pid].iteritems():
            for other_start, other_end in domains:
                if start >= other_end or end <= other_start: continue
                r.append((family, other_start, other_end))
        return r

    counts = E.Counter()

    if infile.endswith(".gz"):
        inf = gzip.open(infile)
    else:
        inf = open(infile)

    outf = gzip.open(outfile, "w")

    outf.write("%s\n" % "\t".join(
        ("passed", "qdomain", "sdomain", "weight", "qstart", "qend", "qali",
         "sstart", "send", "sali", "score", "naligned", "ngaps", "zscore",
         "rfamilies", "sfamilies", "rdomains", "sdomains")))

    # counts for true positives, false positives and unknown
    n, tp, fp, fn, tn, uk = 0, 0, 0, 0, 0, 0

    outf_stats = open(outfile_stats, "w")
    outf_stats.write("weight\tn\ttp\tfp\tfn\ttn\tuk\ttpr\tfnr\n")
    last_weight = None

    for link in AddaIO.iterate_tested_links(inf):
        qnid, qstart, qend = totuple(link.qdomain)
        snid, sstart, send = totuple(link.sdomain)
        qpid = map_nid2pid[qnid]
        spid = map_nid2pid[snid]
        qfamily = sorted(getOverlappingDomains(qpid, qstart, qend))
        sfamily = sorted(getOverlappingDomains(spid, sstart, send))

        passed = link.passed == "+"
        n += 1

        if not qfamily and not sfamily:
            uk += 1
        else:
            qf = set([x[0] for x in qfamily])
            sf = set([x[0] for x in sfamily])
            if qf.intersection(sf):
                if passed: tp += 1
                else: fn += 1
            else:
                if passed: fp += 1
                else: tn += 1

        weight = round(float(link.weight))
        if weight != last_weight:
            if last_weight != None:
                outf_stats.write("\t".join(
                    map(str, (
                        last_weight,
                        n,
                        tp,
                        fp,
                        fn,
                        tn,
                        uk,
                        float(tp) / (tp + fp + 0.00001),
                        float(fn) / (fn + tn + 0.00001),
                    ))) + "\n")

            last_weight = weight

        if passed: counts.passed += 1
        else: counts.failed += 1

        link = link._replace(qdomain=toDomain((qpid, qstart, qend)),
                             sdomain=toDomain((spid, sstart, send)))

        outf.write( "%s\t%s\t%s\t%s\t%s\n" % \
                        ("\t".join( map(str,link) ),
                         ",".join( sorted(set([x[0] for x in qfamily])) ),
                         ",".join( sorted(set([x[0] for x in sfamily])) ),
                         ",".join("%s_%i_%i" % x for x in qfamily ),
                         ",".join("%s_%i_%i" % x for x in sfamily )))
    inf.close()
    outf_stats.write("\t".join(
        map(str, (last_weight, n, tp, fp, fn, tn, uk,
                  float(tp) / (tp + fp + 0.00001),
                  float(fn) / (fn + tn + 0.00001))))) + "\n")

    outf_stats.close()
    E.info("%s" % str(counts))
Example #14
def main():

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--dialect",
                      dest="dialect",
                      type="string",
                      help="csv dialect to use [default=%default].")

    parser.add_option(
        "-m",
        "--map",
        dest="map",
        type="string",
        action="append",
        help=
        "explicit mapping function for columns The format is column:type (e.g.: length:int) [default=%default]."
    )

    parser.add_option("-t",
                      "--table",
                      dest="tablename",
                      type="string",
                      help="table name for all backends [default=%default].")

    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="database name for sqlite3 [default=%default].")

    parser.add_option("-l",
                      "--lowercase",
                      dest="lowercase",
                      action="store_true",
                      help="force lower case column names [default=%default].")

    parser.add_option(
        "-u",
        "--ignore-duplicates",
        dest="ignore_duplicates",
        action="store_true",
        help="ignore columns with duplicate names [default=%default].")

    parser.add_option(
        "-s",
        "--ignore-same",
        dest="ignore_same",
        action="store_true",
        help="ignore columns with identical values [default=%default].")

    parser.add_option(
        "-e",
        "--ignore-empty",
        dest="ignore_empty",
        action="store_true",
        help="ignore columns which are all empty [default=%default].")

    parser.add_option(
        "-q",
        "--quick",
        dest="insert_quick",
        action="store_true",
        help=
        "try quick file based import - needs to be supported by the backend [default=%default]."
    )

    parser.add_option("-b",
                      "--backend",
                      dest="backend",
                      type="choice",
                      choices=("pg", "sqlite", "mysql"),
                      help="database backend to choose [default=%default].")

    parser.add_option(
        "-i",
        "--index",
        dest="indices",
        type="string",
        action="append",
        help="create an index for the named column [default=%default].")

    parser.add_option("-a",
                      "--allow-empty",
                      dest="allow_empty",
                      action="store_true",
                      help="allow empty table [default=%default].")

    parser.add_option("--force-single",
                      dest="force_single",
                      action="store_true",
                      help="force upload line by line [default=%default].")

    parser.set_defaults(
        map=[],
        dialect="excel-tab",
        database="csvdb",
        lowercase=False,
        tablename="csv",
        from_file=False,
        ignore_duplicates=False,
        ignore_identical=False,
        ignore_empty=False,
        insert_many=False,
        force_single=False,
        guess_size=1000,
        report_step=10000,
        backend="pg",
        indices=[],
        missing_values=(
            "na",
            "NA",
        ),
        insert_quick=False,
        allow_empty=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_mysql_options=True)

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.backend)

    if options.map:
        m = {}
        for x in options.map:
            f, t = x.split(":")
            m[f] = t
        options.map = m
    else:
        options.map = {}

    index_mangle = str
    if options.backend == "pg":
        import pgdb
        dbhandle = pgdb.connect(options.psql_connection)
        error = pgdb.DatabaseError
        options.null = "NULL"
        options.string_value = "'%s'"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database)
        error = sqlite3.OperationalError
        options.insert_many = not options.force_single
        options.null = None  # "NULL"
        options.string_value = "%s"  # "'%s'"

    elif options.backend == "mysql":
        import MySQLdb, _mysql
        error = (_mysql.OperationalError, _mysql.ProgrammingError)
        if options.port:
            dbhandle = MySQLdb.connect(host=options.host,
                                       user=options.user,
                                       passwd=options.password,
                                       db=options.database,
                                       port=options.port)
        else:
            dbhandle = MySQLdb.connect(host=options.host,
                                       user=options.user,
                                       passwd=options.password,
                                       db=options.database,
                                       unix_socket=options.socket)

        options.insert_many = False  # not options.force_single, fails with error
        options.null = "NULL"
        options.string_value = "'%s'"
        index_mangle = lambda x: re.sub("[.]", "_", x)

    reader = CSV.DictReader(sys.stdin, dialect=options.dialect)

    rows = []
    for row in reader:

        try:
            rows.append(CSV.ConvertDictionary(row, map=options.map))
        except TypeError, msg:
            E.warn(
                "incomplete line? Type error in conversion: '%s' with data: %s"
                % (msg, str(row)))

        if len(rows) >= options.guess_size:
            break
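The loop above deliberately stops after guess_size rows: the sample is presumably used to infer column types before the table is created (cf. the map_column2type returned in Example #12), with CSV.ConvertDictionary applying the explicit --map overrides first. A sketch of the kind of inference this enables, assuming the usual int -> float -> text fallback:

def guessType(values, missing_values=("na", "NA", "")):
    # try int, then float, then fall back to text
    observed = [x for x in values if x not in missing_values]
    for pytype, sqltype in ((int, "INT"), (float, "FLOAT")):
        try:
            for x in observed:
                pytype(x)
            return sqltype
        except ValueError:
            continue
    return "TEXT"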
Example #15
        try:
            id, acc, len = rx_head.match( record[0] ).groups()
        except AttributeError, msg:
            E.warn( "parsing error in line `%s`" % record[0])
            nskipped += 1
            continue

        if options.no_swissprot_version: acc = acc.split(".")[0]
        for line in record[1:]:
            # no Pfam-B
            if line.startswith( "Pfam-B"): continue
            name, family, description, coordinates = rx_domain.match( line ).groups()
                
            for c in coordinates.split( " "):
                start,end = [ int(x) for x in c.split("-") ]
                start -= 1
                options.stdout.write( options.prefix + "\t".join( map(str, (acc, start, end, family) ) ) + "\n" )
                ndomains += 1
            noutput += 1

    E.info( "ninput=%i, noutput=%i, ndomains=%i, nerrors=%i" % (ninput, noutput, ndomains, nskipped))

    E.Stop()

if __name__ == "__main__":
    sys.exit(main())
Example #16
File: csv2db.py Project: AndreasHeger/adda
def main():

    parser = optparse.OptionParser( version = "%prog version: $Id$", usage = USAGE)

    parser.add_option( "--dialect", dest="dialect", type="string",
                      help="csv dialect to use [default=%default]." )

    parser.add_option("-m", "--map", dest="map", type="string", action="append",
                      help="explicit mapping function for columns The format is column:type (e.g.: length:int) [default=%default]." )

    parser.add_option("-t", "--table", dest="tablename", type="string",
                      help="table name for all backends [default=%default]." )

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="database name for sqlite3 [default=%default]." )

    parser.add_option("-l", "--lowercase", dest="lowercase", action="store_true",
                      help="force lower case column names [default=%default]." )

    parser.add_option("-u", "--ignore-duplicates", dest="ignore_duplicates", action="store_true",
                      help="ignore columns with duplicate names [default=%default]." )

    parser.add_option("-s", "--ignore-same", dest="ignore_same", action="store_true",
                      help="ignore columns with identical values [default=%default]." )
    
    parser.add_option("-e", "--ignore-empty", dest="ignore_empty", action="store_true",
                      help="ignore columns which are all empty [default=%default]." )

    parser.add_option("-q", "--quick", dest="insert_quick", action="store_true",
                      help="try quick file based import - needs to be supported by the backend [default=%default]." )

    parser.add_option("-b", "--backend", dest="backend", type="choice",
                      choices=("pg", "sqlite", "mysql" ),
                      help="database backend to choose [default=%default]." )

    parser.add_option("-i", "--index", dest="indices", type="string", action="append",
                      help="create an index for the named column [default=%default]." )

    parser.add_option("-a", "--allow-empty", dest="allow_empty", action="store_true",
                      help="allow empty table [default=%default]." )

    parser.add_option("--force-single", dest="force_single", action="store_true",
                      help="force upload line by line [default=%default]." )

    parser.set_defaults(
        map = [],
        dialect = "excel-tab",
        database = "csvdb",
        lowercase = False,
        tablename = "csv",
        from_file = False,
        ignore_duplicates= False,
        ignore_identical = False,
        ignore_empty = False,
        insert_many = False,
        force_single = False,
        guess_size = 1000,
        report_step = 10000,
        backend="pg",
        indices = [],
        missing_values = ("na", "NA", ),
        insert_quick = False,
        allow_empty = False,
        )

    (options, args) = E.Start( parser, 
                               add_psql_options = True,
                               add_mysql_options = True )

    options.tablename = quoteTableName( options.tablename, backend = options.backend )
    
    if options.map:
        m = {}
        for x in options.map:
            f,t = x.split(":")
            m[f] = t
        options.map = m
    else:
        options.map = {}

    index_mangle = str
    if options.backend == "pg":
        import pgdb
        dbhandle = pgdb.connect( options.psql_connection )
        error = pgdb.DatabaseError
        options.null = "NULL"
        options.string_value = "'%s'"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect( options.database )
        error = sqlite3.OperationalError
        options.insert_many = not options.force_single
        options.null = None # "NULL" 
        options.string_value = "%s" # "'%s'"

    elif options.backend == "mysql":
        import MySQLdb, _mysql
        error = (_mysql.OperationalError, _mysql.ProgrammingError )
        if options.port:
            dbhandle = MySQLdb.connect(host        = options.host,
                                       user        = options.user,
                                       passwd      = options.password,
                                       db          = options.database,
                                       port        = options.port )
        else:
            dbhandle = MySQLdb.connect(host        = options.host,
                                       user        = options.user,
                                       passwd      = options.password,
                                       db          = options.database,
                                       unix_socket = options.socket )
            
        options.insert_many = False # not options.force_single, fails with error
        options.null = "NULL" 
        options.string_value = "'%s'"
        index_mangle = lambda x: re.sub("[.]", "_", x )

    reader = CSV.DictReader( sys.stdin, dialect=options.dialect )

    rows = []
    for row in reader:

        try:
            rows.append( CSV.ConvertDictionary( row , map=options.map ))
        except TypeError, msg:
            E.warn( "incomplete line? Type error in conversion: '%s' with data: %s" % (msg, str(row) ) )

        if len(rows) >= options.guess_size:
            break
Example #17
File: csv2db.py Project: AndreasHeger/adda
    ## delete old table if it exists
    try:
        cc = dbhandle.cursor()
        cc.execute("DROP TABLE %s" % options.tablename)
        cc.close()
        if options.loglevel >= 1:
            options.stdlog.write( "# existing table %s deleted\n" % options.tablename )
    except error, msg:
        dbhandle.rollback()

    ## create new table
    statement = "CREATE TABLE %s ( %s );" % (options.tablename, ", ".join( columns))

    E.debug( "table create:\n# %s" % (statement ) )
        
    try:
        cc = dbhandle.cursor()
        cc.execute(statement)
        cc.close()
    except error, msg:
        options.stderr.write( "table creation failed: statement=\n  %s\n" % (statement ) )
        raise error, msg

    E.info("table %s created successfully." % options.tablename )
    
    return take, map_column2type, ignored

def main( argv = sys.argv ):

    parser = optparse.OptionParser( version = "%prog version: $Id$", usage = globals()["__doc__"] )

    parser.add_option("-D", "--database", dest="database", type="string",          
                      help="tablename to use [default=%default]."  )
    
    parser.add_option("-t", "--trees", dest="table_name_trees", type="string",          
                      help="tablename with trees [default=%default]."  )

    parser.add_option("-r", "--parts", dest="table_name_parts", type="string",          
                      help="tablename with trees [default=%default]."  )

    parser.add_option("-b", "--bench", dest="table_name_bench", type="string",          
                      help="domain table to be benchmarked (for example: nrdb40_domo_domains_nr) [default=%default]."  )

    parser.add_option("-f", "--reference", dest="table_name_reference", type="string",          
                      help="table of reference table (for example: nrdb40_scop_domains_nr) [default=%default]."  )

    parser.add_option( "--bin-size", dest="bin_size", type="int",          
                      help="bin size [default=%default]."  )

    parser.add_option( "-o", "--resolution", dest="resolution", type="float",          
                      help="resolution for scaling of domains [default=%default]."  )

    parser.add_option("-s", "--switch", dest="switch", action = "store_true",
                      help="switch between coverage of reference and size ratio if coverage is 1 [default=%default]."  )

    parser.add_option("-k", "--skip-repeats", dest="skip_repeats", action = "store_true",
                      help="[default=%default]."  )

    parser.add_option("-m", "--skip-tms", dest="skip_tms", action = "store_true",
                      help="discard domains which contain transmembrane regions [default=%default]."  )

    parser.add_option("-e", "--check-selection", dest="check_selection", action = "store_true",
                      help="[default=%default]."  )

    parser.add_option("-q", "--quality", dest="quality", action = "store_true",
                      help="take only sequences which are curated [default=%default]."  )

    parser.add_option( "--no-full-length", dest="no_full_length", action = "store_true",
                      help="[default=%default]."  )

    parser.add_option( "--only-full-length", dest="only_full_length", action = "store_true",
                      help="[default=%default]."  )

    parser.add_option( "--check-if-comparable", dest="check_if_comparable", action = "store_true",
                      help="perform comparable check according to Islam95 (default level 85%) [default=%default]."  )

    parser.add_option( "--subset", dest="subset", type = "string",
                       help = "use only a subset of nids [default=%default]" )

    parser.set_defaults( 
        database = "pairsdb",
        table_name_reference = None,
        table_name_trees = None,
        table_name_parts = None,
        table_name_bench = None,
        resolution = None,
        loglevel = 1,
        min_overlap = 1,
        switch = 0,
        combine_repeats = 1,
        skip_repeats = 0,
        skip_tms = 0,
        discard_full_length = 0,
        check_selection = 0,
        selection_threshold = 0.9,
        quality = None,
        no_full_length = None,
        only_full_length = None,
        ## a full length domain should cover at least 90% of a sequence
        min_length_ratio = 0.9,
        check_comparable = None,
        check_comparable_level = 0.85,
        bin_size = 1,
        subset = None )

    (options, args) = E.Start( parser, 
                               argv = argv, 
                               add_output_options = True )

    dbhandle = Pairsdb()
    dbhandle.Connect( dbname =  options.database )
    
    tbl_reference = TableDomains(dbhandle, "generic")
    tbl_reference.SetName(options.table_name_reference)
    
    # tbl_masks = Table_nrdb90_masks(dbhandle)
    tbl_nrdb = Table_nrdb( dbhandle )

    # todo: encapsulate this with a parameter
    tbl_nrdb.name = "nrdb40"

    if options.table_name_trees:

        nids_statement = '''SELECT DISTINCT t.nid 
                            FROM %s AS t, %s AS s %%s WHERE t.nid = s.nid %%s''' %\
                         (options.table_name_trees, 
                          options.table_name_reference)

        if options.quality:
            nids_statement = nids_statement % (", nrdb_quality AS q", "AND q.nid = s.nid AND q.is_curated = 'T'")
        else:
            nids_statement = nids_statement % ("","")
            
        statement = """
        SELECT t.node, t.parent, t.level, t.start, t.end,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (t.end - t.start)) AS cov_dom,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (%(end)i - %(start)i)) AS cov_ref,
        ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
        AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """ 

        tablename = options.table_name_trees
        
    elif options.table_name_parts or options.table_name_bench:

        if options.table_name_parts:
            table_name = options.table_name_parts
        else:
            table_name = options.table_name_bench

        if options.subset:
            nids_statement = '''SELECT DISTINCT s.nid 
                                FROM %s AS s, %s AS t 
                                WHERE t.nid = s.nid''' % (options.subset, table_name)
        else:
            nids_statement = '''SELECT DISTINCT s.nid 
                                FROM %s AS s, 
                                     %s AS r %%s 
                                 WHERE r.nid = s.nid %%s''' %\
                             (table_name, options.table_name_reference)

            if options.quality:
                nids_statement = nids_statement % (", nrdb_quality AS q", "AND q.nid = s.nid AND q.is_curated = 'T'")
            else:
                nids_statement = nids_statement % ("","")

        statement = """
        SELECT 1, 0, 0, t.start, t.end,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (t.end - t.start)) AS cov_dom,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (%(end)i - %(start)i)) AS cov_ref,
        ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
        AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """

        tablename = table_name

    else:
        print "what shall I compare?"
        sys.exit(1)

    if options.check_selection:
        selection_statement = """
        SELECT t.domain_from, t.domain_to,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (GREATEST( t.domain_to, %(end)i) - LEAST( t.domain_from, %(start)i))) AS ovl,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (t.domain_to - t.domain_from)) AS cov_dom,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (%(end)i - %(start)i)) AS cov_ref,
        ((t.domain_to - t.domain_from) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(selection_tablename)s AS t
        WHERE t.domain_nid = %(nid)i
        AND (LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """
        selection_tablename = options.table_name_parts

        options.table_name_parts = None
        
        parts_same_as_trees, parts_larger_than_trees, parts_smaller_than_trees, parts_much_smaller_than_trees =  0,0,0,0

    min_overlap = options.min_overlap    

    nids = map(lambda x:x[0], dbhandle.Execute(nids_statement).fetchall())

    overlaps = []
    cov_doms = []
    cov_refs = []
    touched  = {}

    # the same header serves both modes; in check-selection mode additional
    # columns describing the selected parts are appended to each row
    options.stdout.write( "NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n" )

    E.info( "--> processing %i nids" % len(nids) )

    nskipped_no_assignments = 0
    nskipped_no_overlap = 0
    nskipped_wrong_domaintype = 0
    nfound = 0
    
    it = 0
    for nid in nids:

        it += 1

        E.debug( "--> processing %i" % nid )

        domains = tbl_reference.GetDomainBoundariesForNid( nid )

        length = tbl_nrdb.GetLength( nid )
        
        if not domains:
            nskipped_no_assignments +=1
            continue

        if options.no_full_length and len(domains) == 1:
            ## check if domain is actually full length, otherwise keep
            id, domain_from, domain_to = domains[0]
            if float(domain_to-domain_from) / float(length) >= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue
            
        if options.only_full_length:
            if len(domains) == 1:
                id, domain_from, domain_to = domains[0]
                if float(domain_to-domain_from) / float(length) <= options.min_length_ratio:
                    nskipped_wrong_domaintype += 1
                    continue
            else:
                nskipped_wrong_domaintype += 1                
                continue

        nfound += 1
        
        last_id = None
        x = 0

        # iteration over domains in reference
        while x < len(domains):
            
            id, domain_from, domain_to = domains[x]
                
            ##########################################################
            # process repeats
            is_repeat = -1
            
            while x < len(domains) and domains[x][0] == id:
                domain_to = domains[x][2]
                x += 1
                is_repeat += 1

            if options.skip_repeats and is_repeat:
                continue

            # if options.skip_tms and tbl_masks.HasMask( nid, 2, domain_from, domain_to):
            #    continue

            ##########################################################
            ## apply resolution
            if options.resolution:
                start = int(float(domain_from-1)/options.resolution)
                end   = int(float(domain_to-1)/options.resolution) + 1
            else:
                start = domain_from
                end   = domain_to

            E.debug( "processing domain %s_%i_%i (scaled: %i-%i)" % \
                         ( id, domain_from, domain_to, start, end))

            ##########################################################
            ## get best matching domain
            s = statement % locals() 

            if options.loglevel >= 4: print s
            
            result = dbhandle.Execute(s).fetchone()
            
            if not result:
                nskipped_no_overlap += 1
                continue

            node, parent, level, start, end, overlap, cov_dom, cov_ref, rat_ref = result

            key = "%i-%s-%i-%i" % (nid, id, start, end)
            if touched.has_key(key):
                continue
            else:
                touched[key] = 1

            # discard full length domains
            if options.discard_full_length:
                if options.table_name_trees:            
                    if node == 0: continue
                else:
                    if length == end - start: continue
            
            if options.switch and cov_ref == 1.0:
                xcov_ref = rat_ref
            else:
                xcov_ref = cov_ref
                
            # check, if selection did take a domain lower or further up
            if options.check_selection:
                start = (start * 10) + 1
                end   = min(end * 10 + 1, length)

                s = selection_statement % locals()
                result = dbhandle.Execute(s).fetchone()

                if result:
                    parts_from, parts_to, ovl_parts, cov_parts, cov_tree, rat_parts = result


                    if rat_parts > 1.0:
                        parts_larger_than_trees += 1
                        token = ">"
                    elif rat_parts == 1.0:
                        parts_same_as_trees += 1
                        token = "="
                    else:
                        parts_smaller_than_trees += 1
                        token = "<"
                        if rat_parts < options.selection_threshold:
                            parts_much_smaller_than_trees += 1

                    # report the rescaled coordinates (start, end) alongside
                    # the original domain and the selected part
                    options.stdout.write(string.join(map(str, (nid,
                                                               id, domain_from, domain_to,
                                                               level,
                                                               start, end,
                                                               parts_from, parts_to,
                                                               overlap, cov_dom, cov_ref, rat_ref, xcov_ref,
                                                               ovl_parts, cov_parts, cov_tree, rat_parts,
                                                               token)), "\t") + "\n")
                    
            else:
                options.stdout.write(string.join(map(str, (nid, node, parent, level, start, end,
                                                           id,
                                                           start, end,
                                                           overlap, cov_dom, cov_ref, 
                                                           rat_ref, xcov_ref)), "\t") + "\n")
                
                overlaps.append( int(overlap * 100) )
                cov_doms.append( int(cov_dom * 100) )
                cov_refs.append( int(xcov_ref * 100) )            


    E.info( "skipped nids because of no overlap with reference: %i" % nskipped_no_overlap )
    E.info( "skipped nids because of no assignments: %i" % nskipped_no_assignments )
    E.info( "skipped nids because of wrong domain type: %i" % nskipped_wrong_domaintype)
    E.info( "nids in comparison: %i" % nfound)
        
    if options.check_selection:
        E.info( " parts larger than trees=", parts_larger_than_trees )
        E.info( " parts like trees=", parts_same_as_trees )
        E.info( " parts smaller than trees=", parts_smaller_than_trees )
        E.info( " parts much smaller than trees (<%f)=" % options.selection_threshold, parts_much_smaller_than_trees )
    else:
        outfile_stats = E.openOutputFile( "stats" )
        outfile_stats.write("section\t%s\n" % Stats.Summary().getHeader())
        outfile_stats.write("overlaps\t%s\n" % str( Stats.Summary( overlaps ) ) )
        outfile_stats.write("domain_coverage\t%s\n" % str( Stats.Summary( cov_doms ) ) )
        outfile_stats.write("reference_coverage\t%s\n" % str( Stats.Summary( cov_refs ) ) )
        outfile_stats.close()

        outfile = E.openOutputFile( "overlaps.histogram" )
        outfile.write( "bin\tcounts\n")
        Histogram.Write(outfile, 
                        Histogram.Calculate( overlaps, 
                                             min_value=0, 
                                             increment=1, 
                                             no_empty_bins = True))
        outfile.close()

        outfile = E.openOutputFile( "domain_coverage.histogram" )
        outfile.write( "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n" )
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                        Histogram.Calculate( cov_doms, 
                                             min_value=0, 
                                             increment=options.bin_size, 
                                             no_empty_bins = True)))
        outfile.close()

        outfile = E.openOutputFile( "reference_coverage.histogram" )
        outfile.write( "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n" )
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                    Histogram.Calculate( cov_refs, 
                                         min_value=0, 
                                         increment=options.bin_size, 
                                         no_empty_bins = True)))
                        
        outfile.close()
    
    E.Stop()
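A note on the scoring used by the SQL statements above: candidate domains are
ranked by the ratio of intersection to union of the two intervals (ovl), plus
the coverage of each side. A minimal plain-Python sketch of the same
arithmetic (a hypothetical helper for illustration, not part of the original
script):

def interval_scores(dom_start, dom_end, ref_start, ref_end):
    # length of the intersection and of the union of the two intervals
    inter = min(dom_end, ref_end) - max(dom_start, ref_start)
    union = max(dom_end, ref_end) - min(dom_start, ref_start)
    ovl = float(inter) / union                       # intersection over union
    cov_dom = float(inter) / (dom_end - dom_start)   # coverage of the domain
    cov_ref = float(inter) / (ref_end - ref_start)   # coverage of the reference
    rat_ref = float(dom_end - dom_start) / (ref_end - ref_start)
    return ovl, cov_dom, cov_ref, rat_ref

# interval_scores(10, 50, 20, 60) -> (0.6, 0.75, 0.75, 1.0)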
Example #20
0
def main():
    
    parser = optparse.OptionParser( version = "%prog version: $Id$", 
                                    usage = globals()["__doc__"])

    parser.add_option( "-n", "--nids", dest="filename_nids", type="string",
                       help="filename with nids[default=%default].")

    parser.add_option( "-c", "--column", dest="columns", type="int", action="append",
                       help="columns with nids to translate (1-based) [default=%default].")

    parser.add_option( "-d", "--is-domains", dest="is_domains", action="store_true",
                       help="translate domain ids [default=%default].")

    parser.add_option( "-i", "--invert", dest="invert", action="store_true",
                       help="invert mapping [default=%default].")

    parser.add_option( "-e", "--no-header", dest="no_header", action="store_true",
                       help="file has no header [default=%default].")

    parser.set_defaults( 
        filename_nids = "adda.nids",
        columns = [],
        is_domains = False,
        invert = False,
        no_header = False,
        )
    
    (options, args) = E.Start( parser )
    
    map_nid2pid = AddaIO.readMapPid2Nid( open(options.filename_nids, "r") )
    if options.invert:
        E.info( "inverting mapping" )
        map_nid2pid = dict( [ (int(x[1]),str(x[0])) for x in map_nid2pid.iteritems()] )

    if len(options.columns) == 0: options.columns = [1]
    columns = [x-1 for x in options.columns ]

    toTuple, toDomain = AddaIO.toTuple, AddaIO.toDomain
    first = not options.no_header
    is_domains = options.is_domains
    ninput, noutput, nskipped = 0, 0, 0
    for line in options.stdin:
        if line.startswith("#"):
            options.stdout.write(line)
            continue

        if first:
            options.stdout.write(line)
            first = False
            continue
        
        ninput += 1

        data = line[:-1].split("\t")
        for x in columns:
            if is_domains:
                try:
                    d = toTuple(data[x])
                except ValueError:
                    E.warn( "could not parse domain `%s`" % data[x])
                    nskipped += 1
                    break

                try:
                    data[x] = toDomain( (str(map_nid2pid[d[0]]),d[1],d[2]) )
                except (IndexError, KeyError):
                    E.warn( "could not map domain `%s`" % data[x])
                    nskipped += 1
                    break
            else:
                try:
                    data[x] = str(map_nid2pid[int(data[x])])
                except (IndexError, KeyError):
                    E.warn( "could not map nid `%s`" % data[x])
                    nskipped += 1
                    break
        else:
            options.stdout.write("%s\n" % "\t".join(data))
            noutput += 1

    E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))
    E.Stop()
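The column-translation loop above leans on Python's for/else: the else clause
runs only when the loop was not left via break, so a row is written only if
every requested column mapped. A self-contained sketch of that control flow
(hypothetical names, for illustration only):

def translate_row(fields, mapping, columns):
    # translate the given 0-based columns in place;
    # return the row, or None if any column fails to map
    for x in columns:
        try:
            fields[x] = mapping[fields[x]]
        except KeyError:
            break                  # abort this row
    else:
        return fields              # no break: every column mapped
    return None

# translate_row(["1", "a"], {"1": "P12345"}, [0]) -> ["P12345", "a"]
# translate_row(["9", "a"], {"1": "P12345"}, [0]) -> None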
Example #21
0
            id, acc, len = rx_head.match(record[0]).groups()
        except AttributeError, msg:
            E.warn("parsing error in line `%s`" % record[0])
            nskipped += 1
            continue

        if options.no_swissprot_version: acc = acc.split(".")[0]
        for line in record[1:]:
            # no Pfam-B
            if line.startswith("Pfam-B"): continue
            name, family, description, coordinates = rx_domain.match(
                line).groups()

            for c in coordinates.split(" "):
                start, end = [int(x) for x in c.split("-")]
                start -= 1
                options.stdout.write(
                    options.prefix +
                    "\t".join(map(str, (acc, start, end, family))) + "\n")
                ndomains += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, ndomains=%i, nerrors=%i" %
           (ninput, noutput, ndomains, nskipped))

    E.Stop()


if __name__ == "__main__":
    sys.exit(main())
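The parser above splits whitespace-separated start-end tokens and shifts the
start to 0-based coordinates. A small sketch of that conversion (hypothetical
helper, assuming the "23-120 130-200" token format handled by the loop):

def parse_coordinates(coordinates):
    # "23-120 130-200" -> [(22, 120), (129, 200)], i.e. 0-based starts
    out = []
    for c in coordinates.split(" "):
        start, end = [int(x) for x in c.split("-")]
        out.append((start - 1, end))
    return out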
Example #22
0
def main():
    global L
    
    parser = optparse.OptionParser( version = "%prog version: $Id$", usage = USAGE )

    parser.add_option( "--config", dest="filename_config", type="string",
                      help="configuration file [default=%default].")

    parser.add_option( "--force", dest="force", action="store_true",
                      help="overwrite existing files [default=%default].")

    parser.add_option( "--continue", dest="append", action="store_true",
                      help="continue from an aborted run and append to existing files [default=%default].")

    parser.add_option( "--test", dest="test", type="int",
                      help="run a test with first # sequences [default=%default]")

    parser.add_option( "--num-jobs", dest="num_jobs", type="int",
                      help="use # processes. If not set, the number of CPUs/cores is taken [default=%default]")
    
    parser.add_option( "--chunks", dest="chunks", type="string",
                       help = "work on one or more chunks only. Provide a comma-separated list. [default=%default]" )

    parser.add_option( "--command", dest="command", type="choice",
                       choices=(
                                "sequences",
                                "blast",
                                "fit", 
                                "graph",
                                "index",
                                "check-index",
                                "profiles",
                                "segment", 
                                "optimise",
                                "convert",
                                "mst", 
                                "mst-components", 
                                "align",
                                "cluster", 
                                "realign",
                                "families", 
                                "stats",
                                "summary"),
                       help="perform a command [default=%default]" )

    parser.add_option( "--start-at", dest="start_at", type="string",
                      help="start at sequence [default=%default]")

    parser.add_option( "--stop-at", dest="stop_at", type="string",
                      help="stop at sequenec [default=%default]")

    parser.set_defaults( 
                        filename_config = "adda.ini",
                        command = None,
                        start_at = None,
                        stop_at = None,
                        force = False,
                        append = False,
                        test = None,
                        num_jobs = None,
                        chunks = "all",
                        )
    
    (options, args) = E.Start( parser )

    # setup logging
    if options.loglevel == 0:
        lvl = logging.ERROR
    elif options.loglevel == 1:
        lvl = logging.INFO
    else:
        lvl = logging.DEBUG

    logQueue = multiprocessing.Queue(100)
    handler = Logger.MultiProcessingLogHandler(logging.FileHandler( "adda.log", "a"), logQueue)
    handler.setFormatter( 
        logging.Formatter( '%(asctime)s pid=%(process)-8d %(name)-12s %(levelname)-8s %(message)s',
                           datefmt='%m-%d %H:%M' ) )
    logging.getLogger('adda').addHandler(handler)
    logging.getLogger('adda').setLevel( lvl )

    E.setLogger( logging.getLogger( "adda" ) )
    L = logging.getLogger( "adda" ) 

    config = AddaIO.ConfigParser()
    config.read( os.path.expanduser( options.filename_config ) )
    
    if len(args) == 0:
        if not options.command: raise ValueError("specify at least one command")
    elif len(args) == 1:
        options.command = args[0]
    else: 
        raise ValueError("one command line argument is sufficient.")        

    ## collect modules and initialise them         
    map_module = { 'fit' : AddaFit.AddaFit,
                   'segment' : AddaSegment.AddaSegment,
                   'blast' : AddaBlast.AddaBlast,
                   'graph' : AddaGraph.AddaGraph,
                   'stats' : AddaStats.AddaStats,
                   'profiles' : AddaProfiles.AddaProfiles, 
                   'realign' : AddaAlign.AddaRealign,
                   'index' : AddaIndex.AddaIndexBuild,
                   'check-index' : AddaIndex.AddaIndexCheck,
                   'optimise' : AddaOptimise.AddaOptimise,  
                   'sequences' : AddaSequences.AddaSequences,
                   'convert' : AddaConvert.AddaConvert,
                   'mst' : AddaMst.AddaMst, 
                   'mst-components' : AddaComponentsMst.AddaComponentsMst, 
                   'align' : AddaAlign.AddaAlign, 
                   'cluster' : AddaCluster.AddaCluster,
                   'families' : AddaFamilies.AddaFamilies,
                   'summary' : AddaSummary.AddaSummary,
                   }

    try:
        fasta = IndexedFasta.IndexedFasta( config.get( "files", "output_fasta", "adda" ) )
    except KeyError:
        fasta = None
    
    if options.num_jobs == 1: 
        run_parallel = runSequentially
    else:
        run_parallel = runParallel

    kwargs = {
        "loglevel" : options.loglevel,
        "append" : options.append,
        "force": options.force }

    if options.command == "index":
        module = map_module[options.command](config, fasta = fasta, **kwargs )
        if module.isComplete():
            E.info("output of command `%s` present and complete" % options.command )
        else:
            filename_graph = config.get( "files", "input_graph", "pairsdb_40x40.links.gz")
            if "," in filename_graph:
                filename_graph = filename_graph.split(",")
                # permit parallel processing of multiple files
                run_parallel( 
                    run_on_files,
                    filename = filename_graph,
                    options = options,
                    module = map_module[options.command],
                    config = config,
                    kwargs = kwargs,
                    )
                
                nchunks = len( filename_graph )

                module = map_module[options.command]( config, 
                                                      chunk = 0,
                                                      num_chunks = nchunks, 
                                                      **kwargs )
                
                if not module.isComplete():                 
                    L.info( "merging" )

                    if not module.merge():
                        raise ValueError("error while merging for `%s`" % options.command )

            else:
                # process single file - no hassle.
                module.startUp()
                module.run()
                module.finish()

    if options.command in ("sequences", "stats", 
                           "optimise",
                           "convert", 
                           "mst", "mst-components", "cluster", "families",
                           "summary" ):
        module = map_module[options.command]( config, 
                                              fasta = fasta,
                                              **kwargs )
        if module.isComplete():
            E.info("output of command `%s` present and complete" % options.command )
        else:
            module.startUp()
            module.run()
            module.finish()

    elif options.command in ("fit", "segment"): 

        run_on_graph = RunOnGraph( config, options.command )

        run_parallel( 
            run_on_graph,
            filename = config.get( "files", "input_graph", "adda.graph" ),
            options = options,
            module = map_module[options.command],
            config = config,
            kwargs = kwargs )
        
        if not merge( options,
                      module = map_module[options.command],
                      config = config,
                      fasta = fasta ):
            E.Stop()
            return

    elif options.command in ("align" ):

        run_parallel( 
            run_on_file,
            filename = config.get( "files", "output_mst", "adda.mst" ),
            options = options,
            module = map_module[options.command],
            config = config,
            kwargs = kwargs )

        merge( options,
               module = map_module[options.command],
               config = config,
               fasta = fasta )

    elif options.command in ("realign" ):

        run_parallel( 
            run_on_file,
            filename = config.get( "files", "output_align", "adda.align" ),
            options = options,
            module = map_module[options.command],
            config = config,
            kwargs = kwargs )

        merge( options,
               module = map_module[options.command],
               config = config,
               fasta = fasta )
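The driver above dispatches every command through the same small module
lifecycle: isComplete() makes reruns cheap, then startUp(), run() and
finish() do the work. A stripped-down sketch of that pattern (class and
function names are hypothetical, loosely modelled on the interface the
driver expects):

class ExampleModule(object):
    """Skeleton of the lifecycle used by the dispatch table."""
    def __init__(self, config, **kwargs):
        self.config = config
    def isComplete(self):
        return False    # e.g. check that the output file already exists
    def startUp(self):
        pass            # open output files, load indices
    def run(self):
        pass            # perform the actual computation
    def finish(self):
        pass            # write summaries and close files

def dispatch(command, map_module, config, **kwargs):
    # instantiate the module registered for `command` and run it once
    module = map_module[command](config, **kwargs)
    if module.isComplete():
        return          # output present and complete - nothing to do
    module.startUp()
    module.run()
    module.finish()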
Example #23
0
                continue
            except IndexError:
                continue
            
            if mi is None:
                mi = v
            else:
                mi = min(v, mi)
            if ma is None:
                ma = v
            else:
                ma = max(v, ma)

        options.min_weight = mi
        options.max_weight = ma
        E.info( "using automatic weight range from %f to %f\n" % (options.min_weight, options.max_weight) )
    else:
        lines = sys.stdin
        options.min_weight, options.max_weight = map(float, options.weight_range.split(","))

    options.stdout.write( "graph: {\n" )
    
    if options.filename_format_graph:
        options.stdout.write( "".join(open(options.filename_format_graph, "r").readlines() ) )
    else:
        options.stdout.write( FORMAT_GRAPH )

        if options.add_edge_labels:
            options.stdout.write( "display_edge_labels: yes\n" )
        
    left_nodes = {}
def main(argv=sys.argv):

    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-D",
                      "--database",
                      dest="database",
                      type="string",
                      help="tablename to use [default=%default].")

    parser.add_option("-t",
                      "--trees",
                      dest="table_name_trees",
                      type="string",
                      help="tablename with trees [default=%default].")

    parser.add_option("-r",
                      "--parts",
                      dest="table_name_parts",
                      type="string",
                      help="tablename with trees [default=%default].")

    parser.add_option(
        "-b",
        "--bench",
        dest="table_name_bench",
        type="string",
        help=
        "domain table to be benchmarked (for example: nrdb40_domo_domains_nr) [default=%default]."
    )

    parser.add_option(
        "-f",
        "--reference",
        dest="table_name_reference",
        type="string",
        help=
        "table of reference table (for example: nrdb40_scop_domains_nr) [default=%default]."
    )

    parser.add_option("--bin-size",
                      dest="bin_size",
                      type="int",
                      help="bin size [default=%default].")

    parser.add_option(
        "-o",
        "--resolution",
        dest="resolution",
        type="float",
        help="resolution for scaling of domains [default=%default].")

    parser.add_option(
        "-s",
        "--switch",
        dest="switch",
        action="store_true",
        help=
        "switch between coverage of reference and size ratio if coverage is 1 [default=%default]."
    )

    parser.add_option("-k",
                      "--skip-repeats",
                      dest="skip_repeats",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option(
        "-m",
        "--skip-tms",
        dest="skip_tms",
        action="store_true",
        help=
        "discard domains which contain transmembrane regions [default=%default]."
    )

    parser.add_option("-e",
                      "--check-selection",
                      dest="check_selection",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option(
        "-q",
        "--quality",
        dest="quality",
        action="store_true",
        help="take only sequences which are curated [default=%default].")

    parser.add_option("--no-full-length",
                      dest="no_full_length",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option("--only-full-length",
                      dest="only_full_length",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option(
        "--check-if-comparable",
        dest="check_if_comparable",
        action="store_true",
        help=
        "perform comparable check according to Islam95 (default level 85%) [default=%default]."
    )

    parser.add_option("--subset",
                      dest="subset",
                      type="string",
                      help="use only a subset of nids [default=%default]")

    parser.set_defaults(
        database="pairsdb",
        table_name_reference=None,
        table_name_trees=None,
        table_name_parts=None,
        table_name_bench=None,
        resolution=None,
        loglevel=1,
        min_overlap=1,
        switch=0,
        combine_repeats=1,
        skip_repeats=0,
        skip_tms=0,
        discard_full_length=0,
        check_selection=0,
        selection_threshold=0.9,
        quality=None,
        no_full_length=None,
        only_full_length=None,
        ## a full length domain should cover at least 90% of a sequence
        min_length_ratio=0.9,
        check_comparable=None,
        check_comparable_level=0.85,
        bin_size=1,
        subset=None)

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    dbhandle = Pairsdb()
    dbhandle.Connect(dbname=options.database)

    tbl_reference = TableDomains(dbhandle, "generic")
    tbl_reference.SetName(options.table_name_reference)

    # tbl_masks = Table_nrdb90_masks(dbhandle)
    tbl_nrdb = Table_nrdb(dbhandle)

    # todo: encapsulate this with a parameter
    tbl_nrdb.name = "nrdb40"

    if options.table_name_trees:

        nids_statement = '''SELECT DISTINCT t.nid 
                            FROM %s AS t, %s AS s %%s WHERE t.nid = s.nid %%s''' %\
                         (options.table_name_trees,
                          options.table_name_reference)

        if options.quality:
            nids_statement = nids_statement % (
                ", nrdb_quality AS q",
                "AND q.nid = s.nid AND q.is_curated = 'T'")
        else:
            nids_statement = nids_statement % ("", "")

        statement = """
        SELECT t.node, t.parent, t.level, t.start, t.end,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (t.end - t.start)) AS cov_dom,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (%(end)i - %(start)i)) AS cov_ref,
        ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
        AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """

        tablename = options.table_name_trees

    elif options.table_name_parts or options.table_name_bench:

        if options.table_name_parts:
            table_name = options.table_name_parts
        else:
            table_name = options.table_name_bench

        if options.subset:
            nids_statement = '''SELECT DISTINCT s.nid 
                                FROM %s AS s, %s AS t 
                                WHERE t.nid = s.nid''' % (options.subset,
                                                          table_name)
        else:
            nids_statement = '''SELECT DISTINCT s.nid 
                                FROM %s AS s, 
                                     %s AS r %%s 
                                 WHERE r.nid = s.nid %%s''' %\
                             (table_name, options.table_name_reference)

            if options.quality:
                nids_statement = nids_statement % (
                    ", nrdb_quality AS q",
                    "AND q.nid = s.nid AND q.is_curated = 'T'")
            else:
                nids_statement = nids_statement % ("", "")

        statement = """
        SELECT 1, 0, 0, t.start, t.end,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (t.end - t.start)) AS cov_dom,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (%(end)i - %(start)i)) AS cov_ref,
        ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
        AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """

        tablename = table_name

    else:
        print "what shall I compare?"
        sys.exit(1)

    if options.check_selection:
        selection_statement = """
        SELECT t.domain_from, t.domain_to,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (GREATEST( t.domain_to, %(end)i) - LEAST( t.domain_from, %(start)i))) AS ovl,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (t.domain_to - t.domain_from)) AS cov_dom,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (%(end)i - %(start)i)) AS cov_ref,
        ((t.domain_to - t.domain_from) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(selection_tablename)s AS t
        WHERE t.domain_nid = %(nid)i
        AND (LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """
        selection_tablename = options.table_name_parts

        options.table_name_parts = None

        parts_same_as_trees, parts_larger_than_trees, parts_smaller_than_trees, parts_much_smaller_than_trees = 0, 0, 0, 0

    min_overlap = options.min_overlap

    nids = map(lambda x: x[0], dbhandle.Execute(nids_statement).fetchall())

    overlaps = []
    cov_doms = []
    cov_refs = []
    touched = {}

    # the same header serves both modes; in check-selection mode additional
    # columns describing the selected parts are appended to each row
    options.stdout.write(
        "NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n"
    )

    E.info("--> processing %i nids" % len(nids))

    nskipped_no_assignments = 0
    nskipped_no_overlap = 0
    nskipped_wrong_domaintype = 0
    nfound = 0

    it = 0
    for nid in nids:

        it += 1

        E.debug("--> processing %i" % nid)

        domains = tbl_reference.GetDomainBoundariesForNid(nid)

        length = tbl_nrdb.GetLength(nid)

        if not domains:
            nskipped_no_assignments += 1
            continue

        if options.no_full_length and len(domains) == 1:
            ## check if domain is actually full length, otherwise keep
            id, domain_from, domain_to = domains[0]
            if float(domain_to -
                     domain_from) / float(length) >= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue

        if options.only_full_length:
            if len(domains) == 1:
                id, domain_from, domain_to = domains[0]
                if float(domain_to - domain_from) / float(
                        length) <= options.min_length_ratio:
                    nskipped_wrong_domaintype += 1
                    continue
            else:
                nskipped_wrong_domaintype += 1
                continue

        nfound += 1

        last_id = None
        x = 0

        # iteration over domains in reference
        while x < len(domains):

            id, domain_from, domain_to = domains[x]

            ##########################################################
            # process repeats
            is_repeat = -1

            while x < len(domains) and domains[x][0] == id:
                domain_to = domains[x][2]
                x += 1
                is_repeat += 1

            if options.skip_repeats and is_repeat:
                continue

            # if options.skip_tms and tbl_masks.HasMask( nid, 2, domain_from, domain_to):
            #    continue

            ##########################################################
            ## apply resolution
            if options.resolution:
                start = int(float(domain_from - 1) / options.resolution)
                end = int(float(domain_to - 1) / options.resolution) + 1
            else:
                start = domain_from
                end = domain_to

            E.debug( "processing domain %s_%i_%i (scaled: %i-%i)" % \
                         ( id, domain_from, domain_to, start, end))

            ##########################################################
            ## get best matching domain
            s = statement % locals()

            if options.loglevel >= 4: print s

            result = dbhandle.Execute(s).fetchone()

            if not result:
                nskipped_no_overlap += 1
                continue

            node, parent, level, start, end, overlap, cov_dom, cov_ref, rat_ref = result

            key = "%i-%s-%i-%i" % (nid, id, start, end)
            if touched.has_key(key):
                continue
            else:
                touched[key] = 1

            # discard full length domains
            if options.discard_full_length:
                if options.table_name_trees:
                    if node == 0: continue
                else:
                    if length == end - start: continue

            if options.switch and cov_ref == 1.0:
                xcov_ref = rat_ref
            else:
                xcov_ref = cov_ref

            # check, if selection did take a domain lower or further up
            if options.check_selection:
                start = (start * 10) + 1
                end = min(end * 10 + 1, length)

                s = selection_statement % locals()
                result = dbhandle.Execute(s).fetchone()

                if result:
                    parts_from, parts_to, ovl_parts, cov_parts, cov_tree, rat_parts = result

                    if rat_parts > 1.0:
                        parts_larger_than_trees += 1
                        token = ">"
                    elif rat_parts == 1.0:
                        parts_same_as_trees += 1
                        token = "="
                    else:
                        parts_smaller_than_trees += 1
                        token = "<"
                        if rat_parts < options.selection_threshold:
                            parts_much_smaller_than_trees += 1

                    # report the rescaled coordinates (start, end) alongside
                    # the original domain and the selected part
                    options.stdout.write(
                        string.join(
                            map(str, (nid, id, domain_from, domain_to, level,
                                      start, end, parts_from, parts_to,
                                      overlap, cov_dom, cov_ref, rat_ref,
                                      xcov_ref, ovl_parts, cov_parts, cov_tree,
                                      rat_parts, token)), "\t") + "\n")

            else:
                options.stdout.write(
                    string.join(
                        map(str, (nid, node, parent, level, start, end, id,
                                  start, end, overlap, cov_dom, cov_ref,
                                  rat_ref, xcov_ref)), "\t") + "\n")

                overlaps.append(int(overlap * 100))
                cov_doms.append(int(cov_dom * 100))
                cov_refs.append(int(xcov_ref * 100))

    E.info("skipped nids because of no overlap with reference: %i" %
           nskipped_no_overlap)
    E.info("skipped nids because of no assignments: %i" %
           nskipped_no_assignments)
    E.info("skipped nids because of wrong domain type: %i" %
           nskipped_wrong_domaintype)
    E.info("nids in comparison: %i" % nfound)

    if options.check_selection:
        E.info(" parts larger than trees=", parts_larger_than_trees)
        E.info(" parts like trees=", parts_same_as_trees)
        E.info(" parts smaller than trees=", parts_smaller_than_trees)
        E.info(
            " parts much smaller than trees (<%f)=" %
            options.selection_threshold, parts_much_smaller_than_trees)
    else:
        outfile_stats = E.openOutputFile("stats")
        outfile_stats.write("section\t%s\n" % Stats.Summary().getHeader())
        outfile_stats.write("overlaps\t%s\n" % str(Stats.Summary(overlaps)))
        outfile_stats.write("domain_coverage\t%s\n" %
                            str(Stats.Summary(cov_doms)))
        outfile_stats.write("reference_coverage\t%s\n" %
                            str(Stats.Summary(cov_refs)))
        outfile_stats.close()

        outfile = E.openOutputFile("overlaps.histogram")
        outfile.write("bin\tcounts\n")
        Histogram.Write(
            outfile,
            Histogram.Calculate(overlaps,
                                min_value=0,
                                increment=1,
                                no_empty_bins=True))
        outfile.close()

        outfile = E.openOutputFile("domain_coverage.histogram")
        outfile.write(
            "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n"
        )
        Histogram.Write(
            outfile,
            Histogram.AddRelativeAndCumulativeDistributions(
                Histogram.Calculate(cov_doms,
                                    min_value=0,
                                    increment=options.bin_size,
                                    no_empty_bins=True)))
        outfile.close()

        outfile = E.openOutputFile("reference_coverage.histogram")
        outfile.write(
            "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n"
        )
        Histogram.Write(
            outfile,
            Histogram.AddRelativeAndCumulativeDistributions(
                Histogram.Calculate(cov_refs,
                                    min_value=0,
                                    increment=options.bin_size,
                                    no_empty_bins=True)))

        outfile.close()

    E.Stop()
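The histogram tables written above pair raw counts with relative and
cumulative columns. As a rough plain-Python sketch of what such a table
holds (an assumption about the Histogram helpers, which are not shown here):

def histogram_with_cumulative(values, increment=1):
    # bin the values, then add relative and cumulative frequency columns
    counts = {}
    for v in values:
        b = (v // increment) * increment
        counts[b] = counts.get(b, 0) + 1
    total = float(len(values))
    rows, cumul = [], 0
    for b in sorted(counts):
        cumul += counts[b]
        rows.append((b, counts[b], counts[b] / total,
                     cumul, cumul / total))
    return rows    # columns: bin, counts, freq, cumul_counts, cumul_freq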