Ejemplo n.º 1
0
def main( argv = sys.argv ):
    """Apply a simple operation to an ADDA alignment graph read from stdin.

    The operation is chosen with ``--method``:

    translate
        Replace the nid prefix of the domain identifiers in the second and
        third tab-separated column by the id it maps to in the file given
        with ``--filename-map``. Lines starting with ``#`` or ``passed``
        are copied unchanged.
    add-family
        Append the family of the query and sbjct domain of each tested
        link, as listed in the file given with ``--filename-families``
        (``na`` if a domain is not listed).
    shortest-path
        Write the edges along a shortest path between ``--node1`` and
        ``--node2``.
    components
        Write the connected components of the graph.

    :param argv: command line arguments handed to ``E.Start``.
    :raises ValueError: if the file required by ``translate`` or
        ``add-family`` was not given on the command line.
    """

    parser = optparse.OptionParser( version = "%prog version: $Id$", usage = globals()["__doc__"] )

    # dest has to match the name used in set_defaults() and read further
    # down; with "graph-format" the value was stored where nothing looked.
    parser.add_option( "-o", "--format", dest="graph_format", type="choice",
                       choices=("alignments",),
                       help="graph format [default=%default].")

    parser.add_option( "-m", "--method", dest="method", type="choice",
                       choices=("shortest-path", "translate", "components", "add-family" ),
                       help="methods to apply [default=%default].")

    parser.add_option( "-a", "--filename-map", dest="filename_map", type="string",
                       help="filename mapping ids to nids (used for translation) [default=%default].")

    parser.add_option( "-1", "--node1", dest="node1", type="string",
                       help="first node for path calculation [default=%default].")

    parser.add_option( "-2", "--node2", dest="node2", type="string",
                       help="second node for path calculation [default=%default].")

    parser.add_option( "-f", "--filename-families", dest="filename_families", type="string",
                       help="filename with domain families [default=%default].")

    parser.set_defaults(
        method = None,
        graph_format = "alignments",
        filename_map = None,
        node1 = None,
        node2 = None,
        filename_families = None,
        )

    (options, args) = E.Start( parser,
                               argv = argv )

    # domain ("nid_start_end") -> family; stays None if no file was given
    map_domain2family = None
    if options.filename_families is not None:
        E.info( "reading families from %s" % options.filename_families )
        map_domain2family = {}
        with open( options.filename_families, "r" ) as infile:
            for line in infile:
                # skip comments and the header line
                if line.startswith( "#" ): continue
                if line.startswith( "nid" ): continue
                nid, start, end, family = line[:-1].split("\t")
                pid = bytes("%s_%s_%s" % (nid,start,end))
                map_domain2family[pid] = bytes(family)
        E.info( "read %i domains" % len(map_domain2family))

    if options.method == "translate":

        if not options.filename_map:
            # without the map every translated line would fail on an
            # undefined name; fail early with a usable message instead
            raise ValueError( "method 'translate' requires --filename-map" )

        E.info("reading map from %s" % options.filename_map)
        map_id2nid = AddaIO.readMapId2Nid( open( options.filename_map, "r") )
        map_nid2id = dict( (v, k) for k, v in map_id2nid.items() )

        def translate_alignments( line ):
            """Return *line* with the nids in fields 2 and 3 replaced by ids."""
            if line.startswith("passed"): return line
            data = line.split( "\t" )

            for column in (1, 2):
                parts = data[column].split("_")
                try:
                    data[column] = "%s_%s_%s" % (map_nid2id[int(parts[0])], parts[1], parts[2])
                except KeyError:
                    sys.stderr.write("could not map: %s\n" % str(parts) )
                    raise

            return "\t".join(data)

        if options.graph_format == "alignments":
            translator = translate_alignments

        for line in options.stdin:
            if not line.startswith("#"):
                line = translator( line )
            options.stdout.write(line)

        E.Stop()
        return

    elif options.method == "add-family":

        if map_domain2family is None:
            raise ValueError( "method 'add-family' requires --filename-families" )

        options.stdout.write( "%s\tqfamily\tsfamily\n" % ("\t".join( AddaIO.TestedLink._fields)))
        for link in AddaIO.iterate_tested_links( options.stdin ):
            qfamily = map_domain2family.get(link.qdomain,"na")
            sfamily = map_domain2family.get(link.sdomain,"na")
            options.stdout.write( "%s\t%s\t%s\n" % ("\t".join(map(str,link)),
                                                    qfamily,
                                                    sfamily))
        E.Stop()
        return

    t = time.time()
    if options.graph_format == "alignments":
        map_vertex2id, map_id2vertex, G = readAlignmentGraph( options.stdin )

    E.info( "graph read in %i seconds" % (time.time() - t ))
    t = time.time()

    if options.method == "shortest-path":
        source = map_vertex2id[options.node1]
        target = map_vertex2id[options.node2]

        E.debug( "shortest path between %s:%i and %s:%i" % \
                     (options.node1, source, options.node2, target ) )

        paths = G.get_shortest_paths( source, to = (target,) )

        # NOTE(review): the result is indexed by the target's vertex id as
        # in the original code. If the graph library returns one path per
        # requested target this should probably be paths[0] - confirm.
        p = paths[target]
        if len(p) == 0:
            # nothing to walk; the original went on to index p[0] and
            # died with an IndexError
            E.info( "no path between %s:%i and %s:%i" % \
                        (options.node1, source, options.node2, target ) )
        else:
            previous, last_node = p[0], map_id2vertex[p[0]]

            for x in p[1:]:
                node = map_id2vertex[x]
                ei = G.get_eid(x, previous)

                options.stdout.write( "%s\t%s\t%s\n" %\
                                      (last_node, node,
                                       G.es[ei]["info"]) )
                previous, last_node = x, node

    elif options.method == "components":
        # write through options.stdout like every other method does
        options.stdout.write( "component\tnode\n" )
        for component_id, component in enumerate(nx.connected_components( G )):
            for c in component:
                options.stdout.write( "%i\t%s\n" % (component_id, c) )

    E.info( "%s: %i seconds" % (options.method, time.time() - t ))
    E.Stop()
Ejemplo n.º 2
0
def main():
    """Inspect neighbours, alignments and profiles in an ADDA graph.

    The positional arguments are sequence identifiers (``nid``) or domains
    (``nid_start_end``). ``--method`` selects the operation:

    view
        Write the neighbours of every nid given.
    pileup
        Write the multiple alignment built from the neighbours of the
        first argument.
    profile
        Write the profile built from that multiple alignment, restricted
        to the segment if the argument is a domain.
    align
        Align the profiles of the first two arguments (both domains) and
        write the alignment.

    :raises ValueError: if fewer positional arguments are given than the
        chosen method needs.
    """

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option(
        "--method",
        dest="method",
        type="choice",
        choices=("view", "align", "pileup", "profile"),
        help="method to perform [default=%default].",
    )

    parser.add_option(
        "--mode", dest="mode", type="choice", choices=("global", "local"), help="alignment mode [default=%default]."
    )

    parser.add_option("--gop", dest="gop", type="float", help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float", help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    # Fail with a readable message instead of an IndexError further down.
    required = {"pileup": 1, "profile": 1, "align": 2}.get(options.method, 0)
    if len(args) < required:
        raise ValueError(
            "method '%s' requires %i argument(s), but %i given" % (options.method, required, len(args))
        )

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph, options.filename_index)

    toolkit = alignlib.getDefaultToolkit()
    toolkit.setEncoder(alignlib.getEncoder(alignlib.Protein20))
    toolkit.setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    toolkit.setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    toolkit.setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parse_target(arg):
        """Return (nid, start, end); start/end are None for a plain nid."""
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    def _build_profile(nid, start, end):
        """Build the prepared profile of *nid*, restricted to start-end if given."""
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        return prof

    if options.method == "view":
        # the original converted args[0] on every iteration and so showed
        # the first nid once per argument
        for arg in args:
            for n in index.getNeighbours(int(arg)):
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":

        # a segment, if given, is parsed but not applied to the pileup
        nid, start, end = _parse_target(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        nid, start, end = _parse_target(args[0])
        prof = _build_profile(nid, start, end)
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        prof1 = _build_profile(nid1, start1, end1)
        seq1 = alignlib.makeSequence(fasta.getSequence(nid1))
        prof2 = _build_profile(nid2, start2, end2)
        seq2 = alignlib.makeSequence(fasta.getSequence(nid2))

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n"
            % (
                nid1,
                nid2,
                result.getScore(),
                result.getLength(),
                result.getNumGaps(),
                result.getRowFrom(),
                result.getRowTo(),
                result.getColFrom(),
                result.getColTo(),
            )
        )

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
def main( argv = sys.argv ):
    """Compare the domains in a database table against a reference table.

    For every sequence (nid) with domains in the reference table the best
    overlapping domain in the table under test (``--trees``, ``--parts`` or
    ``--bench``) is looked up. One line per reference domain is written to
    stdout with the overlap and coverage values. Unless
    ``--check-selection`` is set, summary statistics and histograms of the
    percentages are written to the output files ``stats``,
    ``overlaps.histogram``, ``domain_coverage.histogram`` and
    ``reference_coverage.histogram``.

    The process exits with status 1 if none of ``--trees``, ``--parts`` or
    ``--bench`` is given.

    :param argv: command line arguments handed to ``E.Start``.
    """

    parser = optparse.OptionParser( version = "%prog version: $Id$", usage = globals()["__doc__"] )

    parser.add_option("-D", "--database", dest="database", type="string",
                      help="database to use [default=%default]."  )

    parser.add_option("-t", "--trees", dest="table_name_trees", type="string",
                      help="tablename with trees [default=%default]."  )

    parser.add_option("-r", "--parts", dest="table_name_parts", type="string",
                      help="tablename with parts [default=%default]."  )

    parser.add_option("-b", "--bench", dest="table_name_bench", type="string",
                      help="domain table to be benchmarked (for example: nrdb40_domo_domains_nr) [default=%default]."  )

    parser.add_option("-f", "--reference", dest="table_name_reference", type="string",
                      help="table of reference table (for example: nrdb40_scop_domains_nr) [default=%default]."  )

    parser.add_option( "--bin-size", dest="bin_size", type="int",
                      help="bin size [default=%default]."  )

    parser.add_option( "-o", "--resolution", dest="resolution", type="float",
                      help="resolution for scaling of domains [default=%default]."  )

    parser.add_option("-s", "--switch", dest="switch", action = "store_true",
                      help="switch between coverage of reference and size ratio if coverage is 1 [default=%default]."  )

    parser.add_option("-k", "--skip-repeats", dest="skip_repeats", action = "store_true",
                      help="[default=%default]."  )

    parser.add_option("-m", "--skip-tms", dest="skip_tms", action = "store_true",
                      help="discard domains which contain transmembrane regions [default=%default]."  )

    parser.add_option("-e", "--check-selection", dest="check_selection", action = "store_true",
                      help="[default=%default]."  )

    parser.add_option("-q", "--quality", dest="quality", action = "store_true",
                      help="take only sequences which are curated [default=%default]."  )

    parser.add_option( "--no-full-length", dest="no_full_length", action = "store_true",
                      help="[default=%default]."  )

    parser.add_option( "--only-full-length", dest="only_full_length", action = "store_true",
                      help="[default=%default]."  )

    parser.add_option( "--check-if-comparable", dest="check_if_comparable", action = "store_true",
                      help="perform comparable check according to Islam95 (default level 85%) [default=%default]."  )

    parser.add_option( "--subset", dest="subset", type = "string",
                       help = "use only a subset of nids [default=%default]" )

    parser.set_defaults(
        database = "pairsdb",
        table_name_reference = None,
        table_name_trees = None,
        table_name_parts = None,
        table_name_bench = None,
        resolution = None,
        loglevel = 1,
        min_overlap = 1,
        switch = 0,
        combine_repeats = 1,
        skip_repeats = 0,
        skip_tms = 0,
        discard_full_length = 0,
        check_selection = 0,
        selection_threshold = 0.9,
        quality = None,
        no_full_length = None,
        only_full_length = None,
        ## a full length domain should cover at least 90% of a sequence
        min_length_ratio = 0.9,
        check_comparable = None,
        check_comparable_level = 0.85,
        bin_size = 1,
        subset = None )

    (options, args) = E.Start( parser,
                               argv = argv,
                               add_output_options = True )

    dbhandle = Pairsdb()
    dbhandle.Connect( dbname =  options.database )

    tbl_reference = TableDomains(dbhandle, "generic")
    tbl_reference.SetName(options.table_name_reference)

    # tbl_masks = Table_nrdb90_masks(dbhandle)
    tbl_nrdb = Table_nrdb( dbhandle )

    # todo: encapsulate this with a parameter
    tbl_nrdb.name = "nrdb40"

    # optional join/filter restricting the nids to curated sequences
    if options.quality:
        quality_clauses = (", nrdb_quality AS q", "AND q.nid = s.nid AND q.is_curated = 'T'")
    else:
        quality_clauses = ("", "")

    if options.table_name_trees:

        nids_statement = '''SELECT DISTINCT t.nid 
                            FROM %s AS t, %s AS s %%s WHERE t.nid = s.nid %%s''' %\
                         (options.table_name_trees,
                          options.table_name_reference)

        nids_statement = nids_statement % quality_clauses

        statement = """
        SELECT t.node, t.parent, t.level, t.start, t.end,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (t.end - t.start)) AS cov_dom,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (%(end)i - %(start)i)) AS cov_ref,
        ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
        AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """

        tablename = options.table_name_trees

    elif options.table_name_parts or options.table_name_bench:

        if options.table_name_parts:
            table_name = options.table_name_parts
        else:
            table_name = options.table_name_bench

        if options.subset:
            nids_statement = '''SELECT DISTINCT s.nid 
                                FROM %s AS s, %s AS t 
                                WHERE t.nid = s.nid''' % (options.subset, table_name)
        else:
            nids_statement = '''SELECT DISTINCT s.nid 
                                FROM %s AS s, 
                                     %s AS r %%s 
                                 WHERE r.nid = s.nid %%s''' %\
                             (table_name, options.table_name_reference)

            nids_statement = nids_statement % quality_clauses

        statement = """
        SELECT 1, 0, 0, t.start, t.end,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (t.end - t.start)) AS cov_dom,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / 
               (%(end)i - %(start)i)) AS cov_ref,
        ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
        AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """

        tablename = table_name

    else:
        print("what shall I compare?")
        sys.exit(1)

    if options.check_selection:
        # Two fixes against the original statement: a stray "i" after
        # "(t.domain_to - t.domain_from)" made the SQL invalid, and the
        # overlap filter compared against %(start)i twice so that it could
        # never exceed min_overlap.
        selection_statement = """
        SELECT t.domain_from, t.domain_to,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (GREATEST( t.domain_to, %(end)i) - LEAST( t.domain_from, %(start)i))) AS ovl,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (t.domain_to - t.domain_from)) AS cov_dom,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / 
           (%(end)i - %(start)i)) AS cov_ref,
        ((t.domain_to - t.domain_from) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(selection_tablename)s AS t
        WHERE t.domain_nid = %(nid)i
        AND (LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """
        selection_tablename = options.table_name_parts

        options.table_name_parts = None

        parts_same_as_trees, parts_larger_than_trees, parts_smaller_than_trees, parts_much_smaller_than_trees =  0,0,0,0

    min_overlap = options.min_overlap

    nids = [ row[0] for row in dbhandle.Execute(nids_statement).fetchall() ]

    overlaps = []
    cov_doms = []
    cov_refs = []
    # keys of (nid, reference id, matched start, matched end) already reported
    touched  = set()

    # NOTE(review): the same header is written in both modes although the
    # check-selection rows below carry more columns - confirm intended names.
    options.stdout.write( "NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n" )

    E.info( "--> processing %i nids" % len(nids) )

    nskipped_no_assignments = 0
    # NOTE(review): never incremented anywhere, so always reported as 0.
    nskipped_no_overlap = 0
    nskipped_wrong_domaintype = 0
    nfound = 0

    for nid in nids:

        E.debug( "--> processing %i" % nid )

        domains = tbl_reference.GetDomainBoundariesForNid( nid )

        length = tbl_nrdb.GetLength( nid )

        if not domains:
            nskipped_no_assignments +=1
            continue

        if options.no_full_length and len(domains) == 1:
            ## check if domain is actually full length, otherwise keep
            domain_id, domain_from, domain_to = domains[0]
            if float(domain_to-domain_from) / float(length) >= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue

        if options.only_full_length:
            if len(domains) == 1:
                domain_id, domain_from, domain_to = domains[0]
                if float(domain_to-domain_from) / float(length) <= options.min_length_ratio:
                    nskipped_wrong_domaintype += 1
                    continue
            else:
                nskipped_wrong_domaintype += 1
                continue

        nfound += 1

        x = 0

        # iteration over domains in reference
        while x < len(domains):

            domain_id, domain_from, domain_to = domains[x]

            ##########################################################
            # process repeats: consecutive entries with the same id are
            # merged into one domain reaching to the end of the last one
            is_repeat = -1

            while x < len(domains) and domains[x][0] == domain_id:
                domain_to = domains[x][2]
                x += 1
                is_repeat += 1

            if options.skip_repeats and is_repeat:
                continue

            # if options.skip_tms and tbl_masks.HasMask( nid, 2, domain_from, domain_to):
            #    continue

            ##########################################################
            ## apply resolution
            if options.resolution:
                start = int(float(domain_from-1)/options.resolution)
                end   = int(float(domain_to-1)/options.resolution) + 1
            else:
                start = domain_from
                end   = domain_to

            E.debug( "processing domain %s_%i_%i (scaled: %i-%i)" % \
                         ( domain_id, domain_from, domain_to, start, end))

            ##########################################################
            ## get best matching domain
            # explicit parameters instead of locals(), so that renaming a
            # local variable can not silently change the query
            s = statement % { "nid" : nid,
                              "start" : start,
                              "end" : end,
                              "tablename" : tablename,
                              "min_overlap" : min_overlap }

            if options.loglevel >= 4: print(s)

            result = dbhandle.Execute(s).fetchone()

            if not result: continue

            # from here on start/end are the coordinates of the matched domain
            node, parent, level, start, end, overlap, cov_dom, cov_ref, rat_ref = result

            key = "%i-%s-%i-%i" % (nid, domain_id, start, end)
            if key in touched:
                continue
            touched.add( key )

            # discard full length domains
            if options.discard_full_length:
                if options.table_name_trees:
                    if node == 0: continue
                else:
                    if length == end - start: continue

            if options.switch and cov_ref == 1.0:
                xcov_ref = rat_ref
            else:
                xcov_ref = cov_ref

            # check, if selection did take a domain lower or further up
            if options.check_selection:
                start = (start * 10) + 1
                end   = min(end * 10 + 1, length)

                # NOTE(review): yfrom/yto were never assigned in the original
                # (NameError on the first matching row). The scaled
                # coordinates of the matched domain are assumed - confirm.
                yfrom, yto = start, end

                s = selection_statement % { "nid" : nid,
                                            "start" : start,
                                            "end" : end,
                                            "selection_tablename" : selection_tablename,
                                            "min_overlap" : min_overlap }
                result = dbhandle.Execute(s).fetchone()

                if result:
                    parts_from, parts_to, ovl_parts, cov_parts, cov_tree, rat_parts = result

                    if rat_parts > 1.0:
                        parts_larger_than_trees += 1
                        token = ">"
                    elif rat_parts == 1.0:
                        parts_same_as_trees += 1
                        token = "="
                    else:
                        parts_smaller_than_trees += 1
                        token = "<"
                        if rat_parts < options.selection_threshold:
                            parts_much_smaller_than_trees += 1

                    options.stdout.write( "\t".join(map(str, (nid,
                                                              domain_id, domain_from, domain_to,
                                                              level,
                                                              yfrom, yto,
                                                              parts_from, parts_to,
                                                              overlap, cov_dom, cov_ref, rat_ref, xcov_ref,
                                                              ovl_parts, cov_parts, cov_tree, rat_parts,
                                                              token))) + "\n")

            else:
                # NOTE(review): start/end are written for both DFROM/DTO and
                # RFROM/RTO; the latter may have been meant to be
                # domain_from/domain_to. Output kept as in the original.
                options.stdout.write( "\t".join(map(str, (nid, node, parent, level, start, end,
                                                          domain_id,
                                                          start, end,
                                                          overlap, cov_dom, cov_ref,
                                                          rat_ref, xcov_ref))) + "\n")

                overlaps.append( int(overlap * 100) )
                cov_doms.append( int(cov_dom * 100) )
                cov_refs.append( int(xcov_ref * 100) )


    E.info( "skipped nids because of no overlap with reference: %i" % nskipped_no_overlap )
    E.info( "skipped nids because of no assignments: %i" % nskipped_no_assignments )
    E.info( "skipped nids because of wrong domain type: %i" % nskipped_wrong_domaintype)
    E.info( "nids in comparison: %i" % nfound)

    if options.check_selection:
        # format the counts into the message; the original passed them as
        # a second argument without a placeholder in the message
        E.info( " parts larger than trees=%i" % parts_larger_than_trees )
        E.info( " parts like trees=%i" % parts_same_as_trees )
        E.info( " parts smaller than trees=%i" % parts_smaller_than_trees )
        E.info( " parts much smaller than trees (<%f)=%i" % \
                    (options.selection_threshold, parts_much_smaller_than_trees ) )
    else:
        outfile_stats = E.openOutputFile( "stats" )
        outfile_stats.write("section\t%s\n" % Stats.Summary().getHeader())
        outfile_stats.write("overlaps\t%s\n" % str( Stats.Summary( overlaps ) ) )
        outfile_stats.write("domain_coverage\t%s\n" % str( Stats.Summary( cov_doms ) ) )
        outfile_stats.write("reference_coverage\t%s\n" % str( Stats.Summary( cov_refs ) ) )
        outfile_stats.close()

        outfile = E.openOutputFile( "overlaps.histogram" )
        outfile.write( "bin\tcounts\n")
        Histogram.Write(outfile,
                        Histogram.Calculate( overlaps,
                                             min_value=0,
                                             increment=1,
                                             no_empty_bins = True))
        outfile.close()

        outfile = E.openOutputFile( "domain_coverage.histogram" )
        outfile.write( "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n" )
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                        Histogram.Calculate( cov_doms,
                                             min_value=0,
                                             increment=options.bin_size,
                                             no_empty_bins = True)))
        outfile.close()

        outfile = E.openOutputFile( "reference_coverage.histogram" )
        outfile.write( "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n" )
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                    Histogram.Calculate( cov_refs,
                                         min_value=0,
                                         increment=options.bin_size,
                                         no_empty_bins = True)))

        outfile.close()

    E.Stop()
Ejemplo n.º 4
0
    ## delete old table if it exists
    try:
        cc = dbhandle.cursor()
        cc.execute("DROP TABLE %s" % options.tablename)
        cc.close()
        if options.loglevel >= 1:
            options.stdlog.write( "# existing table %s deleted\n" % options.tablename )
    except error, msg:
        dbhandle.rollback()
    except error, msg:
        pass

    ## create new table
    statement = "CREATE TABLE %s ( %s );" % (options.tablename, ", ".join( columns))

    E.debug( "table create:\n# %s" % (statement ) )
        
    try:
        cc = dbhandle.cursor()
        cc.execute(statement)
        cc.close()
    except error, msg:
        options.stderr.write( "table creation failed: statement=\n  %s\n" % (statement ) )
        raise error, msg

    E.info("table %s created successfully." % options.tablename )
    
    return take, map_column2type, ignored

def main():
Ejemplo n.º 5
0
        cc = dbhandle.cursor()
        cc.execute("DROP TABLE %s" % options.tablename)
        cc.close()
        if options.loglevel >= 1:
            options.stdlog.write("# existing table %s deleted\n" %
                                 options.tablename)
    except error, msg:
        dbhandle.rollback()
    except error, msg:
        pass

    ## create new table
    statement = "CREATE TABLE %s ( %s );" % (options.tablename,
                                             ", ".join(columns))

    E.debug("table create:\n# %s" % (statement))

    try:
        cc = dbhandle.cursor()
        cc.execute(statement)
        cc.close()
    except error, msg:
        options.stderr.write("table creation failed: statement=\n  %s\n" %
                             (statement))
        raise error, msg

    E.info("table %s created successfully." % options.tablename)

    return take, map_column2type, ignored

Ejemplo n.º 6
0
def main():
    """Inspect neighbours, alignments and profiles in an ADDA graph.

    The positional arguments are sequence identifiers (``nid``) or domains
    (``nid_start_end``). ``--method`` selects the operation:

    view
        Write the neighbours of every nid given.
    pileup
        Write the multiple alignment built from the neighbours of the
        first argument.
    profile
        Write the profile built from that multiple alignment, restricted
        to the segment if the argument is a domain.
    align
        Align the profiles of the first two arguments (both domains) and
        write the alignment.

    :raises ValueError: if fewer positional arguments are given than the
        chosen method needs.
    """

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode",
                      dest="mode",
                      type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop",
                      dest="gop",
                      type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep",
                      dest="gep",
                      type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    # Fail with a readable message instead of an IndexError further down.
    required = {"pileup": 1, "profile": 1, "align": 2}.get(options.method, 0)
    if len(args) < required:
        raise ValueError("method '%s' requires %i argument(s), but %i given" %
                         (options.method, required, len(args)))

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    toolkit = alignlib.getDefaultToolkit()
    toolkit.setEncoder(alignlib.getEncoder(alignlib.Protein20))
    toolkit.setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    toolkit.setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    toolkit.setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parse_target(arg):
        """Return (nid, start, end); start/end are None for a plain nid."""
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    def _build_profile(nid, start, end):
        """Build the prepared profile of *nid*, restricted to start-end if given."""
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        return prof

    if options.method == "view":
        # the original converted args[0] on every iteration and so showed
        # the first nid once per argument
        for arg in args:
            for n in index.getNeighbours(int(arg)):
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":

        # a segment, if given, is parsed but not applied to the pileup
        nid, start, end = _parse_target(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        nid, start, end = _parse_target(args[0])
        prof = _build_profile(nid, start, end)
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop,
                                                 options.gep)

        prof1 = _build_profile(nid1, start1, end1)
        seq1 = alignlib.makeSequence(fasta.getSequence(nid1))
        prof2 = _build_profile(nid2, start2, end2)
        seq2 = alignlib.makeSequence(fasta.getSequence(nid2))

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write( "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %\
                                  (nid1, nid2,
                                   result.getScore(),
                                   result.getLength(),
                                   result.getNumGaps(),
                                   result.getRowFrom(), result.getRowTo(),
                                   result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
def main(argv=sys.argv):
    """Benchmark a table of domains against a table of reference domains.

    The set of sequences (nids) to process is selected with an SQL join
    between the benchmarked table (``--trees``, ``--parts`` or ``--bench``)
    and the reference table (``--reference``), optionally restricted to
    curated sequences (``--quality``) or to a subset table (``--subset``).

    For every reference domain of every selected nid, the best overlapping
    domain in the benchmarked table is looked up with a single SQL query
    and written as one tab-separated row to ``options.stdout``.

    Without ``--check-selection``, summary statistics and histograms of
    overlap, domain coverage and reference coverage are written to the
    output files ``stats``, ``overlaps.histogram``,
    ``domain_coverage.histogram`` and ``reference_coverage.histogram``
    (opened through ``E.openOutputFile``).  With ``--check-selection``,
    the matched domain is additionally compared against the ``--parts``
    table and only counts are logged.

    Parameters
    ----------
    argv : list of str
        Command line arguments; defaults to ``sys.argv``.

    The process exits with status 1 if neither ``--trees``, ``--parts``
    nor ``--bench`` is given.
    """

    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-D", "--database", dest="database", type="string",
                      help="tablename to use [default=%default].")

    parser.add_option("-t", "--trees", dest="table_name_trees",
                      type="string",
                      help="tablename with trees [default=%default].")

    # The help text previously said "trees" (copy-paste from --trees).
    parser.add_option("-r", "--parts", dest="table_name_parts",
                      type="string",
                      help="tablename with parts [default=%default].")

    parser.add_option(
        "-b", "--bench", dest="table_name_bench", type="string",
        help="domain table to be benchmarked (for example: "
        "nrdb40_domo_domains_nr) [default=%default].")

    parser.add_option(
        "-f", "--reference", dest="table_name_reference", type="string",
        help="table of reference table (for example: "
        "nrdb40_scop_domains_nr) [default=%default].")

    parser.add_option("--bin-size", dest="bin_size", type="int",
                      help="bin size [default=%default].")

    parser.add_option(
        "-o", "--resolution", dest="resolution", type="float",
        help="resolution for scaling of domains [default=%default].")

    parser.add_option(
        "-s", "--switch", dest="switch", action="store_true",
        help="switch between coverage of reference and size ratio "
        "if coverage is 1 [default=%default].")

    parser.add_option("-k", "--skip-repeats", dest="skip_repeats",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option(
        "-m", "--skip-tms", dest="skip_tms", action="store_true",
        help="discard domains which contain transmembrane regions "
        "[default=%default].")

    parser.add_option("-e", "--check-selection", dest="check_selection",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option(
        "-q", "--quality", dest="quality", action="store_true",
        help="take only sequences which are curated [default=%default].")

    parser.add_option("--no-full-length", dest="no_full_length",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option("--only-full-length", dest="only_full_length",
                      action="store_true",
                      help="[default=%default].")

    parser.add_option(
        "--check-if-comparable", dest="check_if_comparable",
        action="store_true",
        help="perform comparable check according to Islam95 "
        "(default level 85%) [default=%default].")

    parser.add_option("--subset", dest="subset", type="string",
                      help="use only a subset of nids [default=%default]")

    parser.set_defaults(
        database="pairsdb",
        table_name_reference=None,
        table_name_trees=None,
        table_name_parts=None,
        table_name_bench=None,
        resolution=None,
        loglevel=1,
        min_overlap=1,
        switch=0,
        combine_repeats=1,
        skip_repeats=0,
        skip_tms=0,
        discard_full_length=0,
        check_selection=0,
        selection_threshold=0.9,
        quality=None,
        no_full_length=None,
        only_full_length=None,
        # a full length domain should cover at least 90% of a sequence
        min_length_ratio=0.9,
        # ``check_comparable`` is kept for backward compatibility; the
        # option itself stores into ``check_if_comparable``.
        check_comparable=None,
        check_if_comparable=None,
        check_comparable_level=0.85,
        bin_size=1,
        subset=None)

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    dbhandle = Pairsdb()
    dbhandle.Connect(dbname=options.database)

    tbl_reference = TableDomains(dbhandle, "generic")
    tbl_reference.SetName(options.table_name_reference)

    # tbl_masks = Table_nrdb90_masks(dbhandle)
    tbl_nrdb = Table_nrdb(dbhandle)

    # todo: encapsulate this with a parameter
    tbl_nrdb.name = "nrdb40"

    # Optional extra table / condition restricting nids to curated entries.
    if options.quality:
        quality_clause = (", nrdb_quality AS q",
                          "AND q.nid = s.nid AND q.is_curated = 'T'")
    else:
        quality_clause = ("", "")

    ##################################################################
    # decide which table is benchmarked and how nids are selected
    if options.table_name_trees:

        nids_statement = '''SELECT DISTINCT t.nid 
                            FROM %s AS t, %s AS s %%s WHERE t.nid = s.nid %%s''' %\
                         (options.table_name_trees,
                          options.table_name_reference)
        nids_statement = nids_statement % quality_clause

        # trees carry node/parent/level information
        match_columns = "t.node, t.parent, t.level"
        tablename = options.table_name_trees

    elif options.table_name_parts or options.table_name_bench:

        tablename = options.table_name_parts or options.table_name_bench

        if options.subset:
            nids_statement = '''SELECT DISTINCT s.nid 
                                FROM %s AS s, %s AS t 
                                WHERE t.nid = s.nid''' % (options.subset,
                                                          tablename)
        else:
            nids_statement = '''SELECT DISTINCT s.nid 
                                FROM %s AS s, 
                                     %s AS r %%s 
                                 WHERE r.nid = s.nid %%s''' %\
                             (tablename, options.table_name_reference)
            nids_statement = nids_statement % quality_clause

        # flat domain tables have no tree structure: use constant
        # placeholders for node, parent and level
        match_columns = "1, 0, 0"

    else:
        print("what shall I compare?")
        sys.exit(1)

    # Best overlapping domain in the benchmarked table for the reference
    # segment start..end: overlap, coverage of the domain, coverage of
    # the reference and the size ratio domain/reference.
    statement = """
        SELECT %(columns)s, t.start, t.end,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
               (GREATEST(t.end, %(end)i) - LEAST(t.start, %(start)i))) AS ovl,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
               (t.end - t.start)) AS cov_dom,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
               (%(end)i - %(start)i)) AS cov_ref,
        ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
        AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """

    selection_statement = None
    selection_tablename = None
    parts_same_as_trees = 0
    parts_larger_than_trees = 0
    parts_smaller_than_trees = 0
    parts_much_smaller_than_trees = 0

    if options.check_selection:
        # Same query as above, but against the domain_from/domain_to
        # columns of the parts table.  Fixed: a stray "i" after the
        # cov_dom denominator and the use of %(start)i instead of
        # %(end)i in the overlap filter.
        selection_statement = """
        SELECT t.domain_from, t.domain_to,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
           (GREATEST(t.domain_to, %(end)i) - LEAST(t.domain_from, %(start)i))) AS ovl,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
           (t.domain_to - t.domain_from)) AS cov_dom,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
           (%(end)i - %(start)i)) AS cov_ref,
        ((t.domain_to - t.domain_from) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(selection_tablename)s AS t
        WHERE t.domain_nid = %(nid)i
        AND (LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """
        selection_tablename = options.table_name_parts
        options.table_name_parts = None

    min_overlap = options.min_overlap

    nids = [row[0] for row in dbhandle.Execute(nids_statement).fetchall()]

    overlaps = []
    cov_doms = []
    cov_refs = []
    # keys of (nid, reference id, matched segment) already reported
    touched = set()

    # NOTE(review): the same header is written in both modes, although
    # rows written with --check-selection carry more columns than the
    # header names - confirm the intended header for that mode.
    options.stdout.write(
        "NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n"
    )

    E.info("--> processing %i nids" % len(nids))

    nskipped_no_assignments = 0
    nskipped_no_overlap = 0
    nskipped_wrong_domaintype = 0
    nfound = 0

    for nid in nids:

        E.debug("--> processing %i" % nid)

        domains = tbl_reference.GetDomainBoundariesForNid(nid)
        length = tbl_nrdb.GetLength(nid)

        if not domains:
            nskipped_no_assignments += 1
            continue

        if options.no_full_length and len(domains) == 1:
            # check if domain is actually full length, otherwise keep
            _, domain_from, domain_to = domains[0]
            if float(domain_to - domain_from) / float(length) \
                    >= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue

        if options.only_full_length:
            if len(domains) != 1:
                nskipped_wrong_domaintype += 1
                continue
            _, domain_from, domain_to = domains[0]
            if float(domain_to - domain_from) / float(length) \
                    <= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue

        nfound += 1

        x = 0

        # iteration over domains in reference
        while x < len(domains):

            domain_id, domain_from, domain_to = domains[x]

            ##########################################################
            # process repeats: consecutive entries with the same id are
            # merged into one segment ending at the last entry's end.
            # is_repeat is the number of additional entries merged.
            is_repeat = -1
            while x < len(domains) and domains[x][0] == domain_id:
                domain_to = domains[x][2]
                x += 1
                is_repeat += 1

            if options.skip_repeats and is_repeat:
                continue

            # if options.skip_tms and tbl_masks.HasMask( nid, 2, domain_from, domain_to):
            #    continue

            ##########################################################
            # apply resolution
            if options.resolution:
                ref_start = int(float(domain_from - 1) / options.resolution)
                ref_end = int(float(domain_to - 1) / options.resolution) + 1
            else:
                ref_start = domain_from
                ref_end = domain_to

            E.debug("processing domain %s_%i_%i (scaled: %i-%i)" %
                    (domain_id, domain_from, domain_to, ref_start, ref_end))

            ##########################################################
            # get best matching domain
            query = statement % {
                "columns": match_columns,
                "tablename": tablename,
                "nid": nid,
                "start": ref_start,
                "end": ref_end,
                "min_overlap": min_overlap,
            }

            if options.loglevel >= 4:
                print(query)

            result = dbhandle.Execute(query).fetchone()

            if not result:
                # counted per reference domain without a matching domain
                nskipped_no_overlap += 1
                continue

            (node, parent, level, match_start, match_end,
             overlap, cov_dom, cov_ref, rat_ref) = result

            # report each matched segment only once per reference id
            key = "%i-%s-%i-%i" % (nid, domain_id, match_start, match_end)
            if key in touched:
                continue
            touched.add(key)

            # discard full length domains
            if options.discard_full_length:
                if options.table_name_trees:
                    if node == 0:
                        continue
                elif length == match_end - match_start:
                    continue

            if options.switch and cov_ref == 1.0:
                xcov_ref = rat_ref
            else:
                xcov_ref = cov_ref

            # check, if selection did take a domain lower or further up
            if options.check_selection:
                # NOTE(review): the factor 10 is hard-coded here and
                # presumably mirrors the resolution of the trees table -
                # confirm.
                sel_start = (match_start * 10) + 1
                sel_end = min(match_end * 10 + 1, length)

                query = selection_statement % {
                    "selection_tablename": selection_tablename,
                    "nid": nid,
                    "start": sel_start,
                    "end": sel_end,
                    "min_overlap": min_overlap,
                }
                result = dbhandle.Execute(query).fetchone()

                if result:
                    (parts_from, parts_to, ovl_parts,
                     cov_parts, cov_tree, rat_parts) = result

                    if rat_parts > 1.0:
                        parts_larger_than_trees += 1
                        token = ">"
                    elif rat_parts == 1.0:
                        parts_same_as_trees += 1
                        token = "="
                    else:
                        parts_smaller_than_trees += 1
                        token = "<"
                        if rat_parts < options.selection_threshold:
                            parts_much_smaller_than_trees += 1

                    # The original referenced the undefined names
                    # yfrom/yto here; the rescaled match coordinates used
                    # for the selection query are written instead.
                    fields = (nid, domain_id, domain_from, domain_to, level,
                              sel_start, sel_end, parts_from, parts_to,
                              overlap, cov_dom, cov_ref, rat_ref,
                              xcov_ref, ovl_parts, cov_parts, cov_tree,
                              rat_parts, token)
                    options.stdout.write(
                        "\t".join(map(str, fields)) + "\n")

            else:
                # NOTE(review): the RFROM/RTO columns repeat the matched
                # segment rather than the reference coordinates
                # (domain_from/domain_to); kept unchanged - confirm which
                # is intended.
                fields = (nid, node, parent, level, match_start, match_end,
                          domain_id, match_start, match_end,
                          overlap, cov_dom, cov_ref, rat_ref, xcov_ref)
                options.stdout.write("\t".join(map(str, fields)) + "\n")

                # values are collected as integer percentages
                overlaps.append(int(overlap * 100))
                cov_doms.append(int(cov_dom * 100))
                cov_refs.append(int(xcov_ref * 100))

    E.info("skipped nids because of no overlap with reference: %i" %
           nskipped_no_overlap)
    E.info("skipped nids because of no assignments: %i" %
           nskipped_no_assignments)
    E.info("skipped nids because of wrong domain type: %i" %
           nskipped_wrong_domaintype)
    E.info("nids in comparison: %i" % nfound)

    if options.check_selection:
        # counts are formatted into the message instead of being passed
        # as a separate positional argument
        E.info(" parts larger than trees=%i" % parts_larger_than_trees)
        E.info(" parts like trees=%i" % parts_same_as_trees)
        E.info(" parts smaller than trees=%i" % parts_smaller_than_trees)
        E.info(" parts much smaller than trees (<%f)=%i" %
               (options.selection_threshold,
                parts_much_smaller_than_trees))
    else:
        outfile_stats = E.openOutputFile("stats")
        outfile_stats.write("section\t%s\n" % Stats.Summary().getHeader())
        outfile_stats.write("overlaps\t%s\n" % str(Stats.Summary(overlaps)))
        outfile_stats.write("domain_coverage\t%s\n" %
                            str(Stats.Summary(cov_doms)))
        outfile_stats.write("reference_coverage\t%s\n" %
                            str(Stats.Summary(cov_refs)))
        outfile_stats.close()

        outfile = E.openOutputFile("overlaps.histogram")
        outfile.write("bin\tcounts\n")
        Histogram.Write(
            outfile,
            Histogram.Calculate(overlaps,
                                min_value=0,
                                increment=1,
                                no_empty_bins=True))
        outfile.close()

        cumulative_header = (
            "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n"
        )

        outfile = E.openOutputFile("domain_coverage.histogram")
        outfile.write(cumulative_header)
        Histogram.Write(
            outfile,
            Histogram.AddRelativeAndCumulativeDistributions(
                Histogram.Calculate(cov_doms,
                                    min_value=0,
                                    increment=options.bin_size,
                                    no_empty_bins=True)))
        outfile.close()

        outfile = E.openOutputFile("reference_coverage.histogram")
        outfile.write(cumulative_header)
        Histogram.Write(
            outfile,
            Histogram.AddRelativeAndCumulativeDistributions(
                Histogram.Calculate(cov_refs,
                                    min_value=0,
                                    increment=options.bin_size,
                                    no_empty_bins=True)))
        outfile.close()

    E.Stop()