Exemple #1
0
def main():
    '''Command line entry point: inspect neighbours and profiles of an ADDA graph.

    ``--method`` selects what is done with the positional arguments:

    * ``view``    -- write the neighbours of every nid given.
    * ``pileup``  -- write the multiple alignment built from the
      neighbours of the first argument.
    * ``profile`` -- write the profile built from that multiple
      alignment. If the argument contains an underscore it is parsed
      with ``AddaIO.toTuple`` and the profile is restricted to that
      segment.
    * ``align``   -- align the profiles of the two domains given as
      first and second argument and write score, coordinates and the
      alignment itself.

    All results are written to ``options.stdout``.

    Raises:
        ValueError: if fewer positional arguments are given than the
            selected method needs.
    '''

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option(
        "--method",
        dest="method",
        type="choice",
        choices=("view", "align", "pileup", "profile"),
        help="method to perform [default=%default].",
    )

    parser.add_option(
        "--mode",
        dest="mode",
        type="choice",
        choices=("global", "local"),
        help="alignment mode [default=%default].",
    )

    parser.add_option("--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    # Fail early with a readable message instead of an IndexError once
    # the (potentially slow) index and sequence database have been opened.
    required = {"pileup": 1, "profile": 1, "align": 2}.get(options.method, 0)
    if len(args) < required:
        raise ValueError(
            "method `%s` requires %i argument(s), but %i given"
            % (options.method, required, len(args)))

    def _parseTarget(arg):
        '''return (nid, start, end) for a command line argument.

        An argument containing "_" is handed to AddaIO.toTuple; anything
        else is taken as a bare nid without a segment.
        '''
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph, options.filename_index)

    toolkit = alignlib.getDefaultToolkit
    toolkit().setEncoder(alignlib.getEncoder(alignlib.Protein20))
    toolkit().setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    toolkit().setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    toolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    if options.method == "view":
        for arg in args:
            # Convert the argument of *this* iteration; previously
            # args[0] was re-read, so only the first nid was ever shown.
            neighbours = index.getNeighbours(int(arg))

            for n in neighbours:
                # written to options.stdout like all other methods
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":

        # the segment is parsed for validation, but the pileup always
        # covers the whole sequence
        nid, _start, _end = _parseTarget(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        nid, start, end = _parseTarget(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        def _buildProfile(nid, start, end):
            '''return (sequence, profile) of nid, profile restricted to start..end.'''
            neighbours = index.getNeighbours(nid)
            # re-uses the AddaProfiles instance created above instead of
            # constructing a second, identically configured one
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n"
            % (
                nid1,
                nid2,
                result.getScore(),
                result.getLength(),
                result.getNumGaps(),
                result.getRowFrom(),
                result.getRowTo(),
                result.getColFrom(),
                result.getColTo(),
            )
        )

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
def annotateAlignmentGraph(infile, outfiles):
    '''input the alignment graph and output a translated version of
    it, adding reference domain information.

    infile
        filename of the tested links. It is read through gzip if the
        name ends in ``.gz``.
    outfiles
        pair ``(outfile, outfile_stats)``. ``outfile`` is written
        gzip-compressed with one row per link: the link fields with
        both domain identifiers translated through the nid map,
        followed by the benchmark families and benchmark domains
        overlapping the query and the sbjct domain. ``outfile_stats``
        receives the running tp/fp/fn/tn/unknown counts, one row each
        time the rounded link weight changes plus one closing row.

    The benchmark domains and the nid map are read from the files
    named by ``PARAMS["eval_filename_benchmark_domains"]`` and
    ``PARAMS["eval_filename_adda_nids"]``.

    Ratios with an empty denominator are reported as 0.0. If the input
    contains no links, no statistics row is written.
    '''

    outfile, outfile_stats = outfiles

    # collect benchmark domains
    E.info("reading benchmark domains")
    inf_domains = gzip.open(PARAMS["eval_filename_benchmark_domains"])
    try:
        benchmark_domains = AddaIO.readMapNid2Domains(inf_domains)
    finally:
        inf_domains.close()

    totuple = AddaIO.toTuple
    toDomain = AddaIO.toDomain

    # build map of id to nid
    # NOTE(review): the map is named nid2pid but is filled by
    # readMapPid2Nid - confirm the direction of the mapping.
    E.info("reading map between pid and nid")
    inf_nids = open(PARAMS["eval_filename_adda_nids"], "r")
    try:
        map_nid2pid = AddaIO.readMapPid2Nid(inf_nids)
    finally:
        inf_nids.close()

    def getOverlappingDomains(pid, start, end):
        '''get domains overlapping pid:start..end'''
        if pid not in benchmark_domains:
            return ()
        # greedy overlap testing
        r = []
        for family, domains in benchmark_domains[pid].items():
            for other_start, other_end in domains:
                if start >= other_end or end <= other_start:
                    continue
                r.append((family, other_start, other_end))
        return r

    def ratio(numerator, denominator):
        '''return numerator / denominator, 0.0 if the denominator is 0.'''
        if denominator == 0:
            return 0.0
        return float(numerator) / denominator

    def writeStats(handle, weight, n, tp, fp, fn, tn, uk):
        '''write one row of running counts for weight.'''
        handle.write("\t".join(map(str, (
            weight, n, tp, fp, fn, tn, uk,
            ratio(tp, tp + fp),
            ratio(fn, fn + tn)))) + "\n")

    counts = E.Counter()

    if infile.endswith(".gz"):
        inf = gzip.open(infile)
    else:
        inf = open(infile)

    outf, outf_stats = None, None

    try:
        outf = gzip.open(outfile, "w")

        outf.write("%s\n" % "\t".join(
            ("passed", "qdomain", "sdomain", "weight", "qstart", "qend",
             "qali", "sstart", "send", "sali", "score", "naligned", "ngaps",
             "zscore", "rfamilies", "sfamilies", "rdomains", "sdomains")))

        # counts for true positives, false positives and unknown
        n, tp, fp, fn, tn, uk = 0, 0, 0, 0, 0, 0

        outf_stats = open(outfile_stats, "w")
        # NOTE(review): the header names ten columns, but each row has
        # nine values - nothing is written for "proportion". Left
        # unchanged because the intended content is not visible here.
        outf_stats.write(
            "weight\tn\tproportion\ttp\tfp\tfn\ttn\tuk\ttpr\tfnr\n")
        last_weight = None

        for link in AddaIO.iterate_tested_links(inf):
            qnid, qstart, qend = totuple(link.qdomain)
            snid, sstart, send = totuple(link.sdomain)
            qpid = map_nid2pid[qnid]
            spid = map_nid2pid[snid]
            qfamily = sorted(getOverlappingDomains(qpid, qstart, qend))
            sfamily = sorted(getOverlappingDomains(spid, sstart, send))
            qf = set([x[0] for x in qfamily])
            sf = set([x[0] for x in sfamily])

            passed = link.passed == "+"
            n += 1

            if not qfamily and not sfamily:
                # no reference annotation on either side
                uk += 1
            elif qf.intersection(sf):
                if passed:
                    tp += 1
                else:
                    fn += 1
            else:
                if passed:
                    fp += 1
                else:
                    tn += 1

            weight = round(float(link.weight))
            if weight != last_weight:
                # NOTE(review): the row for last_weight already includes
                # the current link, i.e. the first link of the next
                # weight - confirm that this is intended.
                if last_weight is not None:
                    writeStats(outf_stats, last_weight,
                               n, tp, fp, fn, tn, uk)
                last_weight = weight

            if passed:
                counts.passed += 1
            else:
                counts.failed += 1

            link = link._replace(qdomain=toDomain((qpid, qstart, qend)),
                                 sdomain=toDomain((spid, sstart, send)))

            outf.write("%s\t%s\t%s\t%s\t%s\n" %
                       ("\t".join(map(str, link)),
                        ",".join(sorted(qf)),
                        ",".join(sorted(sf)),
                        ",".join("%s_%i_%i" % x for x in qfamily),
                        ",".join("%s_%i_%i" % x for x in sfamily)))

        # closing row; skipped if not a single link was read
        if last_weight is not None:
            writeStats(outf_stats, last_weight, n, tp, fp, fn, tn, uk)
    finally:
        # the gzip stream in particular must be closed explicitly so
        # that its trailer is written
        for handle in (inf, outf, outf_stats):
            if handle is not None:
                handle.close()

    E.info("%s" % str(counts))
Exemple #3
0
def main(argv=None):
    '''Command line entry point: operate on an alignment graph read from stdin.

    ``--method`` selects the operation:

    * ``translate``     -- replace the nid in the second and third
      column of each row using the map in ``--filename-map``.
    * ``add-family``    -- append the family of the query and sbjct
      domain of each link, looked up in ``--filename-families``
      (``na`` if a domain is not listed).
    * ``shortest-path`` -- write the edges on the shortest path between
      ``--node1`` and ``--node2``.
    * ``components``    -- write the connected components of the graph.

    argv
        command line arguments, defaults to ``sys.argv``.

    Raises:
        ValueError: if an option required by the selected method is
            missing.
    '''

    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    # dest must be a valid attribute name: with "graph-format" the
    # option never reached options.graph_format, which is what is read
    # below.
    parser.add_option("-o", "--format", dest="graph_format", type="choice",
                      choices=("alignments",),
                      help="graph format [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("shortest-path", "translate",
                               "components", "add-family"),
                      help="methods to apply [default=%default].")

    parser.add_option("-a", "--filename-map", dest="filename_map",
                      type="string",
                      help="filename mapping ids to nids (used for translation) [default=%default].")

    parser.add_option("-1", "--node1", dest="node1", type="string",
                      help="first node for path calculation [default=%default].")

    parser.add_option("-2", "--node2", dest="node2", type="string",
                      help="second node for path calculation [default=%default].")

    parser.add_option("-f", "--filename-families", dest="filename_families",
                      type="string",
                      help="filename with domain families [default=%default].")

    parser.set_defaults(
        method=None,
        graph_format="alignments",
        filename_map=None,
        node1=None,
        node2=None,
        filename_families=None,
    )

    (options, args) = E.Start(parser,
                              argv=argv)

    map_domain2family = {}
    if options.filename_families is not None:
        E.info("reading families from %s" % options.filename_families)
        with open(options.filename_families, "r") as inf:
            for line in inf:
                # skip comments, the header and blank lines
                if line.startswith(("#", "nid")) or not line.strip():
                    continue
                # rstrip instead of [:-1]: do not eat the last character
                # of a final line without trailing newline
                nid, start, end, family = line.rstrip("\n").split("\t")
                map_domain2family["%s_%s_%s" % (nid, start, end)] = family
        E.info("read %i domains" % len(map_domain2family))

    if options.method == "translate":

        if not options.filename_map:
            raise ValueError(
                "method `translate` requires --filename-map")

        E.info("reading map from %s" % options.filename_map)
        with open(options.filename_map, "r") as inf:
            map_id2nid = AddaIO.readMapId2Nid(inf)
        map_nid2id = dict([(v, k) for k, v in map_id2nid.items()])

        def translate_alignments(line):
            '''return line with the nids in column 2 and 3 translated.'''
            if line.startswith("passed"):
                return line
            data = line.split("\t")

            for column in (1, 2):
                parts = data[column].split("_")
                try:
                    data[column] = "%s_%s_%s" % (
                        map_nid2id[int(parts[0])], parts[1], parts[2])
                except KeyError:
                    sys.stderr.write("could not map: %s\n" % str(parts))
                    raise

            return "\t".join(data)

        if options.graph_format == "alignments":
            translator = translate_alignments

        for line in options.stdin:
            if not line.startswith("#"):
                line = translator(line)
            options.stdout.write(line)

        E.Stop()
        return

    elif options.method == "add-family":

        if options.filename_families is None:
            raise ValueError(
                "method `add-family` requires --filename-families")

        options.stdout.write(
            "%s\tqfamily\tsfamily\n" %
            ("\t".join(AddaIO.TestedLink._fields)))
        for link in AddaIO.iterate_tested_links(options.stdin):
            qfamily = map_domain2family.get(link.qdomain, "na")
            sfamily = map_domain2family.get(link.sdomain, "na")
            options.stdout.write("%s\t%s\t%s\n" %
                                 ("\t".join(map(str, link)),
                                  qfamily,
                                  sfamily))
        E.Stop()
        return

    t = time.time()
    if options.graph_format == "alignments":
        map_vertex2id, map_id2vertex, G = readAlignmentGraph(options.stdin)

    E.info("graph read in %i seconds" % (time.time() - t))
    t = time.time()

    if options.method == "shortest-path":

        if options.node1 is None or options.node2 is None:
            raise ValueError(
                "method `shortest-path` requires --node1 and --node2")

        source = map_vertex2id[options.node1]
        target = map_vertex2id[options.node2]

        E.debug("shortest path between %s:%i and %s:%i" %
                (options.node1, source, options.node2, target))

        paths = G.get_shortest_paths(source, to=(target,))

        # One path is returned per vertex listed in `to`, so the path
        # to the single target is the first entry - it is not indexed
        # by vertex id.
        p = paths[0]

        if len(p) == 0:
            E.info("no path between %s:%i and %s:%i" %
                   (options.node1, source, options.node2, target))
        else:
            previous, last_node = p[0], map_id2vertex[p[0]]

            for x in p[1:]:
                node = map_id2vertex[x]
                ei = G.get_eid(x, previous)

                options.stdout.write("%s\t%s\t%s\n" %
                                     (last_node, node,
                                      G.es[ei]["info"]))
                previous, last_node = x, node

    elif options.method == "components":
        options.stdout.write("component\tnode\n")
        for component_id, component in enumerate(nx.connected_components(G)):
            for c in component:
                options.stdout.write("%i\t%s\n" % (component_id, c))

    E.info("%s: %i seconds" % (options.method, time.time() - t))
    E.Stop()
Exemple #4
0
def main():
    '''Command line entry point: translate nids in a tab-separated table.

    The table is read from ``options.stdin`` and written to
    ``options.stdout``. Comment lines (starting with ``#``) and - unless
    ``--no-header`` is given - the first other line are copied
    unchanged. In every remaining row the columns selected with
    ``--column`` (1-based, default: first column) are looked up in the
    map read from ``--nids``. With ``--is-domains`` the column holds a
    domain identifier, of which only the first component is translated.

    Rows with a missing column or a value that can not be parsed or
    mapped are reported with a warning and dropped from the output.
    '''

    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-n", "--nids", dest="filename_nids", type="string",
                      help="filename with nids[default=%default].")

    parser.add_option("-c", "--column", dest="columns", type="int",
                      action="append",
                      help="columns with nids to translate (1-based) [default=%default].")

    parser.add_option("-d", "--is-domains", dest="is_domains",
                      action="store_true",
                      help="translate domain ids [default=%default].")

    parser.add_option("-i", "--invert", dest="invert", action="store_true",
                      help="invert mapping [default=%default].")

    parser.add_option("-e", "--no-header", dest="no_header",
                      action="store_true",
                      help="file has no header [default=%default].")

    parser.set_defaults(
        filename_nids="adda.nids",
        columns=[],
        is_domains=False,
        invert=False,
        # was misspelt `noheader`, which left options.no_header at None
        no_header=False,
    )

    (options, args) = E.Start(parser)

    # NOTE(review): the map is named nid2pid but is filled by
    # readMapPid2Nid - confirm the direction of the mapping.
    with open(options.filename_nids, "r") as inf:
        map_nid2pid = AddaIO.readMapPid2Nid(inf)
    if options.invert:
        E.info("inverting mapping")
        map_nid2pid = dict(
            [(int(v), str(k)) for k, v in map_nid2pid.items()])

    if not options.columns:
        options.columns = [1]
    columns = [x - 1 for x in options.columns]

    toTuple, toDomain = AddaIO.toTuple, AddaIO.toDomain
    first = not options.no_header
    is_domains = options.is_domains
    ninput, noutput, nskipped = 0, 0, 0
    for line in options.stdin:
        if line.startswith("#"):
            options.stdout.write(line)
            continue

        if first:
            options.stdout.write(line)
            first = False
            continue

        ninput += 1

        # rstrip instead of [:-1]: do not eat the last character of a
        # final line without trailing newline
        data = line.rstrip("\n").split("\t")
        for x in columns:
            # Check the column first: the warnings below format data[x]
            # and would fail themselves on a short row.
            try:
                value = data[x]
            except IndexError:
                E.warn("column %i missing in line `%s`" %
                       (x + 1, "\t".join(data)))
                nskipped += 1
                break

            if is_domains:
                try:
                    d = toTuple(value)
                except ValueError:
                    E.warn("could not parse domain `%s`" % value)
                    nskipped += 1
                    break

                try:
                    data[x] = toDomain((str(map_nid2pid[d[0]]), d[1], d[2]))
                except (IndexError, KeyError):
                    E.warn("could not map domain `%s`" % value)
                    nskipped += 1
                    break
            else:
                try:
                    data[x] = str(map_nid2pid[int(value)])
                except (IndexError, KeyError, ValueError):
                    # KeyError: nid not in the map,
                    # ValueError: field is not an integer
                    E.warn("could not map nid `%s`" % value)
                    nskipped += 1
                    break
        else:
            # only reached if no column caused the row to be skipped
            options.stdout.write("%s\n" % "\t".join(data))
            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))
    E.Stop()
Exemple #5
0
def annotateAlignmentGraph( infile, outfiles ):
    '''input the alignment graph and output a translated version of
    it, adding reference domain information.

    infile
        filename of the tested links. It is read through gzip if the
        name ends in ``.gz``.
    outfiles
        pair ``(outfile, outfile_stats)``. ``outfile`` is written
        gzip-compressed with one row per link: the link fields with
        both domain identifiers translated through the nid map,
        followed by the benchmark families and benchmark domains
        overlapping the query and the sbjct domain. ``outfile_stats``
        receives the running tp/fp/fn/tn/unknown counts, one row each
        time the rounded link weight changes plus one closing row.

    The benchmark domains and the nid map are read from the files
    named by ``PARAMS["eval_filename_benchmark_domains"]`` and
    ``PARAMS["eval_filename_adda_nids"]``.

    Ratios with an empty denominator are reported as 0.0. If the input
    contains no links, no statistics row is written.
    '''

    outfile, outfile_stats = outfiles

    # collect benchmark domains
    E.info( "reading benchmark domains" )
    inf_domains = gzip.open( PARAMS["eval_filename_benchmark_domains"] )
    try:
        benchmark_domains = AddaIO.readMapNid2Domains( inf_domains )
    finally:
        inf_domains.close()

    totuple = AddaIO.toTuple
    toDomain = AddaIO.toDomain

    # build map of id to nid
    # NOTE(review): the map is named nid2pid but is filled by
    # readMapPid2Nid - confirm the direction of the mapping.
    E.info( "reading map between pid and nid" )
    inf_nids = open( PARAMS["eval_filename_adda_nids"], "r" )
    try:
        map_nid2pid = AddaIO.readMapPid2Nid( inf_nids )
    finally:
        inf_nids.close()

    def getOverlappingDomains( pid, start, end ):
        '''get domains overlapping pid:start..end'''
        if pid not in benchmark_domains:
            return ()
        # greedy overlap testing
        r = []
        for family, domains in benchmark_domains[pid].items():
            for other_start, other_end in domains:
                if start >= other_end or end <= other_start:
                    continue
                r.append( (family, other_start, other_end) )
        return r

    def ratio( numerator, denominator ):
        '''return numerator / denominator, 0.0 if the denominator is 0.'''
        if denominator == 0:
            return 0.0
        return float(numerator) / denominator

    def writeStats( handle, weight, n, tp, fp, fn, tn, uk ):
        '''write one row of running counts for weight.'''
        handle.write( "\t".join( map( str, (
            weight, n, tp, fp, fn, tn, uk,
            ratio( tp, tp + fp ),
            ratio( fn, fn + tn ) ) ) ) + "\n" )

    counts = E.Counter()

    if infile.endswith(".gz"):
        inf = gzip.open( infile )
    else:
        inf = open( infile )

    outf, outf_stats = None, None

    try:
        outf = gzip.open( outfile, "w" )

        outf.write( "%s\n" % "\t".join(
            ( "passed", "qdomain", "sdomain", "weight", "qstart", "qend",
              "qali", "sstart", "send", "sali", "score", "naligned",
              "ngaps", "zscore", "rfamilies", "sfamilies", "rdomains",
              "sdomains" ) ) )

        # counts for true positives, false positives and unknown
        n, tp, fp, fn, tn, uk = 0, 0, 0, 0, 0, 0

        outf_stats = open( outfile_stats, "w" )
        # NOTE(review): the header names ten columns, but each row has
        # nine values - nothing is written for "proportion". Left
        # unchanged because the intended content is not visible here.
        outf_stats.write(
            "weight\tn\tproportion\ttp\tfp\tfn\ttn\tuk\ttpr\tfnr\n" )
        last_weight = None

        for link in AddaIO.iterate_tested_links( inf ):
            qnid, qstart, qend = totuple( link.qdomain )
            snid, sstart, send = totuple( link.sdomain )
            qpid = map_nid2pid[qnid]
            spid = map_nid2pid[snid]
            qfamily = sorted( getOverlappingDomains( qpid, qstart, qend ) )
            sfamily = sorted( getOverlappingDomains( spid, sstart, send ) )
            qf = set( [x[0] for x in qfamily] )
            sf = set( [x[0] for x in sfamily] )

            passed = link.passed == "+"
            n += 1

            if not qfamily and not sfamily:
                # no reference annotation on either side
                uk += 1
            elif qf.intersection( sf ):
                if passed:
                    tp += 1
                else:
                    fn += 1
            else:
                if passed:
                    fp += 1
                else:
                    tn += 1

            weight = round( float( link.weight ) )
            if weight != last_weight:
                # NOTE(review): the row for last_weight already includes
                # the current link, i.e. the first link of the next
                # weight - confirm that this is intended.
                if last_weight is not None:
                    writeStats( outf_stats, last_weight,
                                n, tp, fp, fn, tn, uk )
                last_weight = weight

            if passed:
                counts.passed += 1
            else:
                counts.failed += 1

            link = link._replace( qdomain=toDomain( (qpid, qstart, qend) ),
                                  sdomain=toDomain( (spid, sstart, send) ) )

            outf.write( "%s\t%s\t%s\t%s\t%s\n" %
                        ( "\t".join( map( str, link ) ),
                          ",".join( sorted( qf ) ),
                          ",".join( sorted( sf ) ),
                          ",".join( "%s_%i_%i" % x for x in qfamily ),
                          ",".join( "%s_%i_%i" % x for x in sfamily ) ) )

        # closing row; skipped if not a single link was read
        if last_weight is not None:
            writeStats( outf_stats, last_weight, n, tp, fp, fn, tn, uk )
    finally:
        # the gzip stream in particular must be closed explicitly so
        # that its trailer is written
        for handle in (inf, outf, outf_stats):
            if handle is not None:
                handle.close()

    E.info( "%s" % str( counts ) )
Exemple #6
0
def main():
    '''Command line entry point: inspect neighbours and profiles of an ADDA graph.

    ``--method`` selects what is done with the positional arguments:

    * ``view``    -- write the neighbours of every nid given.
    * ``pileup``  -- write the multiple alignment built from the
      neighbours of the first argument.
    * ``profile`` -- write the profile built from that multiple
      alignment. If the argument contains an underscore it is parsed
      with ``AddaIO.toTuple`` and the profile is restricted to that
      segment.
    * ``align``   -- align the profiles of the two domains given as
      first and second argument and write score, coordinates and the
      alignment itself.

    All results are written to ``options.stdout``.

    Raises:
        ValueError: if fewer positional arguments are given than the
            selected method needs.
    '''

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode",
                      dest="mode",
                      type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop",
                      dest="gop",
                      type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep",
                      dest="gep",
                      type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    # Fail early with a readable message instead of an IndexError once
    # the (potentially slow) index and sequence database have been opened.
    required = {"pileup": 1, "profile": 1, "align": 2}.get(options.method, 0)
    if len(args) < required:
        raise ValueError(
            "method `%s` requires %i argument(s), but %i given"
            % (options.method, required, len(args)))

    def _parseTarget(arg):
        '''return (nid, start, end) for a command line argument.

        An argument containing "_" is handed to AddaIO.toTuple; anything
        else is taken as a bare nid without a segment.
        '''
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    toolkit = alignlib.getDefaultToolkit
    toolkit().setEncoder(alignlib.getEncoder(alignlib.Protein20))
    toolkit().setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    toolkit().setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    toolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    if options.method == "view":
        for arg in args:
            # Convert the argument of *this* iteration; previously
            # args[0] was re-read, so only the first nid was ever shown.
            neighbours = index.getNeighbours(int(arg))

            for n in neighbours:
                # written to options.stdout like all other methods
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":

        # the segment is parsed for validation, but the pileup always
        # covers the whole sequence
        nid, _start, _end = _parseTarget(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        nid, start, end = _parseTarget(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop,
                                                 options.gep)

        def _buildProfile(nid, start, end):
            '''return (sequence, profile) of nid, profile restricted to start..end.'''
            neighbours = index.getNeighbours(nid)
            # re-uses the AddaProfiles instance created above instead of
            # constructing a second, identically configured one
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n"
            % (nid1, nid2,
               result.getScore(),
               result.getLength(),
               result.getNumGaps(),
               result.getRowFrom(), result.getRowTo(),
               result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()