예제 #1
0
    def startUp(self):
        """Create the alignment helpers, caches, counters and output stream.

        Returns immediately when ``self.isComplete()`` is true.  Otherwise
        the method, in this order:

        * installs a Protein20 encoder and newly created log-oddor,
          regularizor and weightor objects on alignlib's default toolkit,
        * opens the profile library read-only when
          ``self.mUsePrebuiltProfiles`` is set, or else opens the indexed
          neighbourhood graph,
        * selects ``self.checkLinkZScore`` as checker and defines the
          output column names,
        * builds the alignator (``makeAlignatorDPFull`` in
          ``ALIGNMENT_LOCAL`` mode) from ``self.mGop`` and ``self.mGep``,
        * resets the alignandum cache and the result counters,
        * opens the alignment output stream and writes the header line
          unless ``self.mContinueAt`` is set,
        * stores the start time in ``self.mStartTime``.
        """
        if self.isComplete():
            return

        ###############################################
        # create objects for algorithm
        # NOTE: the encoder is installed before the helper objects are built;
        # keep this order (the helpers may depend on it -- not visible here).
        alignlib.getDefaultToolkit().setEncoder(
            alignlib.getEncoder(alignlib.Protein20))
        self.mLogOddor = alignlib.makeLogOddorDirichlet(self.mScaleFactor)
        self.mRegularizor = alignlib.makeRegularizorDirichletPrecomputed()
        self.mWeightor = alignlib.makeWeightor()

        alignlib.getDefaultToolkit().setRegularizor(self.mRegularizor)
        alignlib.getDefaultToolkit().setLogOddor(self.mLogOddor)
        alignlib.getDefaultToolkit().setWeightor(self.mWeightor)

        if self.mUsePrebuiltProfiles:
            # profiles are read from disk and share the helper objects
            # created above
            self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
                self.mFilenameProfiles, "r")
            self.mProfileLibrary.setWeightor(self.mWeightor)
            self.mProfileLibrary.setLogOddor(self.mLogOddor)
            self.mProfileLibrary.setRegularizor(self.mRegularizor)
        else:
            # no prebuilt profiles: keep the neighbourhood index at hand
            self.mProfileLibrary = None
            self.mIndexedNeighbours = cadda.IndexedNeighbours(
                self.mFilenameGraph, self.mFilenameIndex)

        self.mChecker = self.checkLinkZScore

        # column names of the alignment output table
        self.mHeader = ("qdomain",
                        "sdomain",
                        "weight",
                        "passed",
                        "qstart",
                        "qend",
                        "qali",
                        "sstart",
                        "send",
                        "sali",
                        "score",
                        "naligned",
                        "ngaps",
                        "zscore")

        self.mAlignator = alignlib.makeAlignatorDPFull(
            alignlib.ALIGNMENT_LOCAL,
            self.mGop,
            self.mGep)

        # the cache to store alignandum objects
        self.mCache = {}

        # the encoder is additionally registered through the module-level
        # setter, exactly as before
        alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.Protein20))

        # initialize counters
        self.mNPassed, self.mNFailed, self.mNNotFound = 0, 0, 0

        self.mOutfile = self.openOutputStream(self.mFilenameAlignments)

        # only a fresh run gets a header line; a continued run does not
        if self.mContinueAt is None:
            self.mOutfile.write("\t".join(self.mHeader) + "\n")
            self.mOutfile.flush()

        self.mStartTime = time.time()
예제 #2
0
    def apply(self, argv):
        """Run one module over one chunk of the sequence identifiers (nids).

        ``argv`` is the tuple ``(filename, chunk, nchunks, options, module,
        config, kwargs)``.  ``module`` is called to build the worker object,
        ``chunk`` is the zero-based index of the slice of sorted nids to
        process and ``nchunks`` the total number of slices.  ``options.test``,
        when true, limits the number of nids processed.

        The method returns ``None``.  It returns early, without doing any
        work, when the module reports that it is already complete.  A chunk
        whose slice of nids is empty is still started and finished, but
        processes nothing.
        """
        (filename, chunk, nchunks, options, module, config, kwargs) = argv

        L.info("chunk %i: setting up" % (chunk))

        if self.mLoadMapNid2Domains and self.mMapNid2Domains is None:
            # load all maps that were not inherited from the parent process
            L.info("opening map_nid2domains from cache")
            self.mMapNid2Domains = shelve.open(
                config.get("files", "storage_domains", "memory"), "r")

        # build the modules: a minimal instance first, to test whether the
        # whole job has already been completed
        if module(config=config, fasta=self.mFasta).isComplete():
            L.info("chunk %i is complete" % (chunk))
            return

        module = module(config=config,
                        num_chunks=nchunks,
                        chunk=chunk,
                        fasta=self.mFasta,
                        map_id2nid=self.mMapId2Nid,
                        map_nid2domains=self.mMapNid2Domains,
                        **kwargs)

        if module.isComplete():
            L.info("chunk %i is complete" % (chunk, ))
            return

        L.info("chunk %i: starting" % (chunk, ))
        module.startUp()

        # find out nids to work with: sort all nids and take this chunk's
        # contiguous slice
        nids = sorted(map(int, self.mFasta.keys()))
        increment = int(math.ceil(len(nids) / float(nchunks)))
        start = chunk * increment
        nids = nids[start:start + increment]
        num_nids = len(nids)

        # Rounding the chunk size up can leave trailing chunks without any
        # nids; indexing nids[0] would then raise an IndexError.
        if nids:
            L.info("chunk %i: starting work on %i nids from %s to %s" %
                   (chunk, num_nids, str(nids[0]), str(nids[-1])))
        else:
            L.info("chunk %i: no nids assigned to this chunk" % (chunk, ))

        index = cadda.IndexedNeighbours(self.mFilenameGraph,
                                        self.mFilenameIndex)

        iteration = 0
        for nid in nids:
            iteration += 1
            neighbours = index.getNeighbours(nid)

            L.info(
                "chunk %i: started nid=%s, neighbours=%i, progress=%i/%i (%5.1f%%)"
                % (chunk, str(nid), len(neighbours), iteration, num_nids,
                   100.0 * iteration / num_nids))

            # nids without neighbours are skipped
            if neighbours:
                module.run(AddaIO.NeighboursRecord(nid, neighbours))

            L.info(
                "chunk %i: finished nid=%s, neighbours=%i, progress=%i/%i (%5.1f%%)"
                % (chunk, str(nid), len(neighbours), iteration, num_nids,
                   100.0 * iteration / num_nids))

            # in test mode stop after options.test nids
            if options.test and iteration >= options.test:
                break

        L.info("chunk %i: finished" % (chunk, ))

        module.finish()

        L.info("chunk %i: finished  %i nids" % (chunk, num_nids))
예제 #3
0
def main():
    """Command line tool to inspect neighbourhoods, profiles and alignments.

    The ``--method`` option selects what is done with the positional
    arguments:

    view
        write the neighbours of every nid given on the command line.
    pileup
        build the multiple alignment of the first argument's neighbourhood
        and write it out.
    profile
        build a profile from that multiple alignment and write it out; if
        the argument carries a segment, the profile is restricted to it.
    align
        build profiles for the first two arguments (both must be in the
        form understood by ``AddaIO.toTuple``) and align them with the
        gap penalties ``--gop`` / ``--gep`` in ``--mode`` local or global.

    For "pileup" and "profile" an argument containing "_" is parsed with
    ``AddaIO.toTuple`` into ``(nid, start, end)``; any other argument is
    taken as a plain integer nid.

    Raises ValueError when the selected method is given fewer positional
    arguments than it needs.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode",
                      dest="mode",
                      type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop",
                      dest="gop",
                      type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep",
                      dest="gep",
                      type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    # fail early with a readable message instead of an IndexError later on
    required = {"pileup": 1, "profile": 1, "align": 2}.get(options.method, 0)
    if len(args) < required:
        raise ValueError(
            "method '%s' requires at least %i positional argument(s), got %i"
            % (options.method, required, len(args)))

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(
        alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parseNid(arg):
        """Return (nid, start, end); start/end are None without a segment."""
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    if options.method == "view":
        # each argument is looked up on its own (previously args[0] was
        # re-read on every iteration)
        for arg in args:
            for n in index.getNeighbours(int(arg)):
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":

        # a segment, if given, is parsed but not applied to the pileup
        nid, start, end = _parseNid(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        nid, start, end = _parseNid(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop,
                                                 options.gep)

        def _buildProfile(nid, start, end):
            """Return (sequence, profile) for the given segment of nid."""
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %
            (nid1, nid2,
             result.getScore(),
             result.getLength(),
             result.getNumGaps(),
             result.getRowFrom(), result.getRowTo(),
             result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
예제 #4
0
    # Parse the command line; the positional arguments are the nids to test.
    # (The enclosing function header and the parser set-up are not part of
    # this excerpt.)
    (options, args) = E.Start(parser)

    # at least one nid is required
    if len(args) == 0:
        raise ValueError("please supply one or more nids to test.")

    # read the configuration file named in the options ("~" is expanded)
    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    # Input file names come from the [files] section of the configuration.
    # The third argument is presumably the fallback used when the key is
    # missing -- TODO confirm against AddaIO.ConfigParser.get.
    filename_graph = config.get("files", "output_graph", "adda.graph")
    filename_index = config.get("files", "output_index", "adda.graph.index")
    filename_fasta = config.get("files", "output_fasta", "adda")

    fasta = IndexedFasta.IndexedFasta(filename_fasta)

    index = cadda.IndexedNeighbours(filename_graph, filename_index)

    # Override the segment output file name for this run (presumably so
    # that a test run does not overwrite regular output -- verify).
    config.set("files", "output_segments", "test.segments")

    module = AddaSegment(
        config=config,
        fasta=fasta,
    )

    module.startUp()

    # the nids arrive as strings from the command line
    args = map(int, args)

    # look up the neighbours of every nid and pass them to the module
    for nid in args:
        neighbours = index.getNeighbours(nid)
        module.applyMethod(AddaIO.NeighboursRecord(nid, neighbours))
예제 #5
0
    def applyMethod(self):
        """Compute and plot sequence-length and neighbourhood-size statistics.

        Does nothing when ``self.isComplete()`` is true.  Otherwise two
        tab-separated tables are written:

        * ``self.mFilenameStats``: one row each for sequence lengths and
          neighbourhood sizes with the columns category, counts, mean and
          median;
        * ``self.mFilenameStatsSequences``: one row per nid with its length
          and its number of neighbours.

        Histograms of both distributions are plotted through
        ``AddaPlot.plotHistogram`` to ``<mFilenameStats>_lengths.png`` and
        ``<mFilenameStats>_neighbours.png``.

        Both output streams are closed even if an error occurs part-way
        through.
        """
        if self.isComplete():
            return

        self.info("counting sequence lengths")

        # unit-width bin edges shared by both histograms; values beyond the
        # last edge are not counted by numpy.histogram
        histogram_bins = numpy.arange(0, 40000, 1)

        outfile = self.openOutputStream(self.mFilenameStats)
        outfile_nids = None
        try:
            outfile.write("category\tcounts\tmean\tmedian\n")

            outfile_nids = self.openOutputStream(self.mFilenameStatsSequences)
            outfile_nids.write("nid\tlength\tneighbours\n")

            # plot length distribution
            lengths = self.mFasta.getContigSizes()
            # materialise once: numpy needs a sequence, not a dict view
            length_values = list(lengths.values())
            hist, edges = numpy.histogram(length_values, bins=histogram_bins)

            AddaPlot.plotHistogram(edges[:-1], hist,
                                   title="distribution of sequence lengths",
                                   filename=self.mFilenameStats + "_lengths.png",
                                   xlabel="length",
                                   ylabel="frequency",
                                   logscale="xy")

            outfile.write("%s\t%i\t%f\t%f\n" % (
                "lengths",
                len(lengths),
                numpy.mean(length_values),
                numpy.median(length_values)))

            self.info("counting neighbourhoods")

            # do neighbour distribution
            index = cadda.IndexedNeighbours(self.mFilenameGraph,
                                            self.mFilenameIndex)

            neighbours = []
            for nid in self.mFasta.keys():
                n = len(index.getNeighbours(nid))
                neighbours.append(n)
                outfile_nids.write("%i\t%i\t%i\n" % (nid, lengths[nid], n))

            hist, edges = numpy.histogram(neighbours, bins=histogram_bins)

            AddaPlot.plotHistogram(edges[:-1], hist,
                                   title="distribution of neighbourhood sizes",
                                   filename=self.mFilenameStats + "_neighbours.png",
                                   xlabel="neighbours",
                                   ylabel="frequency",
                                   logscale="xy")

            outfile.write("%s\t%i\t%f\t%f\n" % (
                "neighours",
                len(neighbours),
                numpy.mean(neighbours),
                numpy.median(neighbours)))
        finally:
            # release the streams whether or not the statistics succeeded
            outfile.close()
            if outfile_nids is not None:
                outfile_nids.close()