def startUp(self):
    """Create the alignment helper objects and open the output stream.

    Returns immediately if ``self.isComplete()`` is true. Otherwise this

    * registers the Protein20 encoder, a Dirichlet log-oddor (built from
      ``self.mScaleFactor``), a precomputed Dirichlet regularizor and a
      weightor with the alignlib default toolkit,
    * opens the profile library ``self.mFilenameProfiles`` read-only when
      ``self.mUsePrebuiltProfiles`` is set (``self.mProfileLibrary`` is
      ``None`` otherwise),
    * opens the indexed neighbour graph,
    * builds the alignator from ``self.mGop`` and ``self.mGep``,
    * resets the cache and the passed/failed/not-found counters, and
    * opens ``self.mFilenameAlignments``, writing the column header only
      when ``self.mContinueAt`` is ``None``.
    """
    if self.isComplete():
        return

    ###############################################
    # create objects for algorithm
    toolkit = alignlib.getDefaultToolkit()
    toolkit.setEncoder(alignlib.getEncoder(alignlib.Protein20))

    self.mLogOddor = alignlib.makeLogOddorDirichlet(self.mScaleFactor)
    self.mRegularizor = alignlib.makeRegularizorDirichletPrecomputed()
    self.mWeightor = alignlib.makeWeightor()

    toolkit.setRegularizor(self.mRegularizor)
    toolkit.setLogOddor(self.mLogOddor)
    toolkit.setWeightor(self.mWeightor)

    if self.mUsePrebuiltProfiles:
        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
            self.mFilenameProfiles, "r")
        # the library gets the same helper objects as the default toolkit
        self.mProfileLibrary.setWeightor(self.mWeightor)
        self.mProfileLibrary.setLogOddor(self.mLogOddor)
        self.mProfileLibrary.setRegularizor(self.mRegularizor)
    else:
        self.mProfileLibrary = None

    self.mIndexedNeighbours = cadda.IndexedNeighbours(
        self.mFilenameGraph, self.mFilenameIndex)

    self.mChecker = self.checkLinkZScore
    self.mHeader = ("qdomain", "sdomain", "weight", "passed",
                    "qstart", "qend", "qali",
                    "sstart", "send", "sali",
                    "score", "naligned", "ngaps", "zscore")

    self.mAlignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_LOCAL, self.mGop, self.mGep)

    # the cache to store alignandum objects
    self.mCache = {}

    # NOTE(review): the encoder was already set on the default toolkit
    # above; this second, module-level call looks redundant - confirm
    # before removing.
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.Protein20))

    ## initialize counters
    self.mNPassed, self.mNFailed, self.mNNotFound = 0, 0, 0

    self.mOutfile = self.openOutputStream(self.mFilenameAlignments)

    # only write the header for a fresh run, not when continuing one
    if self.mContinueAt is None:
        self.mOutfile.write("\t".join(self.mHeader) + "\n")
        self.mOutfile.flush()

    self.mStartTime = time.time()
def apply(self, argv):
    """Run one chunk of a module over its share of the nids.

    :param argv: 7-tuple ``(filename, chunk, nchunks, options, module,
        config, kwargs)``. ``module`` is a class/factory that is called
        to build the module instance; ``chunk`` is the index of this
        chunk out of ``nchunks``; ``options.test``, if set, stops the
        loop after that many nids.

    Returns ``None``. Nothing is run if the module reports itself as
    complete, either overall or for this chunk.
    """
    (filename, chunk, nchunks, options, module, config, kwargs) = argv

    L.info("chunk %i: setting up" % (chunk))

    if self.mLoadMapNid2Domains and self.mMapNid2Domains is None:
        # load all maps that were not inherited from the parent process
        L.info("opening map_nid2domains from cache")
        self.mMapNid2Domains = shelve.open(
            config.get("files", "storage_domains", "memory"), "r")

    # build the modules: first an instance without chunk information to
    # check overall completion, then the instance for this chunk
    if module(config=config, fasta=self.mFasta).isComplete():
        L.info("chunk %i is complete" % (chunk))
        return

    module = module(config=config,
                    num_chunks=nchunks,
                    chunk=chunk,
                    fasta=self.mFasta,
                    map_id2nid=self.mMapId2Nid,
                    map_nid2domains=self.mMapNid2Domains,
                    **kwargs)

    if module.isComplete():
        L.info("chunk %i is complete" % (chunk, ))
        return

    L.info("chunk %i: starting" % (chunk, ))
    module.startUp()

    # find out nids to work with: sort numerically, then take this
    # chunk's contiguous slice
    nids = sorted(map(int, self.mFasta.keys()))
    increment = int(math.ceil(len(nids) / float(nchunks)))
    start = chunk * increment
    nids = nids[start:start + increment]
    total = len(nids)

    # Rounding the chunk size up can leave trailing chunks without any
    # nids; indexing nids[0] / nids[-1] would then raise IndexError.
    if nids:
        L.info("chunk %i: starting work on %i nids from %s to %s" %
               (chunk, total, str(nids[0]), str(nids[-1])))
    else:
        L.info("chunk %i: no nids assigned to this chunk" % (chunk, ))

    index = cadda.IndexedNeighbours(self.mFilenameGraph,
                                    self.mFilenameIndex)

    def _logProgress(state, nid, nneighbours, iteration):
        # shared formatter for the "started" / "finished" messages
        L.info(
            "chunk %i: %s nid=%s, neighbours=%i, progress=%i/%i (%5.1f%%)" %
            (chunk, state, str(nid), nneighbours, iteration, total,
             100.0 * iteration / total))

    iteration = 0
    for nid in nids:
        iteration += 1
        neighbours = index.getNeighbours(nid)

        _logProgress("started", nid, len(neighbours), iteration)

        # nids without neighbours are not passed to the module
        if neighbours:
            module.run(AddaIO.NeighboursRecord(nid, neighbours))

        _logProgress("finished", nid, len(neighbours), iteration)

        if options.test and iteration >= options.test:
            break

    L.info("chunk %i: finished" % (chunk, ))
    module.finish()

    L.info("chunk %i: finished %i nids" % (chunk, total))
def main():
    """Command-line entry point for inspecting neighbourhoods.

    ``--method`` selects what is done with the positional arguments:

    * ``view``: write the neighbours of every nid given.
    * ``pileup``: write the multiple alignment built for the first
      argument.
    * ``profile``: write the profile built from that multiple alignment,
      restricted to a segment if the argument contains ``_``.
    * ``align``: align the profiles of the first two arguments, which
      are both parsed with ``AddaIO.toTuple``.

    :raises ValueError: if fewer positional arguments are supplied than
        the selected method reads.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=USAGE)

    parser.add_option("--method", dest="method", type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    # Fail with a clear message instead of an IndexError further down.
    # "view" simply iterates over its arguments, so none are required.
    min_args = {"view": 0, "pileup": 1, "profile": 1,
                "align": 2}[options.method]
    if min_args > len(args):
        raise ValueError(
            "method '%s' requires at least %i argument(s), got %i" %
            (options.method, min_args, len(args)))

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    toolkit = alignlib.getDefaultToolkit()
    toolkit.setEncoder(alignlib.getEncoder(alignlib.Protein20))
    toolkit.setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    toolkit.setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    toolkit.setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parseSegment(arg):
        # "nid_start_end"-style arguments go through AddaIO.toTuple;
        # a plain nid means "no segment restriction"
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    if options.method == "view":
        # each argument is a nid of its own (previously args[0] was
        # converted on every iteration)
        for arg in args:
            for n in index.getNeighbours(int(arg)):
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":
        # start/end are parsed but not applied to the pileup
        nid, start, end = _parseSegment(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        nid, start, end = _parseSegment(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        # NOTE(review): rebuilds the profile builder with the same
        # arguments as above; kept as in the original - confirm whether
        # the second construction is needed.
        align = AddaProfiles.AddaProfiles(config, fasta=fasta)

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode,
                                                 options.gop,
                                                 options.gep)

        def _buildProfile(nid, start, end):
            # returns (sequence, prepared profile restricted to start..end)
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %
            (nid1, nid2,
             result.getScore(),
             result.getLength(),
             result.getNumGaps(),
             result.getRowFrom(), result.getRowTo(),
             result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
# Parse the command line; at least one nid is needed to have something to test.
options, args = E.Start(parser)
if not args:
    raise ValueError("please supply one or more nids to test.")

# Input locations come from the configuration file, with fallbacks.
config = AddaIO.ConfigParser()
config_path = os.path.expanduser(options.filename_config)
config.read(config_path)

filename_graph = config.get("files", "output_graph", "adda.graph")
filename_index = config.get("files", "output_index", "adda.graph.index")
filename_fasta = config.get("files", "output_fasta", "adda")

fasta = IndexedFasta.IndexedFasta(filename_fasta)
index = cadda.IndexedNeighbours(filename_graph, filename_index)

# Point the segment output at "test.segments" before building the module.
config.set("files", "output_segments", "test.segments")

module = AddaSegment(config=config, fasta=fasta)
module.startUp()

# Convert every argument up front so a malformed nid fails before any work.
args = [int(nid) for nid in args]

for nid in args:
    neighbours = index.getNeighbours(nid)
    record = AddaIO.NeighboursRecord(nid, neighbours)
    module.applyMethod(record)
def applyMethod(self):
    """Compute and write sequence-length and neighbourhood-size statistics.

    Returns immediately if ``self.isComplete()`` is true. Otherwise writes

    * one summary row (count, mean, median) per category to
      ``self.mFilenameStats``,
    * one row per nid (length and number of neighbours) to
      ``self.mFilenameStatsSequences``, and
    * two histogram plots named ``self.mFilenameStats`` plus
      ``_lengths.png`` / ``_neighbours.png``.

    Both output streams are closed even if plotting or graph access
    raises.
    """
    if self.isComplete():
        return

    self.info("counting sequence lengths")

    # Unit-width bin edges shared by both histograms; numpy.histogram
    # does not count values that fall outside the edges.
    bin_edges = numpy.arange(0, 40000, 1)

    outfile = self.openOutputStream(self.mFilenameStats)
    try:
        outfile.write("category\tcounts\tmean\tmedian\n")

        outfile_nids = self.openOutputStream(self.mFilenameStatsSequences)
        try:
            outfile_nids.write("nid\tlength\tneighbours\n")

            # plot length distribution
            lengths = self.mFasta.getContigSizes()
            # materialise once; reused for the histogram, mean and median
            length_values = list(lengths.values())

            hist, bins = numpy.histogram(length_values, bins=bin_edges)
            AddaPlot.plotHistogram(
                bins[:-1], hist,
                title="distribution of sequence lengths",
                filename=self.mFilenameStats + "_lengths.png",
                xlabel="length",
                ylabel="frequency",
                logscale="xy")

            outfile.write("%s\t%i\t%f\t%f\n" % (
                "lengths",
                len(lengths),
                numpy.mean(length_values),
                numpy.median(length_values)))

            self.info("counting neighbourhoods")

            # do neighbour distribution
            index = cadda.IndexedNeighbours(self.mFilenameGraph,
                                            self.mFilenameIndex)
            neighbours = []
            for nid in self.mFasta.keys():
                n = len(index.getNeighbours(nid))
                neighbours.append(n)
                outfile_nids.write("%i\t%i\t%i\n" % (nid, lengths[nid], n))

            hist, bins = numpy.histogram(neighbours, bins=bin_edges)
            AddaPlot.plotHistogram(
                bins[:-1], hist,
                title="distribution of neighbourhood sizes",
                filename=self.mFilenameStats + "_neighbours.png",
                xlabel="neighbours",
                ylabel="frequency",
                logscale="xy")

            # The category label keeps its historical spelling because it
            # is part of the written output.
            outfile.write("%s\t%i\t%f\t%f\n" % (
                "neighours",
                len(neighbours),
                numpy.mean(neighbours),
                numpy.median(neighbours)))
        finally:
            outfile_nids.close()
    finally:
        outfile.close()