def applyMethod( self, line ): """output the graph.""" # ignore header if line.startswith("passed"): return self.mInput += 1 link = AddaIO.TestedLink._make( line[:-1].split("\t") ) if self.mInput % self.mReportStep == 0: t = time.time() self.info( "iteration=%i, passed=%i, failed=%i, skipped=%i, notfound=%i, total time=%i, time per step=%f" %\ (self.mInput, self.mNPassed, self.mNFailed, self.mNSkipped, self.mNNotFound, t - self.mStartTime, float(self.mReportStep * ( t - self.mStartTime )) / self.mInput, ) ) if link.passed == "+": self.mOutfile.write( line ) self.mNPassed += 1 self.mNSkipped += 1 self.mOutput += 1 return query_nid, query_from, query_to = AddaIO.toTuple( link.qdomain ) sbjct_nid, sbjct_from, sbjct_to = AddaIO.toTuple( link.sdomain ) self.debug( "checking link between %i (%i-%i) and %i (%i-%i)" %\ (query_nid, query_from, query_to, sbjct_nid, sbjct_from, sbjct_to) ) passed, alignment, extra_info = self.mChecker( query_nid, query_from, query_to, sbjct_nid, sbjct_from, sbjct_to) if passed: code = "+" self.mNPassed += 1 else: code = "-" self.mNFailed += 1 self.mOutfile.write( "\t".join( ( link.qdomain, link.sdomain, link.weight, code, str(alignlib.AlignmentFormatEmissions( alignment )), str(alignment.getScore()), str(alignment.getNumAligned()), str(alignment.getNumGaps())) + extra_info ) + "\n" ) self.mOutfile.flush() self.mOutput += 1
def startUp(self):
    """open the output streams and load the id <-> nid mappings."""
    # results already present - nothing to set up
    if self.isComplete():
        return

    self.mOutfile = self.openOutputStream(self.mFilenameDomains)
    self.mOutfileFamilies = self.openOutputStream(self.mFilenameFamilies)

    self.mMapId2Nid = AddaIO.readMapId2Nid(open(self.mFilenamesNids, "r"))
    # invert the id -> nid map for reverse lookups
    self.mMapNid2Id = dict((nid, pid) for pid, nid in self.mMapId2Nid.iteritems())
def startUp(self):
    """prepare the output stream and the identifier maps."""
    if self.isComplete():
        return

    self.mOutfile = self.openOutputStream(self.mFilenameOutput)
    self.mMapId2Nid = AddaIO.readMapId2Nid(open(self.mFilenamesNids, "r"))

    # build the reverse map nid -> id
    inverse = {}
    for pid, nid in self.mMapId2Nid.iteritems():
        inverse[nid] = pid
    self.mMapNid2Id = inverse
def merge(self):
    """merge runs from parallel computations.

    Collects per-chunk profile libraries into a single library,
    warning about duplicate and unknown nids and filling in empty
    profiles for any nid that no chunk produced.

    returns True if merging was a success (no missing, duplicate or
    unknown nids).
    """
    if self.isComplete():
        return

    infiles = glob.glob("%s*" % self.mFilenameProfile)
    # remove suffixes to get the per-chunk library basenames
    infiles = list(set([x[:-4] for x in infiles if x != self.mFilenameProfile]))
    infiles.sort()

    found = set()
    ninput, noutput, nfound, nunknown, nduplicate = 0, 0, 0, 0, 0
    # the set of all nids that must end up in the merged library
    tokens = set(self.mFasta.keys())

    self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
        self.mFilenameProfile, "w")

    for filename in infiles:
        infile = ProfileLibrary.ProfileLibrary(filename, "r")
        for nid, profile in infile.iteritems_sorted():
            ninput += 1
            if nid in found:
                # BUG FIX: counter was misspelled "nduplicates", raising
                # NameError whenever a duplicate nid was encountered.
                nduplicate += 1
                self.warn("duplicate nid: %i in file %s" % (nid, filename))
            if nid not in tokens:
                nunknown += 1
                self.warn("unknown nid: %i in file %s" % (nid, filename))
            found.add(nid)
            nfound += 1
            self.mProfileLibrary.add(nid, profile)
            noutput += 1

    # add empty-neighbourhood profiles for nids absent from all chunks
    missing = tokens.difference(found)
    if len(missing) > 0:
        self.warn("the following nids were missing: %s" % str(missing))
        self.info("adding %i missing nids" % len(missing))
        for nid in missing:
            self.applyMethod(AddaIO.NeighboursRecord(nid, []))

    self.info("merging: parts=%i, ninput=%i, noutput=%i, nfound=%i, nmissing=%i, nduplicate=%i, nunknown=%i" %
              (len(infiles), ninput, noutput, nfound,
               len(missing), nduplicate, nunknown))

    # remove the per-chunk parts once they have been merged
    self.info("deleting %i parts" % len(infiles))
    for infile in infiles:
        fn, fi = ProfileLibrary.getFileNames(infile)
        os.remove(fn)
        os.remove(fi)

    return len(missing) == 0 and nduplicate == 0 and nunknown == 0
def applyMethod(self):
    """index the graph.
    """
    self.info("indexing of %s started" % self.mFilenameInputGraph)

    # location of the id -> nid map, taken from the configuration
    fn_nids = self.mConfig.get("files", "output_nids", "adda.nids")
    self.info("loading map_id2nid from %s" % fn_nids)
    infile = open(fn_nids)
    map_id2nid = AddaIO.readMapId2Nid(
        infile,
        storage=self.mConfig.get("files", "storage_nids", "memory"))
    infile.close()

    # stream the input graph through the neighbour iterator into cadda
    infile = AddaIO.openStream(self.mFilenameInputGraph)
    neighbours = cadda.PairsDBNeighboursIterator(
        self.mGraphIterator(infile, map_id2nid, self.mLogger),
        self.mLogger)
    cadda.indexGraph(neighbours,
                     len(map_id2nid),
                     self.mFilenameOutputGraph,
                     self.mFilenameOutputIndex,
                     self.mLogger)

    # free the (potentially large) map explicitly
    del map_id2nid
def applyMethod(self):
    """index the graph.

    Loads the id -> nid map named by the configuration, then streams
    the input graph through ``cadda.indexGraph`` to build the indexed
    output graph and its index file.
    """
    self.info("indexing of %s started" % self.mFilenameInputGraph)
    self.info("loading map_id2nid from %s" %
              self.mConfig.get("files", "output_nids", "adda.nids"))

    infile = open(self.mConfig.get("files", "output_nids", "adda.nids"))
    map_id2nid = AddaIO.readMapId2Nid(infile,
                                      storage=self.mConfig.get(
                                          "files", "storage_nids", "memory"))
    infile.close()

    # stream the input graph; the iterator translates ids to nids on the fly
    infile = AddaIO.openStream(self.mFilenameInputGraph)
    cadda.indexGraph(
        cadda.PairsDBNeighboursIterator(
            self.mGraphIterator(infile, map_id2nid, self.mLogger),
            self.mLogger),
        len(map_id2nid),
        self.mFilenameOutputGraph,
        self.mFilenameOutputIndex,
        self.mLogger)

    # release the (potentially large) map once indexing is done
    del map_id2nid
def merge(self):
    '''merge several runs.

    simply concatenate all files and reindex
    '''
    f = self.mFilenameOutputGraph
    if self.mNumChunks == 1:
        raise ValueError("merge called with only one chunk")
    # refuse to clobber an existing merged graph
    if os.path.exists(f):
        raise ValueError("file %s already exists - no merging" % f)

    self.info("merging file %s from %i chunks" % (f, self.mNumChunks))

    # check if all parts have finished and are present
    ff = []
    for chunk in range(self.mNumChunks):
        fn = f + self.getSlice(chunk)
        if not os.path.exists(fn):
            self.info("file %s is not present - merging aborted" % fn)
            return False
        ff.append(fn)

    self.info("all files present")

    # concatenate all chunk files into the final graph via the shell
    ff = " ".join(ff)
    self.execute("cat %s > %s" % (ff, f))

    self.info("rebuilding index")
    # NOTE(review): config sections here are "output"/"adda" whereas the
    # sibling merge variant reads "files" — confirm which section names
    # match the configuration schema actually in use.
    self.info("loading map_id2nid from %s" %
              self.mConfig.get("output", "nids", "adda.nids"))
    infile = open(self.mConfig.get("output", "nids", "adda.nids"))
    map_id2nid = AddaIO.readMapId2Nid(
        infile,
        storage=self.mConfig.get("adda", "storage_nids", "memory"))
    infile.close()

    self.info("starting the indexing")
    cadda.reindexGraph(len(map_id2nid),
                       self.mFilenameOutputGraph,
                       self.mFilenameOutputIndex,
                       self.mLogger)
    return True
def merge(self):
    '''merge several runs.

    simply concatenate all files and reindex
    '''
    target = self.mFilenameOutputGraph
    if self.mNumChunks == 1:
        raise ValueError("merge called with only one chunk")
    if os.path.exists(target):
        raise ValueError("file %s already exists - no merging" % target)

    self.info("merging file %s from %i chunks" % (target, self.mNumChunks))

    # check if all parts have finished and are present
    parts = []
    for chunk in range(self.mNumChunks):
        part = target + self.getSlice(chunk)
        if not os.path.exists(part):
            self.info("file %s is not present - merging aborted" % part)
            return False
        parts.append(part)

    self.info("all files present")

    # concatenate all chunk files into the merged graph via the shell
    self.execute("cat %s > %s" % (" ".join(parts), target))

    self.info("rebuilding index")
    self.info("loading map_id2nid from %s" %
              self.mConfig.get("files", "output_nids", "adda.nids"))
    infile = open(self.mConfig.get("files", "output_nids", "adda.nids"))
    map_id2nid = AddaIO.readMapId2Nid(
        infile,
        storage=self.mConfig.get("files", "storage_nids", "memory"))
    infile.close()

    cadda.reindexGraph(len(map_id2nid),
                       self.mFilenameOutputGraph,
                       self.mFilenameOutputIndex,
                       self.mLogger)
    return True
def finish(self):
    """finish processing.

    add entries for sequences who only appear in the sbjct field.
    """
    if not self.isSubset():
        nadded = 0
        # any nid without a profile gets an empty-neighbourhood entry
        for nid in sorted(self.mFasta.getContigSizes().keys()):
            if nid in self.mProfileLibrary:
                continue
            self.applyMethod(AddaIO.NeighboursRecord(nid, []))
            nadded += 1
        self.mOutput += nadded
        self.info("added %i profiles for sequences without neighbours" % nadded)

    self.mProfileLibrary.close()
    AddaModuleRecord.finish(self)
def applyMethod(self):
    """read the input fasta, drop oversized and duplicate sequences,
    and write the renumbered fasta file plus the nid table."""
    self.mInput = 0
    self.mOutput = 0
    self.mRemoved = 0
    self.mDuplicates = 0

    # use existing fasta file
    records = FastaIterator(AddaIO.openStream(self.mFilenameInputFasta))
    fasta = IndexedFasta.IndexedFasta(self.mFilenameOutputFasta, "w")
    outfile = self.openOutputStream(self.mFilenameNids)
    outfile.write("nid\tpid\thid\tlength\tsequence\n")

    next_nid = 1
    seen_hids = set()
    for record in records:
        self.mInput += 1
        # discard sequences exceeding the configured maximum length
        if len(record.sequence) > self.mMaxSequenceLength:
            self.mRemoved += 1
            continue
        # discard exact duplicates, identified by their hash id
        hid = self.getHID(record.sequence)
        if hid in seen_hids:
            self.mDuplicates += 1
            continue
        seen_hids.add(hid)
        outfile.write("%s\t%s\t%s\t%i\t%s\n" %
                      (next_nid, record.pid, hid,
                       len(record.sequence), record.sequence))
        fasta.addSequence(next_nid, record.sequence)
        next_nid += 1
        self.mOutput += 1

    fasta.close()
    outfile.close()
def applyMethod(self):
    """read sequences from the input fasta file, filter out sequences
    that are too long or exact duplicates, and write the surviving
    sequences to an indexed fasta file plus a nid lookup table.
    """
    self.mInput = 0
    self.mOutput = 0
    self.mRemoved = 0
    self.mDuplicates = 0

    # use existing fasta file
    iterator = FastaIterator(AddaIO.openStream(self.mFilenameInputFasta))
    fasta = IndexedFasta.IndexedFasta(self.mFilenameOutputFasta, "w")
    outfile = self.openOutputStream(self.mFilenameNids)
    outfile.write("nid\tpid\thid\tlength\tsequence\n")

    # nids are assigned sequentially starting at 1
    nid = 1
    hids = set()
    for seq in iterator:
        self.mInput += 1
        # skip sequences exceeding the configured maximum length
        if len(seq.sequence) > self.mMaxSequenceLength:
            self.mRemoved += 1
            continue
        # skip exact duplicates, detected via the sequence hash id
        hid = self.getHID(seq.sequence)
        if hid in hids:
            self.mDuplicates += 1
            continue
        hids.add(hid)
        outfile.write("%s\t%s\t%s\t%i\t%s\n" %
                      (nid, seq.pid, hid, len(seq.sequence), seq.sequence))
        fasta.addSequence(nid, seq.sequence)
        nid += 1
        self.mOutput += 1

    fasta.close()
    outfile.close()
def applyMethod(self):
    """index the graph.

    Runs the iterative cadda optimisation: configures the optimiser
    from the fit file and module attributes, then iterates until the
    absolute or relative improvement drops below its threshold or
    the maximum number of iterations is reached, saving partitions
    and progress plots after every iteration.
    """
    if self.isComplete():
        return

    self.info("setting parameters")

    # read the fitted exponential parameters from the fit file
    config = AddaIO.ConfigParser()
    config.read(self.mFilenameFit)
    self.mExponentialF = float(config.get("optimise", "exponential_f"))
    self.mExponentialE = float(config.get("optimise", "exponential_e"))

    # push all parameters into the cadda optimiser (module-global state)
    cadda.setFilenameGraph(self.mFilenameGraph)
    cadda.setFilenameIndex(self.mFilenameIndex)
    cadda.setFilenameTransfers(self.mFilenameTransfers)
    cadda.setFilenameDomains(self.mFilenameDomains)
    cadda.setLogLevel(self.mLogLevel)
    cadda.setReportStep(1000)
    cadda.setMaxIterations(self.mMaxIterations)
    cadda.setResolution(self.mResolution)
    cadda.setExponentialF(self.mExponentialF)
    cadda.setExponentialE(self.mExponentialE)

    self.info("optimisation started")
    cadda.dump_parameters()

    # a zero return value signals failure in the cadda API
    retval = cadda.optimise_initialise()
    if retval == 0:
        self.warn("initialisation failed")
    else:
        self.info("initialisation success")

    improvements = []
    # start with one partition per sequence
    domains = [self.mNSequences]

    for iteration in range(self.mMaxIterations):
        self.info("iteration %i: started" % iteration)
        t = time.time()

        improvement = cadda.optimise_iteration()
        # relative improvement is measured against the best seen so far;
        # the first iteration has nothing to compare to and uses 1
        if improvements:
            rel_improvement = improvement / max(improvements)
        else:
            rel_improvement = 1

        ndomains = cadda.optimise_get_num_partitions()

        self.info("iteration %i: finished in %i seconds: improvement=%f, relative improvement=%f, ndomains=%i" %
                  (iteration, time.time() - t,
                   improvement, rel_improvement, ndomains))

        # checkpoint the current partitioning after every iteration
        if cadda.optimise_save_partitions(self.mFilenameDomains):
            self.info("domains saved to %s" % self.mFilenameDomains)
        else:
            self.warn("saving domains to %s failed" % self.mFilenameDomains)

        improvements.append(improvement)
        domains.append(ndomains)

        # refresh the progress plots
        self.plotProgress(improvements,
                          self.mOutputFilenameProgressImprovement,
                          "progress: improvement")
        self.plotProgress(domains,
                          self.mOutputFilenameProgressDomains,
                          "progress: domains")
        self.plotProgress(map(lambda x: float(x) / self.mNSequences, domains),
                          self.mOutputFilenameProgressDomainsPerSequence,
                          "progress: domains per sequence")

        # convergence checks
        if improvement < self.mMinAbsImprovement:
            self.info("optimisation stopped because absolute improvement less than %f" %
                      (self.mMinAbsImprovement))
            break

        if rel_improvement < self.mMinRelImprovement:
            self.info("optimisation stopped because relative improvement less than %f" %
                      (self.mMinRelImprovement))
            break
    else:
        # for/else: only reached when the loop ran to completion
        self.info("optimisation stopped because maximum iteration %i reached" %
                  (self.mMaxIterations))

    retval = cadda.optimise_destroy()
    if retval == 0:
        self.warn("destruction failed")
    else:
        self.info("destruction success")
usage=globals()["__doc__"], ) parser.add_option("--config", dest="filename_config", type="string", help="configuration file [default=%default].") parser.set_defaults(filename_config="adda.ini", ) (options, args) = E.Start(parser) if len(args) == 0: raise ValueError("please supply one or more nids to test.") config = AddaIO.ConfigParser() config.read(os.path.expanduser(options.filename_config)) filename_graph = config.get("files", "output_graph", "adda.graph") filename_index = config.get("files", "output_index", "adda.graph.index") filename_fasta = config.get("files", "output_fasta", "adda") fasta = IndexedFasta.IndexedFasta(filename_fasta) index = cadda.IndexedNeighbours(filename_graph, filename_index) config.set("files", "output_segments", "test.segments") module = AddaSegment( config=config, fasta=fasta,
dest="method", type="choice", choices=("finish", "merge"), help="method to test " "[default=%default].") parser.set_defaults( filename_data=None, filename_overhang_hist=None, method="finish", filename_config="adda.ini", ) (options, args) = E.Start(parser) config = AddaIO.ConfigParser() config.read(os.path.expanduser(options.filename_config)) filename_graph = config.get("output", "output_graph", "adda.graph") filename_index = config.get("output", "output_index", "adda.graph.index") filename_fasta = config.get("output", "output_fasta", "adda") config.set("output", "output_fit", "test.fit") config.set("output", "output_fit_data", "test.fit.data") config.set("output", "output_fit_details", "test.fit.details") config.set("output", "output_fit_overhang", "test.fit.overhang") config.set("output", "output_fit_transfer", "test.fit.transfer") fasta = None module = AddaFit(