def merge(self):
    """Merge the per-part profile libraries of a parallel run into one.

    All libraries whose files match ``self.mFilenameProfile + "*"`` are
    read in sorted order and every (nid, profile) pair is copied into a
    new library written to ``self.mFilenameProfile``.  Nids that are keys
    of ``self.mFasta`` but appear in no part are passed to
    ``self.applyMethod`` wrapped in an empty ``AddaIO.NeighboursRecord``.
    Finally the files of all parts are deleted.

    Returns None without doing anything if ``self.isComplete()`` is true.
    Otherwise returns True if merging was a success, i.e. no nid was
    missing, duplicated or unknown, and False if any was.
    """
    if self.isComplete():
        return

    target = self.mFilenameProfile

    # The glob yields the individual files of every part.  Dropping the
    # last four characters (assumed to be the file suffix - confirm against
    # ProfileLibrary.getFileNames) and de-duplicating leaves one name per
    # part.  Names that resolve to the merge target itself are skipped so
    # that the output is neither read back as a part nor removed by the
    # cleanup at the end.
    infiles = sorted(set(
        x[:-4] for x in glob.glob("%s*" % target)
        if x != target and x[:-4] != target))

    found = set()
    ninput, noutput, nfound, nunknown, nduplicate = 0, 0, 0, 0, 0
    tokens = set(self.mFasta.keys())

    self.mProfileLibrary = ProfileLibrary.ProfileLibrary(target, "w")

    for filename in infiles:
        part = ProfileLibrary.ProfileLibrary(filename, "r")

        for nid, profile in part.iteritems_sorted():
            ninput += 1

            if nid in found:
                nduplicate += 1
                self.warn("duplicate nid: %i in file %s" % (nid, filename))
            if nid not in tokens:
                nunknown += 1
                self.warn("unknown nid: %i in file %s" % (nid, filename))

            # Duplicate and unknown nids are only reported; the profile is
            # copied regardless and the counters decide the return value.
            found.add(nid)
            nfound += 1
            self.mProfileLibrary.add(nid, profile)
            noutput += 1

    missing = tokens.difference(found)
    if missing:
        self.warn("the following nids were missing: %s" % str(missing))
        self.info("adding %i missing nids" % len(missing))

        for nid in missing:
            self.applyMethod(AddaIO.NeighboursRecord(nid, []))

    self.info(
        "merging: parts=%i, ninput=%i, noutput=%i, nfound=%i, nmissing=%i, nduplicate=%i, nunknown=%i" %
        (len(infiles), ninput, noutput, nfound, len(missing), nduplicate, nunknown))

    self.info("deleting %i parts" % len(infiles))
    for part_name in infiles:
        fn, fi = ProfileLibrary.getFileNames(part_name)
        os.remove(fn)
        os.remove(fi)

    return not missing and nduplicate == 0 and nunknown == 0
def merge(self):
    """Merge the per-part profile libraries of a parallel run into one.

    All libraries whose files match ``self.mFilenameProfile + "*"`` are
    read in sorted order and every (nid, profile) pair is copied into a
    new library written to ``self.mFilenameProfile``.  Nids that are keys
    of ``self.mFasta`` but appear in no part are passed to
    ``self.applyMethod`` wrapped in an empty ``AddaIO.NeighboursRecord``.
    Finally the files of all parts are deleted.

    Returns None without doing anything if ``self.isComplete()`` is true.
    Otherwise returns True if merging was a success, i.e. no nid was
    missing, duplicated or unknown, and False if any was.
    """
    if self.isComplete():
        return

    target = self.mFilenameProfile

    # The glob yields the individual files of every part.  Dropping the
    # last four characters (assumed to be the file suffix - confirm against
    # ProfileLibrary.getFileNames) and de-duplicating leaves one name per
    # part.  Names that resolve to the merge target itself are skipped so
    # that the output is neither read back as a part nor removed by the
    # cleanup at the end.
    infiles = sorted(set(
        x[:-4] for x in glob.glob("%s*" % target)
        if x != target and x[:-4] != target))

    found = set()
    ninput, noutput, nfound, nunknown, nduplicate = 0, 0, 0, 0, 0
    tokens = set(self.mFasta.keys())

    self.mProfileLibrary = ProfileLibrary.ProfileLibrary(target, "w")

    for filename in infiles:
        part = ProfileLibrary.ProfileLibrary(filename, "r")

        for nid, profile in part.iteritems_sorted():
            ninput += 1

            if nid in found:
                nduplicate += 1
                self.warn("duplicate nid: %i in file %s" % (nid, filename))
            if nid not in tokens:
                nunknown += 1
                self.warn("unknown nid: %i in file %s" % (nid, filename))

            # Duplicate and unknown nids are only reported; the profile is
            # copied regardless and the counters decide the return value.
            found.add(nid)
            nfound += 1
            self.mProfileLibrary.add(nid, profile)
            noutput += 1

    missing = tokens.difference(found)
    if missing:
        self.warn("the following nids were missing: %s" % str(missing))
        self.info("adding %i missing nids" % len(missing))

        for nid in missing:
            self.applyMethod(AddaIO.NeighboursRecord(nid, []))

    self.info(
        "merging: parts=%i, ninput=%i, noutput=%i, nfound=%i, nmissing=%i, nduplicate=%i, nunknown=%i" %
        (len(infiles), ninput, noutput, nfound, len(missing), nduplicate, nunknown))

    self.info("deleting %i parts" % len(infiles))
    for part_name in infiles:
        fn, fi = ProfileLibrary.getFileNames(part_name)
        os.remove(fn)
        os.remove(fi)

    return not missing and nduplicate == 0 and nunknown == 0
def startUp(self):
    """Open the profile library for this slice and configure it.

    Does nothing if ``self.isComplete()`` is true.  In append mode
    (``self.mAppend``) the library is opened with mode "a" and
    ``self.mContinueAt`` is set to its last inserted key; otherwise it is
    opened with mode "w", passing ``self.mForce`` as ``force``.
    """
    if self.isComplete():
        return

    library_name = self.mFilenameProfile + self.getSlice()

    if not self.mAppend:
        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
            library_name, "w", force=self.mForce)
    else:
        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(library_name, "a")
        self.mContinueAt = self.mProfileLibrary.getLastInsertedKey()
        self.info("processing will continue after %s" % (str(self.mContinueAt)))

    # set default values
    # NOTE(review): placement after the if/else was reconstructed from a
    # whitespace-stripped source - confirm it is meant for both modes.
    library = self.mProfileLibrary
    library.setLogOddor(alignlib.makeLogOddorDirichlet(self.mScaleFactor))
    library.setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    library.setWeightor(alignlib.makeWeightor())

    protein_encoder = alignlib.getEncoder(alignlib.Protein20)
    alignlib.setDefaultEncoder(protein_encoder)
def outputSummaryProfiles(self):
    """Write a short summary of the profile library to ``self.mOutfile``.

    Opens the library ``self.mFilenameProfiles`` read-only (keeping it in
    ``self.mProfileLibrary``) and writes a ``>filename`` line followed by
    an ``nnids`` line holding the number of keys in the library and that
    number as a percentage of ``self.mNNids``.

    Returns a dict ``{'nids': count}``.  If the path does not exist,
    nothing is opened or written and the count is 0.
    """
    if not os.path.exists(self.mFilenameProfiles):
        return {'nids': 0}

    self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
        self.mFilenameProfiles, "r")
    nprofiles = len(self.mProfileLibrary.keys())

    # Report 0% instead of raising ZeroDivisionError when the reference
    # count is zero.
    if self.mNNids:
        percent = 100.0 * nprofiles / self.mNNids
    else:
        percent = 0.0

    self.mOutfile.write(">%s\n" % self.mFilenameProfiles)
    self.mOutfile.write("nnids\t%i\t%5.2f\n" % (nprofiles, percent))

    return {'nids': nprofiles}
def startUp(self):
    """Create the alignment objects and open the output stream.

    Does nothing if ``self.isComplete()`` is true.  Otherwise:

    * installs the Protein20 encoder and freshly created log-oddor,
      regularizor and weightor objects in alignlib's default toolkit,
      keeping references in ``self.mLogOddor``, ``self.mRegularizor`` and
      ``self.mWeightor``;
    * opens the prebuilt profile library read-only when
      ``self.mUsePrebuiltProfiles`` is set, otherwise sets
      ``self.mProfileLibrary`` to None and opens the indexed neighbours;
    * creates the alignator, an empty cache and zeroed counters;
    * opens the output stream and, unless ``self.mContinueAt`` is set,
      writes the tab-separated header line;
    * records the start time in ``self.mStartTime``.
    """
    if self.isComplete():
        return

    ###############################################
    # create objects for algorithm
    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    self.mLogOddor = alignlib.makeLogOddorDirichlet(self.mScaleFactor)
    self.mRegularizor = alignlib.makeRegularizorDirichletPrecomputed()
    self.mWeightor = alignlib.makeWeightor()

    alignlib.getDefaultToolkit().setRegularizor(self.mRegularizor)
    alignlib.getDefaultToolkit().setLogOddor(self.mLogOddor)
    alignlib.getDefaultToolkit().setWeightor(self.mWeightor)

    if self.mUsePrebuiltProfiles:
        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
            self.mFilenameProfiles, "r")
        self.mProfileLibrary.setWeightor(self.mWeightor)
        self.mProfileLibrary.setLogOddor(self.mLogOddor)
        self.mProfileLibrary.setRegularizor(self.mRegularizor)
    else:
        self.mProfileLibrary = None
        # NOTE(review): nesting reconstructed from a whitespace-stripped
        # source - confirm the neighbour index belongs to this branch only.
        self.mIndexedNeighbours = cadda.IndexedNeighbours(
            self.mFilenameGraph, self.mFilenameIndex)

    self.mChecker = self.checkLinkZScore

    # column names of the output table, in output order
    self.mHeader = ("qdomain", "sdomain", "weight", "passed",
                    "qstart", "qend", "qali",
                    "sstart", "send", "sali",
                    "score", "naligned", "ngaps", "zscore")

    # self.mGop / self.mGep are presumably the gap opening and gap
    # extension penalties - confirm against the alignlib signature.
    self.mAlignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_LOCAL, self.mGop, self.mGep)

    # the cache to store alignandum objects
    self.mCache = {}

    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.Protein20))

    # initialize counters
    self.mNPassed, self.mNFailed, self.mNNotFound = 0, 0, 0

    self.mOutfile = self.openOutputStream(self.mFilenameAlignments)

    # Only a fresh run gets a header; when continuing, the header line is
    # not written again.
    if self.mContinueAt is None:
        self.mOutfile.write("\t".join(self.mHeader) + "\n")
        # NOTE(review): flush assumed to belong to the header branch.
        self.mOutfile.flush()

    self.mStartTime = time.time()
def isComplete(self):
    """Return whether the profile library of this slice is complete.

    The library name is ``self.mFilenameProfile + self.getSlice()``; the
    second file name that ``ProfileLibrary.getFileNames`` returns for it
    is handed to ``SegmentedFile.isComplete``.
    """
    library_name = self.mFilenameProfile + self.getSlice()
    # only the second name is needed; the first one is ignored
    _first_file, second_file = ProfileLibrary.getFileNames(library_name)
    return SegmentedFile.isComplete(second_file)
def isComplete(self):
    """Return whether the profile library of this slice is complete.

    The library name is ``self.mFilenameProfile + self.getSlice()``; the
    second file name that ``ProfileLibrary.getFileNames`` returns for it
    is handed to ``SegmentedFile.isComplete``.
    """
    library_name = self.mFilenameProfile + self.getSlice()
    # only the second name is needed; the first one is ignored
    _first_file, second_file = ProfileLibrary.getFileNames(library_name)
    return SegmentedFile.isComplete(second_file)