def merge(self):
    """Merge the partial profile libraries written by parallel runs.

    All files whose name starts with ``self.mFilenameProfile`` (other than
    that file itself) are treated as parts. Every record found in a part
    is copied into a freshly opened output library. Duplicate nids and
    nids that are not keys of ``self.mFasta`` are reported via
    ``self.warn`` and counted, but are still written. Nids that are keys
    of ``self.mFasta`` but appear in no part are added by calling
    ``self.applyMethod`` with an empty neighbour list. Finally the part
    files are deleted.

    Returns:
        ``None`` if ``self.isComplete()`` is true (nothing is done).
        Otherwise ``True`` if no nid was missing, duplicated or unknown,
        and ``False`` if any of these occurred.
    """
    if self.isComplete():
        return

    # Strip the last four characters of every matching file name and
    # de-duplicate, so each part is listed once (presumably the parts
    # consist of several files differing only in a 4-character suffix;
    # verify against ProfileLibrary.getFileNames).
    infiles = sorted(set(
        x[:-4]
        for x in glob.glob("%s*" % self.mFilenameProfile)
        if x != self.mFilenameProfile))

    found = set()
    ninput, noutput, nfound, nunknown, nduplicate = 0, 0, 0, 0, 0
    tokens = set(self.mFasta.keys())

    self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
        self.mFilenameProfile, "w")

    for filename in infiles:
        infile = ProfileLibrary.ProfileLibrary(filename, "r")

        for nid, profile in infile.iteritems_sorted():
            ninput += 1

            if nid in found:
                # Fixed: this used to increment the undefined name
                # ``nduplicates`` and crashed on the first duplicate.
                nduplicate += 1
                self.warn("duplicate nid: %i in file %s" % (nid, filename))
            if nid not in tokens:
                nunknown += 1
                self.warn("unknown nid: %i in file %s" % (nid, filename))

            # Problematic records are only reported, not filtered out.
            found.add(nid)
            nfound += 1
            self.mProfileLibrary.add(nid, profile)
            noutput += 1

    missing = tokens.difference(found)
    if missing:
        self.warn("the following nids were missing: %s" % str(missing))
        self.info("adding %i missing nids" % len(missing))
        for nid in missing:
            self.applyMethod(AddaIO.NeighboursRecord(nid, []))

    self.info(
        "merging: parts=%i, ninput=%i, noutput=%i, nfound=%i, nmissing=%i, nduplicate=%i, nunknown=%i" %
        (len(infiles), ninput, noutput, nfound,
         len(missing), nduplicate, nunknown))

    self.info("deleting %i parts" % len(infiles))
    for part in infiles:
        fn, fi = ProfileLibrary.getFileNames(part)
        os.remove(fn)
        os.remove(fi)

    return len(missing) == 0 and nduplicate == 0 and nunknown == 0
def finish(self):
    """Finalise processing and close the profile library.

    Unless ``self.isSubset()`` is true, every nid returned by
    ``self.mFasta.getContigSizes()`` that is not yet contained in
    ``self.mProfileLibrary`` is processed with an empty neighbour list
    (per the original note: sequences that only appear in the sbjct
    field). The number of entries added this way is added to
    ``self.mOutput`` and logged. The library is then closed and the base
    class ``finish`` is invoked.
    """
    if not self.isSubset():
        added = 0

        for nid in sorted(self.mFasta.getContigSizes().keys()):
            # Test membership per iteration: applyMethod may modify the
            # library while the loop is running.
            if nid in self.mProfileLibrary:
                continue
            self.applyMethod(AddaIO.NeighboursRecord(nid, []))
            added += 1

        self.mOutput += added
        self.info(
            "added %i profiles for sequences without neighbours" % added)

    self.mProfileLibrary.close()
    AddaModuleRecord.finish(self)
# Test driver: run the module on the nids given in ``args``.
# NOTE(review): ``options`` and ``args`` are defined outside this section
# (presumably by command line parsing) - confirm against the caller.
config = AddaIO.ConfigParser()
config.read(os.path.expanduser(options.filename_config))

# File locations come from the [files] section; the third argument is
# the fallback value passed to config.get.
filename_graph = config.get("files", "output_graph", "adda.graph")
filename_index = config.get("files", "output_index", "adda.graph.index")
filename_fasta = config.get("files", "output_fasta", "adda")

fasta = IndexedFasta.IndexedFasta(filename_fasta)
index = cadda.IndexedNeighbours(filename_graph, filename_index)

# Override the configured segments output with a fixed test file name.
config.set("files", "output_segments", "test.segments")

module = AddaSegment(
    config=config,
    fasta=fasta,
)
module.startUp()

# Materialise the nids so the result is a list on Python 2 and 3 alike.
args = list(map(int, args))

for nid in args:
    neighbours = index.getNeighbours(nid)
    module.applyMethod(AddaIO.NeighboursRecord(nid, neighbours))

# Fixed: this was the bare attribute reference ``module.finish``, which
# evaluates the bound method without calling it, so the module was
# never finalised.
module.finish()

E.Stop()