class ProgressiveNext(RoundedJob):
    """Toil job that folds the results of dependent sub-projects into this
    project, then schedules the alignment ("up") pass for one event.
    """

    def __init__(self, options, project, event, schedule, depProjects,
                 memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule
        self.depProjects = depProjects

    def run(self, fileStore):
        # Re-load the workflow configuration from the job store.
        configPath = fileStore.readGlobalFile(self.project.getConfigID())
        self.configNode = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        fileStore.logToMaster("Project has %i dependencies" %
                              len(self.depProjects))
        # Copy each dependency's experiment and reference-sequence IDs into
        # this project, but only for experiments that produced a reference.
        for projName, depProject in self.depProjects.items():
            for expName, expID in depProject.expIDMap.items():
                experiment = ExperimentWrapper(
                    ET.parse(fileStore.readGlobalFile(expID)).getroot())
                fileStore.logToMaster("Reference ID for experiment %s: %s" %
                                      (expName, experiment.getReferenceID()))
                if experiment.getReferenceID():
                    self.project.expIDMap[expName] = expID
                    self.project.outputSequenceIDMap[expName] = \
                        experiment.getReferenceID()

        eventExpWrapper = None
        logger.info("Progressive Next: " + self.event)
        # Virtual events have no alignment of their own; skip ProgressiveUp.
        if not self.schedule.isVirtual(self.event):
            eventExpWrapper = self.addChild(
                ProgressiveUp(self.options, self.project, self.event,
                              memory=self.configWrapper.getDefaultMemory())).rv()
        return self.addFollowOn(
            ProgressiveOut(self.options, self.project, self.event,
                           eventExpWrapper, self.schedule,
                           memory=self.configWrapper.getDefaultMemory())).rv()
class RunCactusPreprocessorThenProgressiveDown2(RoundedJob):
    """Toil job run after preprocessing: optionally exports the preprocessed
    sequences, logs their assembly stats, runs the progressive alignment, and
    finally merges the per-experiment HAL files.
    """

    def __init__(self, options, project, event, schedule,
                 memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule

    def run(self, fileStore):
        # Re-load the workflow configuration from the job store.
        configPath = fileStore.readGlobalFile(self.project.getConfigID())
        self.configNode = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        # Save preprocessed sequences
        if self.options.intermediateResultsUrl is not None:
            for genome, seqID in list(self.project.outputSequenceIDMap.items()):
                fileStore.exportFile(
                    seqID,
                    self.options.intermediateResultsUrl +
                    '-preprocessed-' + genome)

        # Log the stats for the preprocessed assemblies
        for name, sequence in list(self.project.outputSequenceIDMap.items()):
            self.addChildJobFn(logAssemblyStats, "After preprocessing",
                               name, sequence)

        progressiveJob = self.addChild(
            ProgressiveDown(options=self.options, project=self.project,
                            event=self.event, schedule=self.schedule,
                            memory=self.configWrapper.getDefaultMemory()))
        project = progressiveJob.rv()

        #Combine the smaller HAL files from each experiment
        return self.addFollowOnJobFn(
            exportHal, project=project,
            memory=self.configWrapper.getDefaultMemory(),
            disk=self.configWrapper.getExportHalDisk(),
            preemptable=False).rv()
class RunCactusPreprocessorThenProgressiveDown2(RoundedJob):
    """Second stage after preprocessing: export/log the preprocessed
    assemblies, kick off the progressive alignment, then combine the
    per-experiment HAL files in a follow-on.
    """

    def __init__(self, options, project, event, schedule,
                 memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule

    def run(self, fileStore):
        # Re-load the workflow configuration from the job store.
        self.configNode = ET.parse(
            fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        # Save preprocessed sequences
        if self.options.intermediateResultsUrl is not None:
            for genome, seqID in self.project.getOutputSequenceIDMap().items():
                fileStore.exportFile(
                    seqID,
                    self.options.intermediateResultsUrl +
                    '-preprocessed-' + genome)

        # Log the stats for the preprocessed assemblies
        for name, sequence in self.project.getOutputSequenceIDMap().items():
            self.addChildJobFn(logAssemblyStats, "After preprocessing",
                               name, sequence)

        project = self.addChild(
            ProgressiveDown(options=self.options,
                            project=self.project,
                            event=self.event,
                            schedule=self.schedule,
                            memory=self.configWrapper.getDefaultMemory())).rv()

        #Combine the smaller HAL files from each experiment
        return self.addFollowOnJobFn(
            exportHal,
            project=project,
            memory=self.configWrapper.getDefaultMemory(),
            disk=self.configWrapper.getExportHalDisk(),
            preemptable=False).rv()
class RunCactusPreprocessorThenProgressiveDown(RoundedJob):
    """Root Toil job: logs input assembly stats, runs the sequence
    preprocessor, computes the progressive schedule, and chains into the
    main progressive-down pipeline.
    """

    def __init__(self, options, project, memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project

    def run(self, fileStore):
        # Re-load the workflow configuration from the job store.
        configPath = fileStore.readGlobalFile(self.project.getConfigID())
        self.configNode = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()
        fileStore.logToMaster("Using the following configuration:\n%s" %
                              ET.tostring(self.configNode))

        # Log the stats for the un-preprocessed assemblies
        for name, sequence in self.project.inputSequenceIDMap.items():
            self.addChildJobFn(logAssemblyStats, "Before preprocessing",
                               name, sequence)

        # Create jobs to create the output sequences
        logger.info("Reading config file from: %s" %
                    self.project.getConfigID())
        configFile = fileStore.readGlobalFile(self.project.getConfigID())
        configNode = ET.parse(configFile).getroot()
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals() #This is necessary..
        #Add the preprocessor child job. The output is a job promise value that will be
        #converted into a list of the IDs of the preprocessed sequences in the follow on job.
        preprocessorJob = self.addChild(
            CactusPreprocessor(self.project.inputSequenceIDMap.values(),
                               configNode))
        rvs = [preprocessorJob.rv(i)
               for i in range(len(self.project.inputSequenceIDMap))]
        fileStore.logToMaster('input sequence IDs: %s' %
                              self.project.inputSequenceIDMap)
        # Pair each genome name with the promise of its preprocessed sequence.
        for genome, rv in zip(self.project.inputSequenceIDMap.keys(), rvs):
            self.project.outputSequenceIDMap[genome] = rv

        #Now build the progressive-down job
        schedule = Schedule()
        schedule.loadProject(self.project, fileStore=fileStore)
        schedule.compute()
        self.options.event = self.project.mcTree.getRootName()
        leafNames = [self.project.mcTree.getName(i)
                     for i in self.project.mcTree.getLeaves()]
        fileStore.logToMaster("Leaf names = %s" % leafNames)
        self.options.globalLeafEventSet = set(leafNames)

        return self.addFollowOn(
            RunCactusPreprocessorThenProgressiveDown2(
                options=self.options,
                project=self.project,
                event=self.options.event,
                schedule=schedule,
                memory=self.configWrapper.getDefaultMemory())).rv()
class ProgressiveOut(RoundedJob):
    """Toil job that records the experiment produced for an event and, if the
    schedule defines a follow-on event, chains into its ProgressiveDown job.
    Returns the (possibly updated) project.
    """

    def __init__(self, options, project, event, eventExpWrapper, schedule,
                 memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.eventExpWrapper = eventExpWrapper
        self.schedule = schedule

    def run(self, fileStore):
        # Re-load the workflow configuration from the job store.
        configPath = fileStore.readGlobalFile(self.project.getConfigID())
        self.configNode = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        # Virtual events have no experiment XML to persist.
        if not self.schedule.isVirtual(self.event):
            tmpExp = fileStore.getLocalTempFile()
            self.eventExpWrapper.writeXML(tmpExp)
            self.project.expIDMap[self.event] = \
                fileStore.writeGlobalFile(tmpExp)

        followOnEvent = self.schedule.followOn(self.event)
        if followOnEvent is not None:
            logger.info("Adding follow-on event %s" % followOnEvent)
            return self.addFollowOn(
                ProgressiveDown(self.options, self.project, followOnEvent,
                                self.schedule,
                                memory=self.configWrapper.getDefaultMemory())).rv()

        return self.project
def runCactusProgressive(options):
    """Build a progressive cactus project from *options* and run the full
    alignment workflow under Toil, exporting the resulting HAL file to
    ``options.outputHal``.
    """
    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)

            #import the sequences
            for genome, seq in project.inputSequenceMap.items():
                if os.path.isdir(seq):
                    # A directory of sequences: concatenate into one file.
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq)
                              for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                project.inputSequenceIDMap[genome] = toil.importFile(seq)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(
                RunCactusPreprocessorThenProgressiveDown(
                    options, project,
                    memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
class ProgressiveDown(RoundedJob):
    """Toil job that recursively launches ProgressiveDown jobs for every
    dependency of an event, then follows on with ProgressiveNext to merge
    their results.
    """

    def __init__(self, options, project, event, schedule,
                 memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule

    def run(self, fileStore):
        # Re-load the workflow configuration from the job store.
        configPath = fileStore.readGlobalFile(self.project.getConfigID())
        self.configNode = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()
        logger.info("Progressive Down: " + self.event)

        deps = self.schedule.deps(self.event)
        fileStore.logToMaster("There are %i dependent projects" % len(deps))
        # One recursive child per dependency; the promises are resolved in
        # the ProgressiveNext follow-on.
        depProjects = dict()
        for child in deps:
            fileStore.logToMaster("Adding dependent project %s" % child)
            childJob = self.addChild(
                ProgressiveDown(self.options, self.project, child,
                                self.schedule))
            depProjects[child] = childJob.rv()

        return self.addFollowOn(
            ProgressiveNext(self.options, self.project, self.event,
                            self.schedule, depProjects,
                            memory=self.configWrapper.getDefaultMemory())).rv()
def main():
    """Command-line entry point: parse arguments, adjust Toil defaults,
    build the progressive cactus project, and run the workflow.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    #Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument("--root", dest="root",
                        help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specifed then the root"
                        " of the tree is used. ",
                        default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest, locally-built docker container "
                        "rather than pulling from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()
    options.cactusDir = getTempDirectory()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir,
                          ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    # A directory of sequences: concatenate into one file.
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq)
                              for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            logger.info("Setting config id to: %s" % cactusConfigID)
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(
                RunCactusPreprocessorThenProgressiveDown(
                    options, project,
                    memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
class ProgressiveNext(RoundedJob):
    """Toil job that collects experiment/reference IDs from dependent
    sub-projects and then runs the alignment for this event.
    """

    def __init__(self, options, project, event, schedule, depProjects,
                 memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule
        self.depProjects = depProjects

    def run(self, fileStore):
        # Re-load the workflow configuration from the job store.
        self.configNode = ET.parse(
            fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        fileStore.logToMaster("Project has %i dependencies" %
                              len(self.depProjects))
        for projName in self.depProjects:
            depProject = self.depProjects[projName]
            for expName in depProject.expIDMap:
                expID = depProject.expIDMap[expName]
                experiment = ExperimentWrapper(
                    ET.parse(fileStore.readGlobalFile(expID)).getroot())
                referenceID = experiment.getReferenceID()
                fileStore.logToMaster("Reference ID for experiment %s: %s" %
                                      (expName, referenceID))
                # Only experiments that produced a reference are propagated.
                if referenceID:
                    self.project.expIDMap[expName] = expID
                    self.project.outputSequenceIDMap[expName] = referenceID

        eventExpWrapper = None
        logger.info("Progressive Next: " + self.event)
        if not self.schedule.isVirtual(self.event):
            # Real (non-virtual) events get their own alignment pass.
            upJob = self.addChild(
                ProgressiveUp(self.options, self.project, self.event,
                              memory=self.configWrapper.getDefaultMemory()))
            eventExpWrapper = upJob.rv()
        outJob = ProgressiveOut(self.options, self.project, self.event,
                                eventExpWrapper, self.schedule,
                                memory=self.configWrapper.getDefaultMemory())
        return self.addFollowOn(outJob).rv()
class ProgressiveDown(RoundedJob):
    """Recursive Toil job: descend the schedule, launching a child
    ProgressiveDown per dependency, then merge in a ProgressiveNext
    follow-on.
    """

    def __init__(self, options, project, event, schedule,
                 memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule

    def run(self, fileStore):
        # Re-load the workflow configuration from the job store.
        self.configNode = ET.parse(
            fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()
        logger.info("Progressive Down: " + self.event)

        dependencies = self.schedule.deps(self.event)
        fileStore.logToMaster("There are %i dependent projects" %
                              len(dependencies))
        depProjects = {}
        for child in dependencies:
            fileStore.logToMaster("Adding dependent project %s" % child)
            depProjects[child] = self.addChild(
                ProgressiveDown(self.options, self.project, child,
                                self.schedule)).rv()

        nextJob = ProgressiveNext(self.options, self.project, self.event,
                                  self.schedule, depProjects,
                                  memory=self.configWrapper.getDefaultMemory())
        return self.addFollowOn(nextJob).rv()
def main():
    """Command-line entry point: parse arguments, adjust Toil defaults,
    build the progressive cactus project, and run the workflow under Toil.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    #Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument("--root", dest="root",
                        help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specifed then the root"
                        " of the tree is used. ",
                        default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)

            #import the sequences
            seqIDs = []
            # FIX: was a Python 2 print statement ("print ..."), which is a
            # SyntaxError on Python 3; use the function form instead.
            print("Importing %s sequences" %
                  (len(project.getInputSequencePaths())))
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    # A directory of sequences: concatenate into one file.
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq)
                              for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(
                RunCactusPreprocessorThenProgressiveDown(
                    options, project,
                    memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
class RunCactusPreprocessorThenProgressiveDown(RoundedJob):
    """Root Toil job (getter-based variant): logs input assembly stats,
    schedules the preprocessor, computes the progressive schedule, and chains
    into RunCactusPreprocessorThenProgressiveDown2.
    """

    def __init__(self, options, project, memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project

    def run(self, fileStore):
        # Re-load the workflow configuration from the job store.
        self.configNode = ET.parse(
            fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()
        fileStore.logToMaster("Using the following configuration:\n%s" %
                              ET.tostring(self.configNode))

        # Log the stats for the un-preprocessed assemblies
        for name, sequence in self.project.getInputSequenceIDMap().items():
            self.addChildJobFn(logAssemblyStats, "Before preprocessing",
                               name, sequence)

        # Create jobs to create the output sequences
        logger.info("Reading config file from: %s" %
                    self.project.getConfigID())
        configFile = fileStore.readGlobalFile(self.project.getConfigID())
        configNode = ET.parse(configFile).getroot()
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals() #This is necessary..
        #Add the preprocessor child job. The output is a job promise value that will be
        #converted into a list of the IDs of the preprocessed sequences in the follow on job.
        inputSeqIDs = self.project.getInputSequenceIDs()
        preprocessorJob = self.addChild(
            CactusPreprocessor(inputSeqIDs, configNode))
        self.project.setOutputSequenceIDs(
            [preprocessorJob.rv(i) for i in range(len(inputSeqIDs))])

        #Now build the progressive-down job
        schedule = Schedule()
        schedule.loadProject(self.project, fileStore=fileStore)
        schedule.compute()
        self.options.event = self.project.mcTree.getRootName()
        leafNames = [self.project.mcTree.getName(i)
                     for i in self.project.mcTree.getLeaves()]
        fileStore.logToMaster("Leaf names = %s" % leafNames)
        self.options.globalLeafEventSet = set(leafNames)

        return self.addFollowOn(
            RunCactusPreprocessorThenProgressiveDown2(
                options=self.options,
                project=self.project,
                event=self.options.event,
                schedule=schedule,
                memory=self.configWrapper.getDefaultMemory())).rv()