def main(): parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help= "The location of the job store used by the workflow whose jobs should " "be killed." + jobStoreLocatorHelp) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) config = Config() config.setOptions(options) jobStore = Toil.resumeJobStore(config.jobStore) logger.info( "Starting routine to kill running jobs in the toil workflow: %s", config.jobStore) ####This behaviour is now broken batchSystem = Toil.createBatchSystem( jobStore.config ) #This should automatically kill the existing jobs.. so we're good. for jobID in batchSystem.getIssuedBatchJobIDs( ): #Just in case we do it again. batchSystem.killBatchJobs(jobID) logger.info("All jobs SHOULD have been killed")
def main():
    parser = parser_with_common_options()
    options = parser.parse_args()
    set_logging_from_options(options)
    config = Config()
    config.setOptions(options)
    config.jobStore = config.jobStore[5:] if config.jobStore.startswith('file:') else config.jobStore

    # ':' means an aws/google jobstore; use the old (broken?) method
    if ':' in config.jobStore:
        jobStore = Toil.resumeJobStore(config.jobStore)
        logger.info("Starting routine to kill running jobs in the toil workflow: %s", config.jobStore)
        # TODO: This behaviour is now broken: https://github.com/DataBiosphere/toil/commit/a3d65fc8925712221e4cda116d1825d4a1e963a1
        batchSystem = Toil.createBatchSystem(jobStore.config)  # Should automatically kill existing jobs, so we're good.
        for jobID in batchSystem.getIssuedBatchJobIDs():  # Just in case we do it again.
            batchSystem.killBatchJobs(jobID)
        logger.info("All jobs SHOULD have been killed")
    else:
        # otherwise, kill the pid recorded in the jobstore
        pid_log = os.path.join(os.path.abspath(config.jobStore), 'pid.log')
        with open(pid_log, 'r') as f:
            pid2kill = f.read().strip()
        try:
            os.kill(int(pid2kill), signal.SIGKILL)
            logger.info("Toil process %s successfully terminated." % str(pid2kill))
        except OSError:
            logger.error("Toil process %s could not be terminated." % str(pid2kill))
            raise
def main():
    parser = parser_with_common_options(jobstore_option=True)
    parser.add_argument("jobID", nargs=1,
                        help="The job store id of a job within the provided jobstore to run by itself.")
    parser.add_argument("--printJobInfo", nargs=1,
                        help="Return information about this job to the user including preceding jobs, "
                             "inputs, outputs, and runtime from the last known run.")
    options = parser.parse_args()
    set_logging_from_options(options)
    config = Config()
    config.setOptions(options)
    jobStore = Toil.resumeJobStore(config.jobStore)

    if options.printJobInfo:
        printContentsOfJobStore(jobStorePath=config.jobStore, nameOfJob=options.printJobInfo)

    # TODO: Option to print list of successor jobs
    # TODO: Option to run job within python debugger, allowing step through of arguments
    #       idea would be to have option to import pdb and set breakpoint at the start of the user's code

    jobID = options.jobID[0]
    logger.debug(f"Running the following job locally: {jobID}")
    workerScript(jobStore, config, jobID, jobID, redirectOutputToLogFile=False)
    logger.debug(f"Finished running: {jobID}")
def main(): parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of the job store to delete. " + jobStoreLocatorHelp) parser.add_argument("--version", action='version', version=version) config = Config() config.setOptions(parseBasicOptions(parser)) logger.info("Attempting to delete the job store") jobStore = Toil.getJobStore(config.jobStore) jobStore.destroy() logger.info("Successfully deleted the job store")
def _createDummyConfig(): """ Returns a dummy config for the batch system tests. We need a workflowID to be set up since we are running tests without setting up a jobstore. :rtype: toil.common.Config """ config = Config() from uuid import uuid4 config.workflowID = str(uuid4()) return config
def createConfig(cls): """ Returns a dummy config for the batch system tests. We need a workflowID to be set up since we are running tests without setting up a jobstore. This is the class version to be used when an instance is not available. :rtype: toil.common.Config """ config = Config() from uuid import uuid4 config.workflowID = str(uuid4()) return config
def main(): """ Reports stats on the workflow, use with --stats option to toil. """ parser = getBasicOptionParser() initializeOptions(parser) options = parseBasicOptions(parser) checkOptions(options, parser) config = Config() config.setOptions(options) jobStore = Toil.resumeJobStore(config.jobStore) stats = getStats(jobStore) collatedStatsTag = processData(jobStore.config, stats) reportData(collatedStatsTag, options)
def main(): parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help="The location of the job store used by the workflow." + jobStoreLocatorHelp) parser.add_argument("--localFilePath", nargs=1, help="Location to which to copy job store files.") parser.add_argument("--fetch", nargs="+", help="List of job-store files to be copied locally." "Use either explicit names (i.e. 'data.txt'), or " "specify glob patterns (i.e. '*.txt')") parser.add_argument( "--listFilesInJobStore", help="Prints a list of the current files in the jobStore.") parser.add_argument( "--fetchEntireJobStore", help="Copy all job store files into a local directory.") parser.add_argument( "--useSymlinks", help="Creates symlink 'shortcuts' of files in the localFilePath" " instead of hardlinking or copying, where possible. If this is" " not possible, it will copy the files (shutil.copyfile()).") parser.add_argument("--version", action='version', version=version) # Load the jobStore options = parseBasicOptions(parser) config = Config() config.setOptions(options) jobStore = Toil.resumeJobStore(config.jobStore) logger.debug("Connected to job store: %s", config.jobStore) if options.fetch: # Copy only the listed files locally logger.debug("Fetching local files: %s", options.fetch) fetchJobStoreFiles(jobStore=jobStore, options=options) elif options.fetchEntireJobStore: # Copy all jobStore files locally logger.debug("Fetching all local files.") options.fetch = "*" fetchJobStoreFiles(jobStore=jobStore, options=options) if options.listFilesInJobStore: # Log filenames and create a file containing these names in cwd printContentsOfJobStore(jobStorePath=options.jobStore)
def testMultipleJobsPerWorkerStats(self):
    """
    Tests the case where multiple jobs are run on 1 worker to ensure that all jobs report back their data.
    """
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.clean = 'never'
    options.stats = True
    Job.Runner.startToil(RunTwoJobsPerWorker(), options)
    config = Config()
    config.setOptions(options)
    jobStore = Toil.resumeJobStore(config.jobStore)
    stats = getStats(jobStore)
    collatedStats = processData(jobStore.config, stats)
    self.assertTrue(len(collatedStats.job_types) == 2,
                    "Some jobs are not represented in the stats")
def main(): """ This is a Toil pipeline to transfer TCGA data into an S3 Bucket Data is pulled down with Genetorrent and transferred to S3 via S3AM. """ # Define Parser object and add to toil def existing_file(fname): """ Argparse type for an existing file """ if not os.path.isfile(fname): raise ValueError("Invalid file: " + str(fname)) return fname parser = argparse.ArgumentParser( description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument( '--sudo', dest='sudo', default=None, action='store_true', help= 'Docker usually needs sudo to execute locally, but not when running Mesos or when ' 'the user is a member of a Docker group.') Job.Runner.addToilOptions(parser) parser.add_argument('datafiles', nargs='+', help='FASTA input', type=existing_file) args = parser.parse_args() assert args.jobStore is not None config = Config() config.setOptions(args) # Store inputs from argparse inputs = {'sudo': args.sudo} datafiles = [os.path.abspath(d) for d in args.datafiles] # Start Pipeline options = Job.Runner.getDefaultOptions("./toilWorkflow") Job.Runner.startToil(Job.wrapJobFn(start_batch, datafiles, inputs), options)
def setUp(self):
    super(WorkerTests, self).setUp()
    path = self._getTestJobStorePath()
    self.jobStore = FileJobStore(path)
    self.config = Config()
    self.config.jobStore = 'file:%s' % path
    self.jobStore.initialize(self.config)
    self.jobNumber = 0
def main(): parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of the job store used by the workflow whose jobs should " "be killed." + jobStoreLocatorHelp) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) config = Config() config.setOptions(options) jobStore = Toil.resumeJobStore(config.jobStore) logger.info("Starting routine to kill running jobs in the toil workflow: %s", config.jobStore) ####This behaviour is now broken batchSystem = Toil.createBatchSystem(jobStore.config) #This should automatically kill the existing jobs.. so we're good. for jobID in batchSystem.getIssuedBatchJobIDs(): #Just in case we do it again. batchSystem.killBatchJobs(jobID) logger.info("All jobs SHOULD have been killed")
def main(): parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of the job store to delete. " + jobStoreLocatorHelp) parser.add_argument("--version", action='version', version=version) config = Config() config.setOptions(parseBasicOptions(parser)) try: jobStore = Toil.getJobStore(config.jobStore) jobStore.resume() jobStore.destroy() logger.info("Successfully deleted the job store: %s" % config.jobStore) except NoSuchJobStoreException: logger.info("Failed to delete the job store: %s is non-existent" % config.jobStore) except: logger.info("Failed to delete the job store: %s" % config.jobStore) raise
def main() -> None: """Reports stats on the workflow, use with --stats option to toil.""" parser = parser_with_common_options() add_stats_options(parser) options = parser.parse_args() for c in options.categories.split(","): if c.strip() not in category_choices: raise ValueError(f'{c} not in {category_choices}!') options.categories = [ x.strip().lower() for x in options.categories.split(",") ] set_logging_from_options(options) config = Config() config.setOptions(options) jobStore = Toil.resumeJobStore(config.jobStore) stats = getStats(jobStore) collatedStatsTag = processData(jobStore.config, stats) reportData(collatedStatsTag, options)
def main(): parser = getBasicOptionParser() parser.add_argument( "jobStore", type=str, help="The location of the job store used by the workflow." + jobStoreLocatorHelp) parser.add_argument("jobID", nargs=1, help="The job store id of a job " "within the provided jobstore to run by itself.") parser.add_argument( "--printJobInfo", nargs=1, help="Return information about this job to the user" " including preceding jobs, inputs, outputs, and runtime" " from the last known run.") parser.add_argument("--version", action='version', version=version) # Parse options options = parseBasicOptions(parser) config = Config() config.setOptions(options) # Load the job store jobStore = Toil.resumeJobStore(config.jobStore) if options.printJobInfo: printContentsOfJobStore(jobStorePath=options.jobStore, nameOfJob=options.printJobInfo) # TODO: Option to print list of successor jobs # TODO: Option to run job within python debugger, allowing step through of arguments # idea would be to have option to import pdb and set breakpoint at the start of the user's code # Run the job locally jobID = options.jobID[0] logger.debug("Going to run the following job locally: %s", jobID) workerScript(jobStore, config, jobID, jobID, redirectOutputToLogFile=False) logger.debug("Ran the following job locally: %s", jobID)
def setUp(self):
    super(ClusterScalerTest, self).setUp()
    self.config = Config()
    self.config.targetTime = 1800
    self.config.nodeTypes = ['r3.8xlarge', 'c4.8xlarge:0.6']

    # Set up a stub provisioner with some nodeTypes and nodeShapes.
    # A bare object() instance cannot take new attributes, so use a SimpleNamespace as the stub.
    self.provisioner = types.SimpleNamespace()
    self.provisioner.nodeTypes = ['r3.8xlarge', 'c4.8xlarge']
    self.provisioner.nodeShapes = [r3_8xlarge, c4_8xlarge_preemptable]
    self.provisioner.setStaticNodes = lambda _, __: None
    self.provisioner.retryPredicate = lambda _: False

    self.leader = MockBatchSystemAndProvisioner(self.config, 1)
def setUp(self):
    super(ClusterScalerTest, self).setUp()
    self.config = Config()
    self.config.targetTime = 1800
    self.config.nodeTypes = [r3_8xlarge, c4_8xlarge_preemptable]

    # Set up the mock leader
    self.leader = MockBatchSystemAndProvisioner(self.config, 1)
    # It is also a full mock provisioner, so configure it to be that as well
    self.provisioner = self.leader
    # Pretend that Shapes are actually strings we can use for instance type names.
    self.provisioner.setAutoscaledNodeTypes([({t}, None) for t in self.config.nodeTypes])
def main() -> None:
    parser = parser_with_common_options()
    options = parser.parse_args()
    set_logging_from_options(options)
    config = Config()
    config.setOptions(options)

    job_store_type, _ = Toil.parseLocator(config.jobStore)

    if job_store_type != 'file':
        # Remote (aws/google) jobstore; use the old (broken?) method
        job_store = Toil.resumeJobStore(config.jobStore)
        logger.info("Starting routine to kill running jobs in the toil workflow: %s", config.jobStore)
        # TODO: This behaviour is now broken: https://github.com/DataBiosphere/toil/commit/a3d65fc8925712221e4cda116d1825d4a1e963a1
        # There's no guarantee that the batch system in use can enumerate
        # running jobs belonging to the job store we've attached to. And
        # moreover we don't even bother trying to kill the leader at its
        # recorded PID, even if it is a local process.
        batch_system = Toil.createBatchSystem(job_store.config)  # Should automatically kill existing jobs, so we're good.
        for job_id in batch_system.getIssuedBatchJobIDs():  # Just in case we do it again.
            batch_system.killBatchJobs([job_id])
        logger.info("All jobs SHOULD have been killed")
    else:
        # otherwise, kill the pid recorded in the jobstore.
        # TODO: We assume this is a local PID.
        job_store = Toil.resumeJobStore(config.jobStore)
        assert isinstance(job_store, FileJobStore), "Need a FileJobStore which has a sharedFilesDir"
        pid_log = os.path.join(job_store.sharedFilesDir, 'pid.log')
        with open(pid_log) as f:
            pid_to_kill = f.read().strip()
        try:
            os.kill(int(pid_to_kill), signal.SIGTERM)
            logger.info("Toil process %s successfully terminated." % str(pid_to_kill))
        except OSError:
            logger.error("Toil process %s could not be terminated." % str(pid_to_kill))
            raise
def test(self):
    # We'll use fractions to avoid rounding errors. Remember that not every fraction can be
    # represented as a floating point number.
    F = Fraction

    # This test isn't general enough to cover every possible value of minCores in
    # SingleMachineBatchSystem. Instead we hard-code a value and assert it.
    minCores = F(1, 10)
    self.assertEquals(float(minCores), SingleMachineBatchSystem.minCores)

    for maxCores in {F(minCores), minCores * 10, F(1), F(numCores, 2), F(numCores)}:
        for coresPerJob in {F(minCores), F(minCores * 10), F(1), F(maxCores, 2), F(maxCores)}:
            for load in (F(1, 10), F(1), F(10)):
                jobs = int(maxCores / coresPerJob * load)
                if jobs >= 1 and minCores <= coresPerJob < maxCores:
                    self.assertEquals(maxCores, float(maxCores))
                    bs = SingleMachineBatchSystem(config=Config(),
                                                  maxCores=float(maxCores),
                                                  # Ensure that memory or disk requirements
                                                  # don't get in the way.
                                                  maxMemory=jobs * 10,
                                                  maxDisk=jobs * 10)
                    try:
                        jobIds = set()
                        for i in range(0, int(jobs)):
                            jobIds.add(bs.issueBatchJob(command=self.scriptCommand(),
                                                        cores=float(coresPerJob),
                                                        memory=1,
                                                        disk=1,
                                                        preemptable=preemptable))
                        self.assertEquals(len(jobIds), jobs)
                        while jobIds:
                            job = bs.getUpdatedBatchJob(maxWait=10)
                            self.assertIsNotNone(job)
                            jobId, status, wallTime = job
                            self.assertEquals(status, 0)
                            # would raise KeyError on absence
                            jobIds.remove(jobId)
                    finally:
                        bs.shutdown()
                    concurrentTasks, maxConcurrentTasks = getCounters(self.counterPath)
                    self.assertEquals(concurrentTasks, 0)
                    log.info('maxCores: {maxCores}, '
                             'coresPerJob: {coresPerJob}, '
                             'load: {load}'.format(**locals()))
                    # This is the key assertion:
                    expectedMaxConcurrentTasks = min(maxCores / coresPerJob, jobs)
                    self.assertEquals(maxConcurrentTasks, expectedMaxConcurrentTasks)
                    resetCounters(self.counterPath)
def _createDummyConfig():
    config = Config()
    """
    config = ElementTree.Element("config")
    config.attrib["log_level"] = 'DEBUG'
    config.attrib["job_store"] = '.'
    config.attrib["parasol_command"] = None
    config.attrib["try_count"] = str(2)
    config.attrib["max_job_duration"] = str(1)
    config.attrib["batch_system"] = None
    config.attrib["max_log_file_size"] = str(1)
    config.attrib["default_memory"] = str(1)
    config.attrib["default_cores"] = str(1)
    config.attrib["max_cores"] = str(1)
    config.attrib["max_memory"] = str(1)
    config.attrib["scale"] = str(1)
    """
    return config
def setUp(self):
    super(ClusterScalerTest, self).setUp()
    self.config = Config()
    self.config.targetTime = 1800
    self.config.nodeTypes = ['r3.8xlarge', 'c4.8xlarge:0.6']

    # Set up a stub provisioner with some nodeTypes and nodeShapes.
    try:
        # In Python 3 we can use a SimpleNamespace as a mock provisioner
        self.provisioner = types.SimpleNamespace()
    except:
        # In Python 2 we can just tack fields onto an object
        self.provisioner = object()
    setattr(self.provisioner, 'nodeTypes', ['r3.8xlarge', 'c4.8xlarge'])
    setattr(self.provisioner, 'nodeShapes', [r3_8xlarge, c4_8xlarge_preemptable])
    setattr(self.provisioner, 'setStaticNodes', lambda _, __: None)
    setattr(self.provisioner, 'retryPredicate', lambda _: False)

    self.leader = MockBatchSystemAndProvisioner(self.config, 1)
def setUp(self):
    super(ClusterScalerTest, self).setUp()
    self.config = Config()
    self.config.targetTime = 1800
    self.config.nodeTypes = ['r3.8xlarge', 'c4.8xlarge:0.6']

    # Set up a stub provisioner with some nodeTypes and nodeShapes.
    try:
        # In Python 3 we can use a SimpleNamespace as a mock provisioner
        self.provisioner = types.SimpleNamespace()
    except:
        # In Python 2 we should just be able to tack fields onto an object.
        # But this has been known to produce:
        #   AttributeError: 'newobject' object has no attribute 'nodeTypes'
        # So we use an Argparse Namespace instead.
        import argparse
        self.provisioner = argparse.Namespace()
    setattr(self.provisioner, 'nodeTypes', ['r3.8xlarge', 'c4.8xlarge'])
    setattr(self.provisioner, 'nodeShapes', [r3_8xlarge, c4_8xlarge_preemptable])
    setattr(self.provisioner, 'setStaticNodes', lambda _, __: None)
    setattr(self.provisioner, 'retryPredicate', lambda _: False)

    self.leader = MockBatchSystemAndProvisioner(self.config, 1)
def _createDummyConfig(self): return Config()
def testClusterScalingMultipleNodeTypes(self):
    smallNode = Shape(20, 5, 10, 10, False)
    mediumNode = Shape(20, 10, 10, 10, False)
    largeNode = Shape(20, 20, 10, 10, False)

    numJobs = 100

    config = Config()

    # Make defaults dummy values
    config.defaultMemory = 1
    config.defaultCores = 1
    config.defaultDisk = 1

    # No preemptable nodes/jobs
    config.preemptableNodeTypes = []
    config.minPreemptableNodes = []
    config.maxPreemptableNodes = []  # No preemptable nodes

    # Make sure the node types don't have to be ordered
    config.nodeTypes = [largeNode, smallNode, mediumNode]
    config.minNodes = [0, 0, 0]
    config.maxNodes = [10, 10]  # test expansion of this list

    # Algorithm parameters
    config.targetTime = defaultTargetTime
    config.betaInertia = 0.1
    config.scaleInterval = 3

    mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
    clusterScaler = ScalerThread(mock, mock, config)
    clusterScaler.start()
    mock.start()

    try:
        # Add small jobs
        list(map(lambda x: mock.addJob(jobShape=smallNode), list(range(numJobs))))
        list(map(lambda x: mock.addJob(jobShape=mediumNode), list(range(numJobs))))

        # Add medium completed jobs
        for i in range(1000):
            iJ = JobNode(jobStoreID=1,
                         requirements=dict(memory=random.choice(range(smallNode.memory, mediumNode.memory)),
                                           cores=mediumNode.cores,
                                           disk=largeNode.cores,
                                           preemptable=False),
                         command=None,
                         jobName='testClusterScaling',
                         unitName='')
            clusterScaler.addCompletedJob(iJ, random.choice(range(1, 10)))

        while mock.getNumberOfJobsIssued() > 0 or mock.getNumberOfNodes() > 0:
            logger.debug("%i nodes currently provisioned" % mock.getNumberOfNodes())
            # Make sure there are no large nodes
            self.assertEqual(mock.getNumberOfNodes(nodeType=largeNode), 0)
            clusterScaler.check()
            time.sleep(0.5)
    finally:
        clusterScaler.shutdown()
        mock.shutDown()

    # Make sure jobs ran on both the small and medium node types
    self.assertTrue(mock.totalJobs > 0)
    self.assertTrue(mock.maxWorkers[smallNode] > 0)
    self.assertTrue(mock.maxWorkers[mediumNode] > 0)
    self.assertEqual(mock.maxWorkers[largeNode], 0)
def main(): """Reports the state of a Toil workflow.""" parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of a job store that holds the information about the " "workflow whose status is to be reported on." + jobStoreLocatorHelp) parser.add_argument("--failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--noAggStats", dest="stats", action="store_false", help="Do not print overall, aggregate status of workflow.", default=True) parser.add_argument("--printDot", action="store_true", help="Print dot formatted description of the graph. If using --jobs will " "restrict to subgraph including only those jobs. default=%(default)s", default=False) parser.add_argument("--jobs", nargs='+', help="Restrict reporting to the following jobs (allows subsetting of the report).", default=None) parser.add_argument("--printPerJobStats", action="store_true", help="Print info about each job. default=%(default)s", default=False) parser.add_argument("--printLogs", action="store_true", help="Print the log files of jobs (if they exist). default=%(default)s", default=False) parser.add_argument("--printChildren", action="store_true", help="Print children of each job. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) config = Config() config.setOptions(options) jobStore = Toil.resumeJobStore(config.jobStore) ########################################## # Gather the jobs to report ########################################## # Gather all jobs in the workflow in jobsToReport if options.jobs == None: rootJob = fetchRootJob(jobStore) logger.info('Traversing the job graph gathering jobs. This may take a couple of minutes.') jobsToReport = traverseJobGraph(rootJob, jobStore) # Only gather jobs specified in options.jobs else: jobsToReport = fetchUserJobs(jobStore, jobs=options.jobs) ########################################## # Report on the jobs ########################################## jobStats = report_on_jobs(jobsToReport, jobStore, options) hasChildren = jobStats['hasChildren'] readyToRun = jobStats['readyToRun'] zombies = jobStats['zombies'] hasServices = jobStats['hasServices'] services = jobStats['services'] hasLogFile = jobStats['hasLogFile'] properties = jobStats['properties'] childNumber = jobStats['childNumber'] if options.printPerJobStats: printAggregateJobStats(jobsToReport, properties, childNumber) if options.printLogs: printJobLog(jobsToReport, jobStore) if options.printChildren: printJobChildren(jobsToReport) if options.printDot: print_dot_chart(jobsToReport, jobStore_name=config.jobStore) if options.stats: print('Of the %i jobs considered, ' 'there are %i jobs with children, ' '%i jobs ready to run, ' '%i zombie jobs, ' '%i jobs with services, ' '%i services, ' 'and %i jobs with log files currently in %s.' % (len(jobsToReport), len(hasChildren), len(readyToRun), len(zombies), len(hasServices), len(services), len(hasLogFile), config.jobStore)) if len(jobsToReport) > 0 and options.failIfNotComplete: # Upon workflow completion, all jobs will have been removed from job store exit(1)
def testClusterScalingWithPreemptableJobs(self): """ Test scaling simultaneously for a batch of preemptable and non-preemptable jobs. """ config = Config() jobShape = Shape(20, 10, 10, 10, False) preemptableJobShape = Shape(20, 10, 10, 10, True) # Make defaults dummy values config.defaultMemory = 1 config.defaultCores = 1 config.defaultDisk = 1 # non-preemptable node parameters config.nodeTypes = [jobShape, preemptableJobShape] config.minNodes = [0, 0] config.maxNodes = [10, 10] # Algorithm parameters config.targetTime = defaultTargetTime config.betaInertia = 0.9 config.scaleInterval = 3 self._testClusterScaling(config, numJobs=100, numPreemptableJobs=100, jobShape=jobShape)
def main(args=None, stdout=sys.stdout):
    config = Config()
    config.cwl = True
    parser = argparse.ArgumentParser()
    addOptions(parser, config)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", nargs=argparse.REMAINDER)

    # Will override the "jobStore" positional argument, enables
    # user to select jobStore or get a default from logic one below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--not-strict", action="store_true")
    parser.add_argument("--no-container", action="store_true")
    parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=baseVersion)
    parser.add_argument("--user-space-docker-cmd",
                        help="(Linux/OS X only) Specify a user space docker "
                             "command (like udocker or dx-docker) that will be "
                             "used to call 'pull' and 'run'")
    parser.add_argument("--preserve-environment", type=str, nargs='+',
                        help="Preserve specified environment variables when running CommandLineTools",
                        metavar=("VAR1 VAR2"),
                        default=("PATH",),
                        dest="preserve_environment")
    # help="Dependency resolver configuration file describing how to adapt 'SoftwareRequirement' packages to current system."
    parser.add_argument("--beta-dependency-resolvers-configuration", default=None)
    # help="Default root directory used by dependency resolvers configuration."
    parser.add_argument("--beta-dependencies-directory", default=None)
    # help="Use biocontainers for tools without an explicitly annotated Docker container."
    parser.add_argument("--beta-use-biocontainers", default=None, action="store_true")
    # help="Short cut to use Conda to resolve 'SoftwareRequirement' packages."
    parser.add_argument("--beta-conda-dependencies", default=None, action="store_true")
    parser.add_argument("--tmpdir-prefix", type=Text,
                        help="Path prefix for temporary directories", default="tmp")
    parser.add_argument("--tmp-outdir-prefix", type=Text,
                        help="Path prefix for intermediate output directories", default="tmp")

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so make it and delete it and allow
    # toil to create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

    options = parser.parse_args([workdir] + args)

    use_container = not options.no_container

    if options.logLevel:
        cwllogger.setLevel(options.logLevel)

    outdir = os.path.abspath(options.outdir)

    fileindex = {}
    existing = {}
    make_tool_kwargs = {}
    conf_file = getattr(options, "beta_dependency_resolvers_configuration", None)  # Text
    use_conda_dependencies = getattr(options, "beta_conda_dependencies", None)  # Text
    job_script_provider = None
    if conf_file or use_conda_dependencies:
        dependencies_configuration = DependenciesConfiguration(options)  # type: DependenciesConfiguration
        job_script_provider = dependencies_configuration

    options.default_container = None
    make_tool_kwargs["find_default_container"] = functools.partial(find_default_container, options)

    with Toil(options) as toil:
        if options.restart:
            outobj = toil.restart()
        else:
            useStrict = not options.not_strict
            make_tool_kwargs["hints"] = [{
                "class": "ResourceRequirement",
                "coresMin": toil.config.defaultCores,
                "ramMin": toil.config.defaultMemory / (2**20),
                "outdirMin": toil.config.defaultDisk / (2**20),
                "tmpdirMin": 0
            }]
            try:
                t = cwltool.load_tool.load_tool(options.cwltool, toilMakeTool,
                                                kwargs=make_tool_kwargs,
                                                resolver=cwltool.resolver.tool_resolver,
                                                strict=useStrict)
                unsupportedRequirementsCheck(t.requirements)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            if type(t) == int:
                return t

            options.workflow = options.cwltool
            options.job_order = options.cwljob
            options.tool_help = None
            options.debug = options.logLevel == "DEBUG"
            job, options.basedir, loader = cwltool.main.load_job_order(
                options, sys.stdin, None, [], options.job_order)
            job = cwltool.main.init_job_order(job, options, t, loader=loader)

            fillInDefaults(t.tool["inputs"], job)

            def pathToLoc(p):
                if "location" not in p and "path" in p:
                    p["location"] = p["path"]
                    del p["path"]

            def importFiles(tool):
                visit_class(tool, ("File", "Directory"), pathToLoc)
                normalizeFilesDirs(tool)
                adjustDirObjs(tool, functools.partial(get_listing,
                                                      cwltool.stdfsaccess.StdFsAccess(""),
                                                      recursive=True))
                adjustFileObjs(tool, functools.partial(uploadFile, toil.importFile,
                                                       fileindex, existing, skip_broken=True))

            t.visit(importFiles)

            for inp in t.tool["inputs"]:
                def setSecondary(fileobj):
                    if isinstance(fileobj, dict) and fileobj.get("class") == "File":
                        if "secondaryFiles" not in fileobj:
                            fileobj["secondaryFiles"] = [{
                                "location": cwltool.builder.substitute(fileobj["location"], sf),
                                "class": "File"} for sf in inp["secondaryFiles"]]
                    if isinstance(fileobj, list):
                        for e in fileobj:
                            setSecondary(e)

                if shortname(inp["id"]) in job and inp.get("secondaryFiles"):
                    setSecondary(job[shortname(inp["id"])])

            importFiles(job)
            visitSteps(t, importFiles)

            try:
                make_opts = copy.deepcopy(vars(options))
                make_opts.update({'tool': t, 'jobobj': {},
                                  'use_container': use_container,
                                  'tmpdir': os.path.realpath(outdir),
                                  'job_script_provider': job_script_provider})
                (wf1, wf2) = makeJob(**make_opts)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            wf1.cwljob = job
            outobj = toil.start(wf1)

        outobj = resolve_indirect(outobj)

        toilStageFiles(toil, outobj, outdir, fileindex, existing, True)

        visit_class(outobj, ("File",), functools.partial(compute_checksums,
                                                         cwltool.stdfsaccess.StdFsAccess("")))

        stdout.write(json.dumps(outobj, indent=4))

    return 0
def main() -> None: """Reports the state of a Toil workflow.""" parser = parser_with_common_options() parser.add_argument( "--failIfNotComplete", action="store_true", help= "Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument( "--noAggStats", dest="stats", action="store_false", help="Do not print overall, aggregate status of workflow.", default=True) parser.add_argument( "--printDot", action="store_true", help= "Print dot formatted description of the graph. If using --jobs will " "restrict to subgraph including only those jobs. default=%(default)s", default=False) parser.add_argument( "--jobs", nargs='+', help= "Restrict reporting to the following jobs (allows subsetting of the report).", default=None) parser.add_argument("--printPerJobStats", action="store_true", help="Print info about each job. default=%(default)s", default=False) parser.add_argument( "--printLogs", action="store_true", help="Print the log files of jobs (if they exist). default=%(default)s", default=False) parser.add_argument("--printChildren", action="store_true", help="Print children of each job. default=%(default)s", default=False) options = parser.parse_args() set_logging_from_options(options) if len(sys.argv) == 1: parser.print_help() sys.exit(0) config = Config() config.setOptions(options) try: status = ToilStatus(config.jobStore, options.jobs) except NoSuchJobStoreException: print('No job store found.') return except JobException: # Workflow likely complete, user informed in ToilStatus() return jobStats = status.report_on_jobs() # Info to be reported. hasChildren = jobStats['hasChildren'] readyToRun = jobStats['readyToRun'] zombies = jobStats['zombies'] hasServices = jobStats['hasServices'] services = jobStats['services'] hasLogFile = jobStats['hasLogFile'] properties = jobStats['properties'] childNumber = jobStats['childNumber'] if options.printPerJobStats: status.printAggregateJobStats(properties, childNumber) if options.printLogs: status.printJobLog() if options.printChildren: status.printJobChildren() if options.printDot: status.print_dot_chart() if options.stats: print('Of the %i jobs considered, ' 'there are %i jobs with children, ' '%i jobs ready to run, ' '%i zombie jobs, ' '%i jobs with services, ' '%i services, ' 'and %i jobs with log files currently in %s.' % (len(status.jobsToReport), len(hasChildren), len(readyToRun), len(zombies), len(hasServices), len(services), len(hasLogFile), status.jobStore)) if len(status.jobsToReport) > 0 and options.failIfNotComplete: # Upon workflow completion, all jobs will have been removed from job store exit(1)
def main(args=None, stdout=sys.stdout): """Main method for toil-cwl-runner.""" cwllogger.removeHandler(defaultStreamHandler) config = Config() config.cwl = True parser = argparse.ArgumentParser() addOptions(parser, config) parser.add_argument("cwltool", type=str) parser.add_argument("cwljob", nargs=argparse.REMAINDER) # Will override the "jobStore" positional argument, enables # user to select jobStore or get a default from logic one below. parser.add_argument("--jobStore", type=str) parser.add_argument("--not-strict", action="store_true") parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR") parser.add_argument("--basedir", type=str) parser.add_argument("--outdir", type=str, default=os.getcwd()) parser.add_argument("--version", action='version', version=baseVersion) dockergroup = parser.add_mutually_exclusive_group() dockergroup.add_argument( "--user-space-docker-cmd", help="(Linux/OS X only) Specify a user space docker command (like " "udocker or dx-docker) that will be used to call 'pull' and 'run'") dockergroup.add_argument( "--singularity", action="store_true", default=False, help="[experimental] Use Singularity runtime for running containers. " "Requires Singularity v2.3.2+ and Linux with kernel version v3.18+ or " "with overlayfs support backported.") dockergroup.add_argument( "--no-container", action="store_true", help="Do not execute jobs in a " "Docker container, even when `DockerRequirement` " "is specified under `hints`.") parser.add_argument( "--preserve-environment", type=str, nargs='+', help="Preserve specified environment variables when running" " CommandLineTools", metavar=("VAR1 VAR2"), default=("PATH",), dest="preserve_environment") parser.add_argument( "--destBucket", type=str, help="Specify a cloud bucket endpoint for output files.") parser.add_argument( "--beta-dependency-resolvers-configuration", default=None) parser.add_argument("--beta-dependencies-directory", default=None) parser.add_argument( "--beta-use-biocontainers", default=None, action="store_true") parser.add_argument( "--beta-conda-dependencies", default=None, action="store_true") parser.add_argument("--tmpdir-prefix", type=Text, help="Path prefix for temporary directories", default="tmp") parser.add_argument("--tmp-outdir-prefix", type=Text, help="Path prefix for intermediate output directories", default="tmp") parser.add_argument( "--force-docker-pull", action="store_true", default=False, dest="force_docker_pull", help="Pull latest docker image even if it is locally present") parser.add_argument( "--no-match-user", action="store_true", default=False, help="Disable passing the current uid to `docker run --user`") # mkdtemp actually creates the directory, but # toil requires that the directory not exist, # so make it and delete it and allow # toil to create it again (!) 
workdir = tempfile.mkdtemp() os.rmdir(workdir) if args is None: args = sys.argv[1:] # we use workdir as jobStore: options = parser.parse_args([workdir] + args) # if tmpdir_prefix is not the default value, set workDir too if options.tmpdir_prefix != 'tmp': options.workDir = options.tmpdir_prefix if options.provisioner and not options.jobStore: raise NoSuchJobStoreException( 'Please specify a jobstore with the --jobStore option when specifying a provisioner.') use_container = not options.no_container if options.logLevel: cwllogger.setLevel(options.logLevel) outdir = os.path.abspath(options.outdir) tmp_outdir_prefix = os.path.abspath(options.tmp_outdir_prefix) tmpdir_prefix = os.path.abspath(options.tmpdir_prefix) fileindex = {} existing = {} conf_file = getattr(options, "beta_dependency_resolvers_configuration", None) use_conda_dependencies = getattr(options, "beta_conda_dependencies", None) job_script_provider = None if conf_file or use_conda_dependencies: dependencies_configuration = DependenciesConfiguration(options) job_script_provider = dependencies_configuration options.default_container = None runtime_context = cwltool.context.RuntimeContext(vars(options)) runtime_context.find_default_container = functools.partial( find_default_container, options) runtime_context.workdir = workdir runtime_context.move_outputs = "leave" runtime_context.rm_tmpdir = False loading_context = cwltool.context.LoadingContext(vars(options)) with Toil(options) as toil: if options.restart: outobj = toil.restart() else: loading_context.hints = [{ "class": "ResourceRequirement", "coresMin": toil.config.defaultCores, "ramMin": toil.config.defaultMemory / (2**20), "outdirMin": toil.config.defaultDisk / (2**20), "tmpdirMin": 0 }] loading_context.construct_tool_object = toil_make_tool loading_context.resolver = cwltool.resolver.tool_resolver loading_context.strict = not options.not_strict options.workflow = options.cwltool options.job_order = options.cwljob uri, tool_file_uri = cwltool.load_tool.resolve_tool_uri( options.cwltool, loading_context.resolver, loading_context.fetcher_constructor) options.tool_help = None options.debug = options.logLevel == "DEBUG" job_order_object, options.basedir, jobloader = \ cwltool.main.load_job_order( options, sys.stdin, loading_context.fetcher_constructor, loading_context.overrides_list, tool_file_uri) document_loader, workflowobj, uri = \ cwltool.load_tool.fetch_document( uri, loading_context.resolver, loading_context.fetcher_constructor) document_loader, avsc_names, processobj, metadata, uri = \ cwltool.load_tool.validate_document( document_loader, workflowobj, uri, loading_context.enable_dev, loading_context.strict, False, loading_context.fetcher_constructor, False, loading_context.overrides_list, do_validate=loading_context.do_validate) loading_context.overrides_list.extend( metadata.get("cwltool:overrides", [])) try: tool = cwltool.load_tool.make_tool( document_loader, avsc_names, metadata, uri, loading_context) except cwltool.process.UnsupportedRequirement as err: logging.error(err) return 33 runtime_context.secret_store = SecretStore() initialized_job_order = cwltool.main.init_job_order( job_order_object, options, tool, jobloader, sys.stdout, secret_store=runtime_context.secret_store) fs_access = cwltool.stdfsaccess.StdFsAccess(options.basedir) fill_in_defaults( tool.tool["inputs"], initialized_job_order, fs_access) def path_to_loc(obj): if "location" not in obj and "path" in obj: obj["location"] = obj["path"] del obj["path"] def import_files(tool): visit_class(tool, ("File", 
"Directory"), path_to_loc) visit_class(tool, ("File", ), functools.partial( add_sizes, fs_access)) normalizeFilesDirs(tool) adjustDirObjs(tool, functools.partial( get_listing, fs_access, recursive=True)) adjustFileObjs(tool, functools.partial( uploadFile, toil.importFile, fileindex, existing, skip_broken=True)) tool.visit(import_files) for inp in tool.tool["inputs"]: def set_secondary(fileobj): if isinstance(fileobj, Mapping) \ and fileobj.get("class") == "File": if "secondaryFiles" not in fileobj: fileobj["secondaryFiles"] = [ {"location": cwltool.builder.substitute( fileobj["location"], sf), "class": "File"} for sf in inp["secondaryFiles"]] if isinstance(fileobj, MutableSequence): for entry in fileobj: set_secondary(entry) if shortname(inp["id"]) in initialized_job_order \ and inp.get("secondaryFiles"): set_secondary(initialized_job_order[shortname(inp["id"])]) import_files(initialized_job_order) visitSteps(tool, import_files) try: runtime_context.use_container = use_container runtime_context.tmpdir = os.path.realpath(tmpdir_prefix) runtime_context.tmp_outdir_prefix = os.path.realpath( tmp_outdir_prefix) runtime_context.job_script_provider = job_script_provider runtime_context.force_docker_pull = options.force_docker_pull runtime_context.no_match_user = options.no_match_user (wf1, _) = makeJob(tool, {}, None, runtime_context) except cwltool.process.UnsupportedRequirement as err: logging.error(err) return 33 wf1.cwljob = initialized_job_order if wf1 is CWLJob: # Clean up temporary directories only created with CWLJobs. wf1.addFollowOnFn(cleanTempDirs, wf1) outobj = toil.start(wf1) outobj = resolve_indirect(outobj) # Stage files. Specify destination bucket if specified in CLI # options. If destination bucket not passed in, # options.destBucket's value will be None. toilStageFiles( toil, outobj, outdir, fileindex, existing, export=True, destBucket=options.destBucket) if not options.destBucket: visit_class(outobj, ("File",), functools.partial( compute_checksums, cwltool.stdfsaccess.StdFsAccess(""))) visit_class(outobj, ("File", ), MutationManager().unset_generation) stdout.write(json.dumps(outobj, indent=4)) return 0
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of a job store that holds the information about the " "workflow whose status is to be reported on." + jobStoreLocatorHelp) parser.add_argument("--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%(default)s", default=False) parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for Toil") assert options.jobStore is not None config = Config() config.setOptions(options) ########################################## #Survey the status of the job and report. ########################################## jobStore = Toil.resumeJobStore(config.jobStore) try: rootJob = jobStore.loadRootJob() except JobException: print('The root job of the job store is absent, the workflow completed successfully.', file=sys.stderr) sys.exit(0) def traverseGraph(jobGraph): foundJobStoreIDs = set() totalJobs = [] def inner(jobGraph): if jobGraph.jobStoreID in foundJobStoreIDs: return foundJobStoreIDs.add(jobGraph.jobStoreID) totalJobs.append(jobGraph) # Traverse jobs in stack for jobs in jobGraph.stack: for successorJobStoreID in [x.jobStoreID for x in jobs]: if (successorJobStoreID not in foundJobStoreIDs and jobStore.exists(successorJobStoreID)): inner(jobStore.load(successorJobStoreID)) # Traverse service jobs for jobs in jobGraph.services: for serviceJobStoreID in [x.jobStoreID for x in jobs]: if jobStore.exists(serviceJobStoreID): assert serviceJobStoreID not in foundJobStoreIDs foundJobStoreIDs.add(serviceJobStoreID) totalJobs.append(jobStore.load(serviceJobStoreID)) inner(jobGraph) return totalJobs logger.info('Traversing the job graph. This may take a couple minutes.') totalJobs = traverseGraph(rootJob) failedJobs = [] hasChildren = [] hasServices = [] services = [] currentlyRunnning = [] for job in totalJobs: if job.logJobStoreFileID is not None: failedJobs.append(job) if job.stack: hasChildren.append(job) elif job.remainingRetryCount != 0 and job.logJobStoreFileID != 0 and job.command: # The job has no children, hasn't failed, and has a command to run. This indicates that the job is # likely currently running, or at least could be run. currentlyRunnning.append(job) if job.services: hasServices.append(job) if job.startJobStoreID or job.terminateJobStoreID or job.errorJobStoreID: # these attributes are only set in service jobs services.append(job) logger.info('There are %i unfinished jobs, %i parent jobs with children, %i jobs with services, %i services, ' 'and %i totally failed jobs currently in %s.' 
% (len(totalJobs), len(hasChildren), len(hasServices), len(services), len(failedJobs), config.jobStore)) if currentlyRunnning: logger.info('These %i jobs are currently active: %s', len(currentlyRunnning), ' \n'.join(map(str, currentlyRunnning))) if options.verbose: #Verbose currently means outputting the files that have failed. if failedJobs: msg = "Outputting logs for the %i failed jobs" % (len(failedJobs)) msg += ": %s" % ", ".join((str(failedJob) for failedJob in failedJobs)) for jobNode in failedJobs: job = jobStore.load(jobNode.jobStoreID) msg += "\n=========> Failed job %s \n" % jobNode with job.getLogFileHandle(jobStore) as fH: msg += fH.read() msg += "<=========\n" print(msg) else: print('There are no failed jobs to report.', file=sys.stderr) if totalJobs and options.failIfNotComplete: exit(1) # when the workflow is complete, all jobs will have been removed from job store
def testClusterScalingWithPreemptableJobs(self): """ Test scaling simultaneously for a batch of preemptable and non-preemptable jobs. """ config = Config() # Make defaults dummy values config.defaultMemory = 1 config.defaultCores = 1 config.defaultDisk = 1 # Preemptable node parameters config.nodeType = Shape(20, 10, 10, 10) config.minNodes = 0 config.maxNodes = 10 # Preemptable node parameters config.preemptableNodeType = Shape(20, 10, 10, 10) config.minPreemptableNodes = 0 config.maxPreemptableNodes = 10 # Algorithm parameters config.alphaPacking = 0.8 config.betaInertia = 1.2 config.scaleInterval = 3 self._testClusterScaling(config, numJobs=100, numPreemptableJobs=100)
def testClusterScaling(self): """ Test scaling for a batch of non-preemptable jobs and no preemptable jobs (makes debugging easier). """ config = Config() # Make defaults dummy values config.defaultMemory = 1 config.defaultCores = 1 config.defaultDisk = 1 # No preemptable nodes/jobs config.maxPreemptableNodes = 0 # No preemptable nodes # Non-preemptable parameters config.nodeType = Shape(20, 10, 10, 10) config.minNodes = 0 config.maxNodes = 10 # Algorithm parameters config.alphaPacking = 0.8 config.betaInertia = 1.2 config.scaleInterval = 3 self._testClusterScaling(config, numJobs=100, numPreemptableJobs=0)
def main(): """Reports the state of the toil. """ ########################################## #Construct the arguments. ########################################## parser = getBasicOptionParser() parser.add_argument("jobStore", type=str, help="The location of a job store that holds the information about the " "workflow whose status is to be reported on." + jobStoreLocatorHelp) parser.add_argument("--verbose", dest="verbose", action="store_true", help="Print loads of information, particularly all the log files of \ jobs that failed. default=%(default)s", default=False) parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true", help="Return exit value of 1 if toil jobs not all completed. default=%(default)s", default=False) parser.add_argument("--version", action='version', version=version) options = parseBasicOptions(parser) logger.info("Parsed arguments") if len(sys.argv) == 1: parser.print_help() sys.exit(0) ########################################## #Do some checks. ########################################## logger.info("Checking if we have files for Toil") assert options.jobStore is not None config = Config() config.setOptions(options) ########################################## #Survey the status of the job and report. ########################################## jobStore = Toil.resumeJobStore(config.jobStore) try: rootJob = jobStore.loadRootJob() except JobException: print('The root job of the job store is absent, the workflow completed successfully.', file=sys.stderr) sys.exit(0) toilState = ToilState(jobStore, rootJob ) # The first element of the toilState.updatedJobs tuple is the jobGraph we want to inspect totalJobs = set(toilState.successorCounts.keys()) | \ {jobTuple[0] for jobTuple in toilState.updatedJobs} failedJobs = [ job for job in totalJobs if job.remainingRetryCount == 0 ] print('There are %i active jobs, %i parent jobs with children, and %i totally failed jobs ' 'currently in %s.' % (len(toilState.updatedJobs), len(toilState.successorCounts), len(failedJobs), config.jobStore), file=sys.stderr) if options.verbose: #Verbose currently means outputting the files that have failed. for job in failedJobs: if job.logJobStoreFileID is not None: with job.getLogFileHandle(jobStore) as logFileHandle: logStream(logFileHandle, job.jobStoreID, logger.warn) else: print('Log file for job %s is absent.' % job.jobStoreID, file=sys.stderr) if len(failedJobs) == 0: print('There are no failed jobs to report.', file=sys.stderr) if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \ options.failIfNotComplete: sys.exit(1)
def _createConfig(self): return Config()
def testClusterScaling(self): """ Test scaling for a batch of non-preemptable jobs and no preemptable jobs (makes debugging easier). """ config = Config() # Make defaults dummy values config.defaultMemory = 1 config.defaultCores = 1 config.defaultDisk = 1 # No preemptable nodes/jobs config.maxPreemptableNodes = [] # No preemptable nodes # Non-preemptable parameters config.nodeTypes = [Shape(20, 10, 10, 10, False)] config.minNodes = [0] config.maxNodes = [10] # Algorithm parameters config.targetTime = defaultTargetTime config.betaInertia = 0.1 config.scaleInterval = 3 self._testClusterScaling(config, numJobs=100, numPreemptableJobs=0, jobShape=config.nodeTypes[0])