def main():
    """Kill the batch-system jobs of the Toil workflow kept in a job store.

    Reads the job store locator from the command line, loads the job store,
    creates a batch system from the stored config and asks that batch system
    to kill every job it reports as issued.
    """
    parser = getBasicOptionParser()
    parser.add_argument(
        "jobStore",
        type=str,
        help=(
            "Store in which to place job management files "
            "and the global accessed temporary files "
            "(If this is a file path this needs to be globally accessible "
            "by all machines running jobs).\n"
            "If the store already exists and restart is false an"
            " ExistingJobStoreException exception will be thrown."
        ),
    )
    parser.add_argument("--version", action="version", version=version)
    options = parseBasicOptions(parser)

    jobStore = Toil.loadOrCreateJobStore(options.jobStore)

    logger.info("Starting routine to kill running jobs in the toil workflow: %s",
                options.jobStore)
    # NOTE: the original author flagged this behaviour as "now broken".
    # Creating the batch system should automatically kill the existing jobs...
    batchSystem = Toil.createBatchSystem(jobStore.config)
    # ...but kill them explicitly just in case. killBatchJobs() takes a list
    # of job IDs, so each ID is wrapped rather than passed bare.
    for jobID in batchSystem.getIssuedBatchJobIDs():
        batchSystem.killBatchJobs([jobID])
    logger.info("All jobs SHOULD have been killed")
class JobGraphTest(ToilTest):
    """Checks the attributes and the equality behaviour of ``JobGraph``."""

    def setUp(self):
        """Create a Toil instance on a fresh test job store and enter it."""
        super(JobGraphTest, self).setUp()
        self.jobStorePath = self._getTestJobStorePath()
        parser = ArgumentParser()
        Job.Runner.addToilOptions(parser)
        options = parser.parse_args(args=[self.jobStorePath])
        self.toil = Toil(options)
        # The context manager is entered by hand; tearDown() exits it.
        self.assertEqual(self.toil, self.toil.__enter__())

    def tearDown(self):
        """Exit the Toil context, destroy the job store and check it is gone."""
        self.toil.__exit__(None, None, None)
        self.toil._jobStore.destroy()
        self.assertFalse(os.path.exists(self.jobStorePath))
        super(JobGraphTest, self).tearDown()

    def testJob(self):
        """
        Tests functions of a job.
        """
        command = "by your command"
        # 2 ** 32, not 2 ^ 32: "^" is bitwise XOR in Python and yields 34.
        memory = 2 ** 32
        disk = 2 ** 32
        cores = 1
        preemptable = 1
        jobStoreID = 100
        remainingRetryCount = 5
        predecessorNumber = 0

        def makeJobGraph():
            # Both instances under test are built from identical arguments.
            return JobGraph(command=command, memory=memory, cores=cores, disk=disk,
                            preemptable=preemptable, jobStoreID=jobStoreID,
                            remainingRetryCount=remainingRetryCount,
                            predecessorNumber=predecessorNumber,
                            jobName='testJobGraph', unitName='noName')

        j = makeJobGraph()

        # Check attributes
        self.assertEqual(j.command, command)
        self.assertEqual(j.memory, memory)
        self.assertEqual(j.disk, disk)
        self.assertEqual(j.cores, cores)
        self.assertEqual(j.preemptable, preemptable)
        self.assertEqual(j.jobStoreID, jobStoreID)
        self.assertEqual(j.remainingRetryCount, remainingRetryCount)
        self.assertEqual(j.predecessorNumber, predecessorNumber)
        self.assertEqual(j.stack, [])
        self.assertEqual(j.predecessorsFinished, set())
        self.assertEqual(j.logJobStoreFileID, None)

        # Check equals function
        j2 = makeJobGraph()
        self.assertEqual(j, j2)

        # Change an attribute and check not equal
        j.predecessorsFinished = {"1", "2"}
        self.assertNotEqual(j, j2)
def _importExportFile(self, options, fail):
    """
    Run (or restart) the RestartingJob workflow and check the exported file.

    On a fresh run a source file containing 'Hello' is imported, the ``fail``
    flag is written to a job store file whose ID is remembered on ``self``,
    and the workflow is started. On a restart that flag file is rewritten to
    'False' before restarting. Either way the result is exported to
    ``self.dstFile``, which must then read "HelloWorld!".
    """
    with Toil(options) as toil:
        if options.restart:
            # Overwrite the stored flag with 'False', then resume the workflow.
            with toil._jobStore.updateFileStream(self.failFileID) as flagStream:
                flagStream.write('False')
            outputFileID = toil.restart()
        else:
            srcFile = '%s/%s%s' % (self._tempDir, 'in', uuid.uuid4())
            with open(srcFile, 'w') as srcHandle:
                srcHandle.write('Hello')
            inputFileID = toil.importFile('file://' + srcFile)

            # Make sure that importFile returns the fileID wrapper
            self.assertIsInstance(inputFileID, FileID)
            self.assertEqual(os.stat(srcFile).st_size, inputFileID.size)

            # Write a boolean that determines whether the job fails.
            with toil._jobStore.writeFileStream() as (flagStream, failFileID):
                self.failFileID = failFileID
                flagStream.write(str(fail))

            rootJob = RestartingJob(inputFileID, self.failFileID)
            outputFileID = toil.start(rootJob)

        toil.exportFile(outputFileID, 'file://' + self.dstFile)
        with open(self.dstFile, 'r') as exported:
            assert exported.read() == "HelloWorld!"
def main():
    """Removes the JobStore from a toil run.

    Parses the job store locator from the command line, tries to load the job
    store and deletes it. If loading raises ``JobStoreCreationException`` the
    store is treated as already gone and the process exits with status 0.
    """
    ##########################################
    # Construct the arguments.
    ##########################################
    parser = getBasicOptionParser()
    # The help text is built from adjacent literals only. A backslash
    # continuation inside a literal would drag the source indentation into
    # the text shown to the user.
    parser.add_argument("jobStore", type=str,
                        help=("Store in which to place job management files "
                              "and the global accessed temporary files "
                              "(If this is a file path this needs to be globally accessible "
                              "by all machines running jobs).\n"
                              "If the store already exists and restart is false an"
                              " ExistingJobStoreException exception will be thrown."))
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    ##########################################
    # Survey the status of the job and report.
    ##########################################
    logger.info("Checking if we have files for toil")
    try:
        jobStore = Toil.loadOrCreateJobStore(options.jobStore)
    except JobStoreCreationException:
        logger.info("The specified JobStore does not exist, it may have already been deleted")
        sys.exit(0)

    logger.info("Deleting the JobStore")
    jobStore.deleteJobStore()
def userScript():
    # Runs a one-job workflow whose job returns an instance of a class
    # defined in this script, then checks which class the returned object has.
    from toil.job import Job
    from toil.common import Toil

    # A user-defined type, i.e. a type defined in the user script
    class X(object):
        pass

    # The job function simply hands its argument back as the return value.
    # noinspection PyUnusedLocal
    def job(job, x, disk='10M', cores=1, memory='10M'):
        return x

    if __name__ == '__main__':
        options = Job.Runner.getDefaultArgumentParser().parse_args()
        x = X()
        with Toil(options) as toil:
            r = toil.start(Job.wrapJobFn(job, x).encapsulate())
        # Assert that the return value is of type X, but not X from the __main__
        # module but X from foo.bar, the canonical name for the user module. The
        # translation from __main__ to foo.bar is a side effect of hot-deployment.
        assert r.__class__ is not X
        import foo.bar
        assert r.__class__ is foo.bar.X
        # Assert that a copy was made. This is a side effect of pickling/unpickling.
        assert x is not r
def main(argv=None):
    """
    Entry point of the worker: validate the arguments and run the worker.

    :param argv: argument vector of the form
           ``[program, JOB_NAME, JOB_STORE_LOCATOR, JOB_STORE_ID]``;
           ``sys.argv`` is used when it is None.

    Exits with status 1, after writing a message to stderr, when fewer than
    four arguments are supplied.
    """
    args = sys.argv if argv is None else argv

    # Do a little argument validation, in case someone tries to run us manually.
    if len(args) < 1:
        sys.stderr.write("Error: Toil worker invoked without its own name\n")
        sys.exit(1)
    if len(args) < 4:
        sys.stderr.write("Error: usage: %s JOB_NAME JOB_STORE_LOCATOR JOB_STORE_ID\n" % args[0])
        sys.exit(1)

    # Parse input args
    jobName, jobStoreLocator, jobStoreID = args[1], args[2], args[3]

    ##########################################
    # Load the jobStore/config file
    ##########################################
    jobStore = Toil.resumeJobStore(jobStoreLocator)

    # Call the worker
    workerScript(jobStore, jobStore.config, jobName, jobStoreID)
def setUp(self):
    """Create a Toil instance on a fresh test job store and enter its context."""
    super(JobWrapperTest, self).setUp()
    self.jobStorePath = self._getTestJobStorePath()
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    options = parser.parse_args(args=[self.jobStorePath])
    self.toil = Toil(options)
    # assertEqual replaces the deprecated assertEquals alias; the context
    # manager is entered by hand here rather than through a with-statement.
    self.assertEqual(self.toil, self.toil.__enter__())
def main():
    """Kill the batch-system jobs of the Toil workflow kept in a job store.

    Resumes the job store named on the command line, creates a batch system
    from its config and asks that batch system to kill every job it reports
    as issued.
    """
    parser = getBasicOptionParser()
    parser.add_argument("jobStore", type=str,
                        help="The location of the job store used by the workflow whose jobs should "
                             "be killed." + jobStoreLocatorHelp)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)

    jobStore = Toil.resumeJobStore(options.jobStore)

    logger.info("Starting routine to kill running jobs in the toil workflow: %s",
                options.jobStore)
    # NOTE: the original author flagged this behaviour as "now broken".
    # Creating the batch system should automatically kill the existing jobs...
    batchSystem = Toil.createBatchSystem(jobStore.config)
    # ...but kill them explicitly just in case. killBatchJobs() takes a list
    # of job IDs, so each ID is wrapped rather than passed bare.
    for jobID in batchSystem.getIssuedBatchJobIDs():
        batchSystem.killBatchJobs([jobID])
    logger.info("All jobs SHOULD have been killed")
def hints_db(hints_args, toil_options):
    """
    Entry point for hints database Toil pipeline.

    On a fresh run the BAM, IsoSeq BAM, annotation and protein inputs named in
    ``hints_args`` are imported into the job store and ``setup_hints`` is
    started; on a restart the previous run is resumed. The resulting file is
    exported to ``hints_args.hints_path``.

    :param hints_args: arguments object; the attributes read here are
           ``fasta``, ``genome``, ``cfg``, ``annotation_gp``,
           ``protein_fasta`` and ``hints_path``
    :param toil_options: options object handed to ``Toil``
    """
    def validate_import_bam(t, bam_path, fasta_sequences, genome):
        # Validate the BAM against the genome sequences, then import the BAM
        # together with its .bai index.
        validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
        return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
                FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]

    fasta = pyfasta.Fasta(hints_args.fasta)
    # (first word of the header, sequence length) for every record
    fasta_sequences = {(x.split()[0], len(fasta[x])) for x in fasta.keys()}
    with Toil(toil_options) as t:
        if not t.options.restart:
            genome = hints_args.genome

            # load the RNA-seq data, if we have any
            bam_file_ids = {'BAM': {}, 'INTRONBAM': {}}
            for dtype in ['BAM', 'INTRONBAM']:
                if genome not in hints_args.cfg[dtype]:
                    continue
                for bam_path in hints_args.cfg[dtype][genome]:
                    bam_file_ids[dtype][os.path.basename(bam_path)] = validate_import_bam(
                        t, bam_path, fasta_sequences, genome)

            # load the IsoSeq data, if we have any
            iso_seq_file_ids = []
            if genome in hints_args.cfg['ISO_SEQ_BAM']:
                for bam_path in hints_args.cfg['ISO_SEQ_BAM'][genome]:
                    # validate_import_bam() already validates; no second pass needed
                    iso_seq_file_ids.append(
                        validate_import_bam(t, bam_path, fasta_sequences, genome))

            if hints_args.annotation_gp is None:
                annotation_file_id = None
            else:
                annotation_file_id = FileID.forPath(
                    t.importFile('file://' + hints_args.annotation_gp), hints_args.annotation_gp)

            # The genome FASTA is only imported alongside a protein FASTA.
            if hints_args.protein_fasta is None:
                protein_fasta_file_id = genome_fasta_file_id = None
            else:
                protein_fasta_file_id = FileID.forPath(
                    t.importFile('file://' + hints_args.protein_fasta), hints_args.protein_fasta)
                genome_fasta_file_id = FileID.forPath(
                    t.importFile('file://' + hints_args.fasta), hints_args.fasta)

            input_file_ids = {'bams': bam_file_ids,
                              'iso_seq_bams': iso_seq_file_ids,
                              'annotation': annotation_file_id,
                              'protein_fasta': protein_fasta_file_id,
                              'genome_fasta': genome_fasta_file_id}

            # bam_file_ids always has its two type keys, so look at the
            # per-type contents to decide whether any BAM was actually loaded.
            if any(bam_file_ids.values()) or iso_seq_file_ids:
                logger.info('All BAMs validated for {}. Beginning Toil hints pipeline'.format(genome))

            disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids)
            job = Job.wrapJobFn(setup_hints, input_file_ids, disk=disk_usage)
            combined_hints = t.start(job)
        else:
            logger.info('Restarting Toil hints pipeline for {}.'.format(hints_args.genome))
            combined_hints = t.restart()
        tools.fileOps.ensure_file_dir(hints_args.hints_path)
        t.exportFile(combined_hints, 'file://' + hints_args.hints_path)
def _getResultsFileName(self, toilPath):
    """
    Build the path of the ``results.txt`` file for a job store locator.

    The locator is split with ``Toil.parseLocator``; its first component must
    be "file" (checked with an assert) and the second is used as the
    directory. Per the original note, GridEngine, slurm and LSF rely on this.
    """
    jobStoreKind, jobStoreDir = Toil.parseLocator(toilPath)
    assert jobStoreKind == "file"
    resultsFile = os.path.join(jobStoreDir, "results.txt")
    return resultsFile
def test_download_url(self):
    """Run ``download_url`` as a Toil job and check the named file appears in tmpdir."""
    from toil_lib.urls import download_url
    downloadJob = Job.wrapJobFn(download_url,
                                work_dir=self.tmpdir,
                                url='www.google.com',
                                name='testy')
    with Toil(self.options) as workflow:
        workflow.start(downloadJob)
    expectedPath = os.path.join(self.tmpdir, 'testy')
    assert os.path.exists(expectedPath)
def toil_jobstore_info(jobstore: str) -> dict:
    """Return the processed stats of a toil jobstore, or ``{}`` if it does not exist.

    The jobstore is resumed with ``Toil.resumeJobStore``; a
    ``NoSuchJobStoreException`` yields an empty dict. Otherwise the stats from
    ``getStats`` are passed through ``processData`` together with the config.
    """
    try:
        store = Toil.resumeJobStore(jobstore)
    except NoSuchJobStoreException:
        return {}

    collected = getStats(store)
    return processData(store.config, collected)
def main():
    """Delete the job store whose locator is given on the command line."""
    argParser = getBasicOptionParser()
    argParser.add_argument(
        "jobStore",
        type=str,
        help="The location of the job store to delete. " + jobStoreLocatorHelp,
    )
    argParser.add_argument("--version", action='version', version=version)
    parsed = parseBasicOptions(argParser)

    logger.info("Attempting to delete the job store")
    Toil.getJobStore(parsed.jobStore).destroy()
    logger.info("Successfully deleted the job store")
def main():
    """Kill a running Toil workflow identified by its job store.

    A locator that still contains ':' after an optional leading 'file:' has
    been stripped is handled through the batch system: every issued job is
    killed. Otherwise the locator is taken as a directory and the process ID
    stored in its ``pid.log`` is sent SIGKILL.

    :raises OSError: re-raised when the recorded process cannot be signalled.
    """
    parser = getBasicOptionParser()
    parser.add_argument(
        "jobStore", type=str,
        help="The location of the job store used by the workflow whose jobs should "
             "be killed." + jobStoreLocatorHelp)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    config = Config()
    config.setOptions(options)
    # Drop a leading 'file:' so that the ':' test below only matches other
    # locator kinds.
    config.jobStore = config.jobStore[5:] if config.jobStore.startswith('file:') else config.jobStore

    # ':' means an aws/google jobstore; use the old (broken?) method
    if ':' in config.jobStore:
        jobStore = Toil.resumeJobStore(config.jobStore)
        logger.info("Starting routine to kill running jobs in the toil workflow: %s",
                    config.jobStore)
        # TODO: This behaviour is now broken src: https://github.com/DataBiosphere/toil/commit/a3d65fc8925712221e4cda116d1825d4a1e963a1
        # Creating the batch system should automatically kill existing jobs...
        batchSystem = Toil.createBatchSystem(jobStore.config)
        # ...but kill them explicitly just in case. killBatchJobs() takes a
        # list of job IDs, so each ID is wrapped rather than passed bare.
        for jobID in batchSystem.getIssuedBatchJobIDs():
            batchSystem.killBatchJobs([jobID])
        logger.info("All jobs SHOULD have been killed")
    # otherwise, kill the pid recorded in the jobstore
    else:
        pid_log = os.path.join(os.path.abspath(config.jobStore), 'pid.log')
        with open(pid_log, 'r') as f:
            pid2kill = f.read().strip()
        try:
            os.kill(int(pid2kill), signal.SIGKILL)
            logger.info("Toil process %s successfully terminated.", pid2kill)
        except OSError:
            logger.error("Toil process %s could not be terminated.", pid2kill)
            raise
def testExportAfterFailedExport(self):
    """Check that a restart can still export the result after a failed first export.

    The first run raises right after ``toil.start`` returns, simulating a
    crash before the export. The restarted run must hand back a file ID whose
    exported content is "Hello, World!".
    """
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    try:
        with Toil(options) as toil:
            _ = toil.start(HelloWorld())
            # oh no, an error! :(
            raise RuntimeError("we died after workflow completion but before our export finished")
    except RuntimeError:
        pass

    options.restart = True
    with Toil(options) as toil:
        fileID = toil.restart()
        # Function-call form: valid on Python 3 and, with a single argument,
        # prints the same thing on Python 2.
        print(fileID)
        # Hopefully the error didn't cause us to lose all our work!
        toil.exportFile(fileID, 'file://' + self.exportPath)
    with open(self.exportPath) as f:
        # The file should have all our content
        self.assertEqual(f.read(), "Hello, World!")
def __init__(self, jobStoreName, specifiedJobs=None):
    """
    Resume the named job store and decide which jobs to report on.

    :param jobStoreName: locator passed to ``Toil.resumeJobStore``
    :param specifiedJobs: when given, handed to ``fetchUserJobs``; when None
           the whole graph is traversed starting from the root job.
    """
    self.jobStoreName = jobStoreName
    self.jobStore = Toil.resumeJobStore(jobStoreName)

    if specifiedJobs is not None:
        self.jobsToReport = self.fetchUserJobs(specifiedJobs)
    else:
        root = self.fetchRootJob()
        logger.info('Traversing the job graph gathering jobs. This may take a couple of minutes.')
        self.jobsToReport = self.traverseJobGraph(root)
def test_upload_and_download_with_encryption(self):
    """Upload a random file to S3 with a key file, download it and compare.

    A 32-byte random key and a 1024-byte random payload are written to the
    temp dir. The payload is uploaded by an ``s3am_upload`` job and fetched by
    a ``download_url`` job, both given the key path. The S3 key is deleted
    afterwards regardless of the outcome.
    """
    from toil_lib.urls import s3am_upload
    from toil_lib.urls import download_url
    from boto.s3.connection import S3Connection, Bucket, Key

    # Create temporary encryption key: 32 random bytes, written directly
    # instead of shelling out to `dd if=/dev/urandom bs=1 count=32`.
    key_path = os.path.join(self.tmpdir, 'foo.key')
    with open(key_path, 'wb') as key_file:
        key_file.write(os.urandom(32))

    # Create test file
    upload_fpath = os.path.join(self.tmpdir, 'upload_file')
    with open(upload_fpath, 'wb') as fout:
        fout.write(os.urandom(1024))

    # Upload file
    random_key = os.path.join('test/', str(uuid4()), 'upload_file')
    s3_url = os.path.join('s3://cgl-driver-projects/', random_key)
    try:
        s3_dir = os.path.split(s3_url)[0]
        A = Job.wrapJobFn(s3am_upload, fpath=upload_fpath, s3_dir=s3_dir, s3_key_path=key_path)
        with Toil(self.options) as toil:
            toil.start(A)

        # Download the file
        B = Job.wrapJobFn(download_url, url=s3_url, name='download_file',
                          work_dir=self.tmpdir, s3_key_path=key_path)
        with Toil(self.options) as toil:
            toil.start(B)

        download_fpath = os.path.join(self.tmpdir, 'download_file')
        assert os.path.exists(download_fpath)
        assert filecmp.cmp(upload_fpath, download_fpath)
    finally:
        # Delete the Key. Key deletion never fails so we don't need to catch any exceptions
        with closing(S3Connection()) as conn:
            b = Bucket(conn, 'cgl-driver-projects')
            k = Key(b)
            k.key = random_key
            k.delete()
def main():
    """Parse the CLI and manifest, then start or restart the Toil run."""
    args = cli()
    samples = parse_manifest(args.manifest)

    # Start Toil run
    with Toil(args) as workflow:
        if workflow.options.restart:
            workflow.restart()
        else:
            rootJob = Job.wrapJobFn(map_job, run_outlier_model, samples, args)
            workflow.start(rootJob)
def testResourceRequirements(self):
    """
    Run the trivial ``checkRequirements`` job with an explicit memory
    requirement, so that default and user specified resource requirements
    can be checked by that job.
    """
    jobStoreDir = self._createTempDir() + '/jobStore'
    options = Job.Runner.getDefaultOptions(jobStoreDir)
    options.clean = 'always'
    options.logLevel = 'debug'
    with Toil(options) as workflow:
        rootJob = Job.wrapJobFn(checkRequirements, memory='1000M')
        workflow.start(rootJob)
def testTempDir(self):
    """
    Run ``sillyTestJob`` with a ``seriousTestJob`` child, both given the same
    message, to exercise job.tempDir and job.log.
    """
    message = "I love rachael price"

    jobStorePath = self._getTestJobStorePath()
    options = Job.Runner.getDefaultOptions(jobStorePath)
    with Toil(options) as workflow:
        rootJob = sillyTestJob(message)
        rootJob.addChildJobFn(seriousTestJob, message)
        workflow.start(rootJob)
def main():
    """Run (or restart) the FASTA-splitting workflow and export its result.

    On a fresh run the input FASTA is imported and ``split_fasta_job`` is
    started; on a restart the previous run is resumed. The result is exported
    to the absolute path of ``opts.output``.
    """
    opts = parse_args()
    with Toil(opts) as workflow:
        if not opts.restart:
            fastaURL = 'file://' + os.path.abspath(opts.input_fasta)
            input_fasta_id = workflow.importFile(fastaURL)
            rootJob = Job.wrapJobFn(split_fasta_job, input_fasta_id, opts)
            result_id = workflow.start(rootJob)
        else:
            result_id = workflow.restart()

        outputURL = 'file://' + os.path.abspath(opts.output)
        workflow.exportFile(result_id, outputURL)
def main():
    """
    Reports stats on the workflow, use with --stats option to toil.

    Parses and checks the options, loads the job store, gathers the stats,
    collates them with the stored config and reports the result.
    """
    argParser = getBasicOptionParser()
    initializeOptions(argParser)
    options = parseBasicOptions(argParser)
    checkOptions(options, argParser)

    store = Toil.loadOrCreateJobStore(options.jobStore)
    rawStats = getStats(options)
    collated = processData(store.config, rawStats, options)
    reportData(collated, options)
def main():
    """Delete the job store named on the command line.

    The parsed options are loaded into a ``Config`` and the job store locator
    is read back from that config.
    """
    argParser = getBasicOptionParser()
    argParser.add_argument(
        "jobStore",
        type=str,
        help="The location of the job store to delete. " + jobStoreLocatorHelp,
    )
    argParser.add_argument("--version", action='version', version=version)

    config = Config()
    parsed = parseBasicOptions(argParser)
    config.setOptions(parsed)

    logger.info("Attempting to delete the job store")
    store = Toil.getJobStore(config.jobStore)
    store.destroy()
    logger.info("Successfully deleted the job store")
def testImportSharedFileName(self):
    """Import a local file under a shared file name and read it back.

    Writes 'some data' to a temp file, imports it with ``sharedFileName`` set
    and checks that the shared file stream of the job store yields the same
    text.
    """
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.logLevel = "INFO"

    sharedFileName = 'someSharedFile'
    with Toil(options) as toil:
        srcFile = '%s/%s%s' % (self._tempDir, 'in', uuid.uuid4())
        with open(srcFile, 'w') as f:
            f.write('some data')
        toil.importFile('file://' + srcFile, sharedFileName=sharedFileName)
        with toil._jobStore.readSharedFileStream(sharedFileName) as f:
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(f.read().decode('utf-8'), 'some data')
def userScript():
    # Builds a small workflow in which one job registers a deferred function
    # and two jobs coordinate through the existence of two job store files.
    import time
    import logging
    from toil.job import Job
    from toil.common import Toil

    log = logging.getLogger(__name__)

    def root(rootJob):
        # Wires up the job graph: `deferring` as a child of the root, an
        # encapsulated job with two `dummy` children, and `last` as a child
        # of the encapsulating job.
        def nullFile():
            # Imports /dev/null into the job store and returns the result.
            return rootJob.fileStore.jobStore.importFile('file:///dev/null')

        startFile = nullFile()
        endFile = nullFile()
        rootJob.addChildJobFn(deferring, startFile, endFile)
        encapsulatedJob = Job.wrapJobFn(encapsulated, startFile)
        encapsulatedJob.addChildFn(dummy)
        encapsulatedJob.addChildFn(dummy)
        encapsulatingJob = encapsulatedJob.encapsulate()
        rootJob.addChild(encapsulatingJob)
        encapsulatingJob.addChildJobFn(last, endFile)

    def dummy():
        pass

    def deferred():
        pass

    # Registers `deferred`, deletes startFile, then polls once per second
    # until endFile no longer exists; the assert fails after 10 seconds.
    # noinspection PyUnusedLocal
    def deferring(job, startFile, endFile):
        job.defer(deferred)
        job.fileStore.jobStore.deleteFile(startFile)
        timeout = time.time() + 10
        while job.fileStore.jobStore.fileExists(endFile):
            assert time.time() < timeout
            time.sleep(1)

    # Polls once per second until startFile no longer exists; the assert
    # fails after 10 seconds.
    def encapsulated(job, startFile):
        timeout = time.time() + 10
        while job.fileStore.jobStore.fileExists(startFile):
            assert time.time() < timeout
            time.sleep(1)

    # Deletes endFile, which is the condition `deferring` is polling on.
    def last(job, endFile):
        job.fileStore.jobStore.deleteFile(endFile)

    if __name__ == '__main__':
        options = Job.Runner.getDefaultArgumentParser().parse_args()
        with Toil(options) as toil:
            rootJob = Job.wrapJobFn(root)
            toil.start(rootJob)
def testDAGConsistency(self):
    """Starting the ``parent`` job must raise ``FailedJobsException``."""
    jobStoreDir = self._createTempDir() + '/jobStore'
    options = Job.Runner.getDefaultOptions(jobStoreDir)
    options.clean = 'always'
    rootJob = Job.wrapJobFn(parent)
    with Toil(options) as workflow:
        try:
            workflow.start(rootJob)
        except FailedJobsException:
            # we expect this exception to be raised
            return
        # Reaching this point means the workflow did not fail as expected.
        self.fail()
def workerCleanup(info):
    """
    Remove the workflow directory on batch system shutdown, if configured to.

    The directory is removed when ``info.cleanWorkDir`` is 'always', or when
    it is 'onSuccess' or 'onError' and the directory is empty. Also see
    :meth:`supportsWorkerCleanup`.

    :param WorkerCleanupInfo info: A named tuple consisting of all the relevant information
           for cleaning up the worker.
    """
    assert isinstance(info, WorkerCleanupInfo)
    workflowDir = Toil.getWorkflowDir(info.workflowID, info.workDir)

    policy = info.cleanWorkDir
    if policy == 'always':
        shouldRemove = True
    elif policy in ('onSuccess', 'onError'):
        # Only an empty workflow directory is removed under these policies.
        shouldRemove = os.listdir(workflowDir) == []
    else:
        shouldRemove = False

    if shouldRemove:
        shutil.rmtree(workflowDir)
def runCactusProgressive(options):
    """Run or restart the progressive cactus workflow and export the HAL file.

    On a fresh run a project is written to a new temp dir and read back, the
    input sequences and the cactus config are imported into the job store and
    ``RunCactusPreprocessorThenProgressiveDown`` is started. The result is
    exported to ``options.outputHal``.
    """
    with Toil(options) as toil:
        importSingularityImage()
        # Run the workflow
        if not options.restart:
            options.cactusDir = getTempDirectory()

            # Create the progressive cactus project
            ProjectWrapper(options).writeXml()

            alignmentName = ProjectWrapper.alignmentDirName
            pjPath = os.path.join(options.cactusDir, alignmentName,
                                  '%s_project.xml' % alignmentName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)

            # import the sequences
            for genome, seq in project.inputSequenceMap.items():
                if os.path.isdir(seq):
                    # A directory of sequence files is concatenated into one temp file.
                    tmpSeq = getTempFile()
                    parts = [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)]
                    catFiles(parts, tmpSeq)
                    seq = tmpSeq
                project.inputSequenceIDMap[genome] = toil.importFile(makeURL(seq))

            # import cactus config; an explicit config file takes precedence
            configSource = options.configFile if options.configFile else project.getConfigPath()
            cactusConfigID = toil.importFile(makeURL(configSource))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configWrapper = ConfigWrapper(ET.parse(project.getConfigPath()).getroot())
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            rootJob = RunCactusPreprocessorThenProgressiveDown(
                options, project, memory=configWrapper.getDefaultMemory())
            halID = toil.start(rootJob)
        else:
            halID = toil.restart()

        toil.exportFile(halID, makeURL(options.outputHal))
def setupBinaries(options):
    """Ensure that Cactus's C/C++ components are ready to run, and set up the environment.

    The binaries mode comes from ``options.binariesMode`` or, failing that,
    from the CACTUS_BINARIES_MODE environment variable (default "docker"),
    and is written back to that variable. "docker" and "local" check that the
    needed executables are on the PATH; "singularity" requires a file job
    store and records the image path in CACTUS_SINGULARITY_IMG.

    :raises RuntimeError: when a required executable is missing, or when
            singularity mode is used with a non-file job store.
    """
    if options.latest:
        os.environ["CACTUS_USE_LATEST"] = "1"

    # The command line wins; otherwise fall back to the environment, and
    # finally to Docker.
    mode = options.binariesMode
    if mode is None:
        mode = os.environ.get("CACTUS_BINARIES_MODE", "docker")
    os.environ["CACTUS_BINARIES_MODE"] = mode

    if mode == "docker":
        # Verify Docker exists on the target system
        from distutils.spawn import find_executable
        if find_executable('docker') is None:
            raise RuntimeError("The `docker` executable wasn't found on the "
                               "system. Please install Docker if possible, or "
                               "use --binariesMode local and add cactus's bin "
                               "directory to your PATH.")
        return

    if mode == "local":
        # Running without Docker: verify that we can find the Cactus executables
        from distutils.spawn import find_executable
        if find_executable('cactus_caf') is None:
            raise RuntimeError("Cactus isn't using Docker, but it can't find "
                               "the Cactus binaries. Please add Cactus's bin "
                               "directory to your PATH (and run `make` in the "
                               "Cactus directory if you haven't already).")
        if find_executable('ktserver') is None:
            raise RuntimeError("Cactus isn't using Docker, but it can't find "
                               "`ktserver`, the KyotoTycoon database server. "
                               "Please install KyotoTycoon "
                               "(https://github.com/alticelabs/kyoto) "
                               "and add the binary to your PATH, or use the "
                               "Docker mode.")
        return

    assert mode == "singularity"
    jobStoreType, locator = Toil.parseLocator(options.jobStore)
    if jobStoreType != "file":
        raise RuntimeError("Singularity mode is only supported when using the FileJobStore.")

    if options.containerImage:
        imgPath = os.path.abspath(options.containerImage)
        os.environ["CACTUS_USE_LOCAL_SINGULARITY_IMG"] = "1"
    elif 'SINGULARITY_CACHEDIR' in os.environ:
        # When SINGULARITY_CACHEDIR is set, singularity will refuse to store images in the current directory
        imgPath = os.path.join(os.environ['SINGULARITY_CACHEDIR'], "cactus.img")
    else:
        imgPath = os.path.join(os.path.abspath(locator), "cactus.img")
    os.environ["CACTUS_SINGULARITY_IMG"] = imgPath
def main():
    """Parse the command line and start the ``Setup`` job on the given HAL file.

    Positional arguments are the input HAL file and the output directory;
    Toil's own options and those from ``addOptions`` are accepted as well.
    """
    # The former `usage` local was never handed to the parser, so it is gone.
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument('halfile', help='input hal file')
    parser.add_argument('outputDirectory', help='output directory')
    addOptions(parser)

    options = parser.parse_args()
    checkOptions(parser, options)

    with Toil(options) as toil:
        toil.start(Setup(options.halfile, options.outputDirectory, options))
def main():
    """Parse the command line and stage the sequence workflow.

    Takes an output directory, one or more input FASTA files and an optional
    ``--configFile`` that defaults to ``cactus_progressive_config.xml`` under
    the cactus root path.
    """
    defaultConfig = os.path.join(cactusRootPath(), "cactus_progressive_config.xml")

    argParser = ArgumentParser()
    Job.Runner.addToilOptions(argParser)
    argParser.add_argument("outputSequenceDir",
                           help='Directory where the processed sequences will be placed')
    argParser.add_argument("--configFile", default=defaultConfig)
    argParser.add_argument("inputSequences", nargs='+', help='input FASTA file(s)')

    options = argParser.parse_args()
    setLoggingFromOptions(options)

    with Toil(options) as workflow:
        stageWorkflow(outputSequenceDir=options.outputSequenceDir,
                      configFile=options.configFile,
                      inputSequences=options.inputSequences,
                      toil=workflow,
                      restart=options.restart)
def main():
    """Parse Toil's options and run ``hello_world`` as the root job."""
    # Boilerplate -- Toil requires an options object
    argParser = argparse.ArgumentParser()
    Job.Runner.addToilOptions(argParser)
    options = argParser.parse_args()

    # Wrap the first job with its resource requirements, then launch it
    rootJob = Job.wrapJobFn(hello_world, memory=100, cores=0.5, disk="3G")

    with Toil(options) as workflow:
        workflow.start(rootJob)
def main():
    """Read the sample manifest and start or restart the Toil run.

    The manifest is read line by line; whitespace-only lines are skipped and
    the remaining lines are stripped.
    """
    args = cli()

    # Read inside a with-block so the manifest handle is closed promptly
    # instead of being left to the garbage collector.
    with open(args.manifest, 'r') as manifest:
        samples = [line.strip() for line in manifest if not line.isspace()]

    # Start Toil run
    with Toil(args) as toil:
        if not toil.options.restart:
            toil.start(Job.wrapJobFn(map_job, workflow, samples, args))
        else:
            toil.restart()
def runAndTestDbServerService(dbElem, testFunc, inputSnapshotPath=None, outputSnapshotPath=None):
    """Run a ``DbTestJob`` in a throwaway workflow and optionally export its result.

    :param dbElem: passed through to ``DbTestJob``
    :param testFunc: passed through to ``DbTestJob``
    :param inputSnapshotPath: passed through to ``DbTestJob``
    :param outputSnapshotPath: when not None, the value returned by the
           workflow is exported to this local path.
    """
    dbTestJob = DbTestJob(dbElem, testFunc, inputSnapshotPath)

    options = Job.Runner.getDefaultOptions("./testDbServerWorkflow")
    options.logLevel = "INFO"
    options.clean = "always"

    with Toil(options) as workflow:
        snapshotID = workflow.start(dbTestJob)
        if outputSnapshotPath is None:
            return
        workflow.exportFile(snapshotID, "file://" + outputSnapshotPath)
def testMultipleJobsPerWorkerStats(self):
    """
    Tests the case where multiple jobs are run on 1 worker, to ensure that
    all jobs report back their data: the collated stats must list exactly
    two job types.
    """
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.clean = "never"
    options.stats = True
    Job.Runner.startToil(RunTwoJobsPerWorker(), options)

    store = Toil.loadOrCreateJobStore(options.jobStore)
    rawStats = getStats(options)
    collatedStats = processData(store.config, rawStats, options)

    jobTypeCount = len(collatedStats.job_types)
    self.assertTrue(jobTypeCount == 2,
                    "Some jobs are not represented in the stats")
def main():
    """Fetch and/or list files of a job store, driven by command line flags.

    ``--fetch`` copies the listed files; failing that,
    ``--fetchEntireJobStore`` copies everything by setting the fetch pattern
    to "*". ``--listFilesInJobStore`` is handled independently of the two
    fetch flags.
    """
    argParser = getBasicOptionParser()
    argParser.add_argument("jobStore",
                           type=str,
                           help="The location of the job store used by the workflow." +
                                jobStoreLocatorHelp)
    argParser.add_argument("--localFilePath",
                           nargs=1,
                           help="Location to which to copy job store files.")
    argParser.add_argument("--fetch",
                           nargs="+",
                           help="List of job-store files to be copied locally."
                                "Use either explicit names (i.e. 'data.txt'), or "
                                "specify glob patterns (i.e. '*.txt')")
    argParser.add_argument("--listFilesInJobStore",
                           help="Prints a list of the current files in the jobStore.")
    argParser.add_argument("--fetchEntireJobStore",
                           help="Copy all job store files into a local directory.")
    argParser.add_argument("--useSymlinks",
                           help="Creates symlink 'shortcuts' of files in the localFilePath"
                                " instead of hardlinking or copying, where possible.  If this is"
                                " not possible, it will copy the files (shutil.copyfile()).")
    argParser.add_argument("--version", action='version', version=version)

    # Load the jobStore
    options = parseBasicOptions(argParser)
    config = Config()
    config.setOptions(options)
    store = Toil.resumeJobStore(config.jobStore)
    logger.debug("Connected to job store: %s", config.jobStore)

    if options.fetch:
        # Copy only the listed files locally
        logger.debug("Fetching local files: %s", options.fetch)
        fetchJobStoreFiles(jobStore=store, options=options)
    elif options.fetchEntireJobStore:
        # Copy all jobStore files locally
        logger.debug("Fetching all local files.")
        options.fetch = "*"
        fetchJobStoreFiles(jobStore=store, options=options)

    if options.listFilesInJobStore:
        # Log filenames and create a file containing these names in cwd
        printContentsOfJobStore(jobStorePath=options.jobStore)
def setupBinaries(options):
    """Ensure that Cactus's C/C++ components are ready to run, and set up the environment.

    Resolves the binaries mode (command line first, then the
    CACTUS_BINARIES_MODE environment variable, then "docker"), stores it in
    the environment and checks what that mode needs: the ``docker``
    executable, the local Cactus binaries plus ``ktserver``, or a file job
    store for singularity.

    :raises RuntimeError: when a required executable is missing, or when
            singularity mode is used with a non-file job store.
    """
    if options.latest:
        os.environ["CACTUS_USE_LATEST"] = "1"

    requested = options.binariesMode
    # Might be specified through the environment, or not, in which case the
    # default is to use Docker.
    mode = requested if requested is not None else os.environ.get("CACTUS_BINARIES_MODE", "docker")
    os.environ["CACTUS_BINARIES_MODE"] = mode

    if mode == "docker":
        # Verify Docker exists on the target system
        from distutils.spawn import find_executable
        dockerPath = find_executable('docker')
        if dockerPath is None:
            raise RuntimeError("The `docker` executable wasn't found on the "
                               "system. Please install Docker if possible, or "
                               "use --binariesMode local and add cactus's bin "
                               "directory to your PATH.")
    elif mode == "local":
        # If running without Docker, verify that we can find the Cactus executables
        from distutils.spawn import find_executable
        cafPath = find_executable('cactus_caf')
        if cafPath is None:
            raise RuntimeError("Cactus isn't using Docker, but it can't find "
                               "the Cactus binaries. Please add Cactus's bin "
                               "directory to your PATH (and run `make` in the "
                               "Cactus directory if you haven't already).")
        ktserverPath = find_executable('ktserver')
        if ktserverPath is None:
            raise RuntimeError("Cactus isn't using Docker, but it can't find "
                               "`ktserver`, the KyotoTycoon database server. "
                               "Please install KyotoTycoon "
                               "(https://github.com/alticelabs/kyoto) "
                               "and add the binary to your PATH, or use the "
                               "Docker mode.")
    else:
        assert mode == "singularity"
        jobStoreType, locator = Toil.parseLocator(options.jobStore)
        if jobStoreType != "file":
            raise RuntimeError("Singularity mode is only supported when using the FileJobStore.")

        if options.containerImage:
            imgPath = os.path.abspath(options.containerImage)
            os.environ["CACTUS_USE_LOCAL_SINGULARITY_IMG"] = "1"
        else:
            # When SINGULARITY_CACHEDIR is set, singularity will refuse to store images in the current directory
            if 'SINGULARITY_CACHEDIR' in os.environ:
                imgDir = os.environ['SINGULARITY_CACHEDIR']
            else:
                imgDir = os.path.abspath(locator)
            imgPath = os.path.join(imgDir, "cactus.img")
        os.environ["CACTUS_SINGULARITY_IMG"] = imgPath
def _runWorker(self, jobCommand, jobID, environment):
    """
    Run the jobCommand using the worker and wait for it to finish.
    The worker is forked unless it is a '_toil_worker' job and
    debugWorker is True.

    While the job runs, an ``Info`` record for it is kept in
    ``self.runningJobs`` under ``jobID``. Unless ``info.killIntended`` is
    set, a ``(jobID, status, elapsed seconds)`` tuple is put on
    ``self.outputQueue`` when the job ends.

    :param jobCommand: command line to run; in forkless mode it is split on
           whitespace and everything after the first word is read as job
           name, job store locator and job store ID
    :param jobID: key used for ``self.runningJobs`` and in the result tuple
    :param environment: mapping merged over ``os.environ`` for the subprocess
    """
    startTime = time.time()  # Time job is started
    if self.debugWorker and "_toil_worker" in jobCommand:
        # Run the worker without forking
        jobName, jobStoreLocator, jobStoreID = jobCommand.split()[1:]  # Parse command
        jobStore = Toil.resumeJobStore(jobStoreLocator)
        # TODO: The following does not yet properly populate self.runningJobs so it is not possible to kill
        # running jobs in forkless mode - see the "None" value in place of popen
        info = Info(time.time(), None, killIntended=False)
        try:
            self.runningJobs[jobID] = info
            try:
                toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
                                         redirectOutputToLogFile=not self.debugWorker)  # Call the worker
            finally:
                self.runningJobs.pop(jobID)
        finally:
            # In forkless mode the reported status is always 0, even when
            # workerScript raised.
            if not info.killIntended:
                self.outputQueue.put((jobID, 0, time.time() - startTime))
    else:
        # Only the Popen call itself is made while holding popenLock.
        with self.popenLock:
            popen = subprocess.Popen(jobCommand,
                                     shell=True,
                                     env=dict(os.environ, **environment))
        info = Info(time.time(), popen, killIntended=False)
        try:
            self.runningJobs[jobID] = info
            try:
                statusCode = popen.wait()
                if statusCode != 0 and not info.killIntended:
                    log.error("Got exit code %i (indicating failure) "
                              "from job %s.", statusCode, self.jobs[jobID])
            finally:
                self.runningJobs.pop(jobID)
        finally:
            # NOTE(review): statusCode is only bound once popen.wait() has
            # returned; if wait() raises, this line would raise NameError
            # and mask the original exception - confirm whether that matters.
            if not info.killIntended:
                self.outputQueue.put((jobID, statusCode, time.time() - startTime))
def augustus(args, coding_gp, toil_options):
    """
    Main entry function for Augustus toil pipeline

    On a fresh run the inputs are imported into the job store and ``setup`` is
    started with a disk requirement computed from the imported files; on a
    restart the previous run is resumed. The TM result is exported to
    ``args.augustus_tm_gtf`` and, when present, the TMR result to
    ``args.augustus_tmr_gtf``.

    :param args: arguments from CAT, read here through attribute access
    :param coding_gp: genePred with only coding transcripts
    :param toil_options: toil options Namespace object
    """
    def import_path(t, path):
        # Import a local file and wrap the result so it is tied to its path.
        return FileID.forPath(t.importFile('file://' + path), path)

    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(
                t, args.genome_fasta)
            input_file_ids.tm_cfg = import_path(t, args.tm_cfg)
            input_file_ids.coding_gp = import_path(t, coding_gp)
            input_file_ids.ref_psl = import_path(t, args.ref_psl)
            input_file_ids.tm_psl = import_path(t, args.filtered_tm_psl)
            input_file_ids.annotation_gp = import_path(t, args.annotation_gp)
            file_ids = [input_file_ids.genome_fasta, input_file_ids.coding_gp,
                        input_file_ids.ref_psl, input_file_ids.tm_psl,
                        input_file_ids.annotation_gp]
            if args.augustus_tmr:
                input_file_ids.augustus_hints_db = import_path(t, args.augustus_hints_db)
                input_file_ids.tmr_cfg = import_path(t, args.tmr_cfg)
                # Every other entry of file_ids is an imported file ID, so
                # append the imported ID here too, not the raw path string.
                file_ids.append(input_file_ids.augustus_hints_db)
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, disk_usage, disk=disk_usage)
            tm_file_id, tmr_file_id = t.start(job)
        else:
            tm_file_id, tmr_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_tm_gtf)
        t.exportFile(tm_file_id, 'file://' + args.augustus_tm_gtf)
        if tmr_file_id is not None:
            tools.fileOps.ensure_file_dir(args.augustus_tmr_gtf)
            t.exportFile(tmr_file_id, 'file://' + args.augustus_tmr_gtf)
def getStats(options):
    """
    Collect and return the stats and config data.

    Every stats file in the job store named by options.jobStore is parsed as JSON
    and merged into a single Expando in which each key maps to the list of the
    values found for that key across all files.
    """
    def aggregateStats(fileHandle, aggregateObject):
        # Fold the contents of one stats file into aggregateObject.
        try:
            parsed = json.load(fileHandle, object_hook=Expando)
            for name, value in parsed.items():
                if name not in aggregateObject:
                    aggregateObject[name] = [value]
                else:
                    aggregateObject[name].append(value)
        except ValueError:
            # The file is corrupted; report it and carry on with the others.
            logger.critical("File %s contains corrupted json. Skipping file." % fileHandle)

    collected = Expando()
    store = Toil.loadOrCreateJobStore(options.jobStore)
    store.readStatsAndLogging(partial(aggregateStats, aggregateObject=collected),
                              readAll=True)
    return collected
def main():
    """
    Worker entry point: run the job named on the command line, plus any chain of
    single successors that fits within its resources, then report the outcome.

    sys.argv[1] is the job store locator and sys.argv[2] the ID of the jobWrapper
    to run. While the job runs, file descriptors 1 and 2 are redirected to a log
    file in a per-worker temp dir; afterwards they are restored, and the log is
    either attached to the failed jobWrapper or (when debugging) sent back with
    the stats.
    """
    logging.basicConfig()

    ##########################################
    #Import necessary modules
    ##########################################

    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        sys.path.append(sourcePath)

    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.lib.bioio import makePublicDir
    from toil.lib.bioio import system
    from toil.job import Job

    ##########################################
    #Input args
    ##########################################

    jobStoreString = sys.argv[1]
    jobStoreID = sys.argv[2]

    ##########################################
    #Load the jobStore/config file
    ##########################################

    jobStore = Toil.loadOrCreateJobStore(jobStoreString)
    config = jobStore.config

    ##########################################
    #Create the worker killer, if requested
    ##########################################

    if config.badWorker > 0 and random.random() < config.badWorker:
        def badWorker():
            #This will randomly kill the worker process at a random time
            time.sleep(config.badWorkerFailInterval * random.random())
            os.kill(os.getpid(), signal.SIGKILL) #signal.SIGINT)
            #TODO: FIX OCCASIONAL DEADLOCK WITH SIGINT (tested on single machine)
        t = Thread(target=badWorker)
        # Ideally this would be a daemon thread but that causes an intermittent (but benign)
        # exception similar to the one described here:
        # http://stackoverflow.com/questions/20596918/python-exception-in-thread-thread-1-most-likely-raised-during-interpreter-shutd
        # Our exception is:
        #    Exception in thread Thread-1 (most likely raised during interpreter shutdown):
        #    <type 'exceptions.AttributeError'>: 'NoneType' object has no attribute 'kill'
        # This attribute error is caused by the call os.kill() and apparently unavoidable with a
        # daemon
        t.start()

    ##########################################
    #Load the environment for the jobWrapper
    ##########################################

    #First load the environment for the jobWrapper.
    # NOTE(review): unpickling executes arbitrary code if the shared file is not trusted.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    for i in environment:
        # These variables are left as they are in this process.
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.logLevel)

    toilWorkflowDir = Toil.getWorkflowDir(config.workflowID, config.workDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################

    # Dir to put all this worker's temp files in.
    localWorkerTempDir = tempfile.mkdtemp(dir=toilWorkflowDir)
    # 0o755 is the octal spelling accepted by both Python 2.6+ and Python 3.
    os.chmod(localWorkerTempDir, 0o755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>

    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")

    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)

    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)

    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)

    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...

    #Close the descriptor we used to open the file
    os.close(logFh)

    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)

    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    statsDict = MagicExpando()
    statsDict.jobs = []
    statsDict.workers.logsToMaster = []
    blockFn = lambda : True
    cleanCacheFn = lambda x : True
    try:

        #Put a message at the top of the log, just to make sure it's working.
        #(The parenthesised form is valid as a print statement and as a function call.)
        print("---TOIL WORKER OUTPUT LOG---")
        sys.stdout.flush()

        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: %s", nextOpenDescriptor())

        # Setup the caching variable now in case of an exception during loading of jobwrapper, etc
        # Flag to identify if the run is cached or not.
        FileStore = Job.FileStore if config.disableSharedCache else Job.CachedFileStore

        ##########################################
        #Load the jobWrapper
        ##########################################

        jobWrapper = jobStore.load(jobStoreID)
        logger.debug("Parsed jobWrapper")

        ##########################################
        #Cleanup from any earlier invocation of the jobWrapper
        ##########################################

        if jobWrapper.command is None:
            # Cleanup jobs already finished
            def pruneFinished(jobs):
                # Keep only the entries whose job still exists, then drop the
                # groups that end up empty. Real lists are built so the result
                # does not depend on map/filter being eager.
                pruned = ([y for y in x if jobStore.exists(y[0])] for x in jobs)
                return [x for x in pruned if len(x) > 0]
            jobWrapper.stack = pruneFinished(jobWrapper.stack)
            jobWrapper.services = pruneFinished(jobWrapper.services)
            logger.debug("Cleaned up any references to completed successor jobs")

        #This cleans the old log file which may
        #have been left if the jobWrapper is being retried after a jobWrapper failure.
        oldLogFile = jobWrapper.logJobStoreFileID
        if oldLogFile is not None:
            jobWrapper.logJobStoreFileID = None
            jobStore.update(jobWrapper) #Update first, before deleting any files
            jobStore.deleteFile(oldLogFile)

        ##########################################
        # If a checkpoint exists, restart from the checkpoint
        ##########################################

        # The job is a checkpoint, and is being restarted after previously completing
        if jobWrapper.checkpoint is not None:
            logger.debug("Job is a checkpoint")
            if len(jobWrapper.stack) > 0 or len(jobWrapper.services) > 0 or jobWrapper.command is not None:
                if jobWrapper.command is not None:
                    assert jobWrapper.command == jobWrapper.checkpoint
                    logger.debug("Checkpoint job already has command set to run")
                else:
                    jobWrapper.command = jobWrapper.checkpoint

                # Reduce the retry count
                assert jobWrapper.remainingRetryCount >= 0
                jobWrapper.remainingRetryCount = max(0, jobWrapper.remainingRetryCount - 1)

                jobStore.update(jobWrapper) # Update immediately to ensure that checkpoint
                # is made before deleting any remaining successors

                if len(jobWrapper.stack) > 0 or len(jobWrapper.services) > 0:
                    # If the subtree of successors is not complete restart everything
                    logger.debug("Checkpoint job has unfinished successor jobs, deleting the jobs on the stack: %s, services: %s " %
                                 (jobWrapper.stack, jobWrapper.services))

                    # Delete everything on the stack, as these represent successors to clean
                    # up as we restart the queue
                    def recursiveDelete(jobWrapper2):
                        # Recursive walk the stack to delete all remaining jobs
                        for jobs in jobWrapper2.stack + jobWrapper2.services:
                            for jobTuple in jobs:
                                if jobStore.exists(jobTuple[0]):
                                    recursiveDelete(jobStore.load(jobTuple[0]))
                                else:
                                    logger.debug("Job %s has already been deleted", jobTuple[0])
                        # The checkpoint job itself is kept; only its successors are removed.
                        if jobWrapper2 != jobWrapper:
                            logger.debug("Checkpoint is deleting old successor job: %s", jobWrapper2.jobStoreID)
                            jobStore.delete(jobWrapper2.jobStoreID)
                    recursiveDelete(jobWrapper)

                    jobWrapper.stack = [ [], [] ] # Initialise the job to mimic the state of a job
                    # that has been previously serialised but which as yet has no successors

                    jobWrapper.services = [] # Empty the services

                    # Update the jobStore to avoid doing this twice on failure and make this clean.
                    jobStore.update(jobWrapper)

            # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
            # because of the job being a checkpoint
            else:
                logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
                #Delete any remnant files. An explicit loop is used so the deletions
                #happen even where map() is lazy.
                remnants = [fileID for fileID in jobWrapper.checkpointFilesToDelete
                            if jobStore.fileExists(fileID)]
                for fileID in remnants:
                    jobStore.deleteFile(fileID)

        ##########################################
        #Setup the stats, if requested
        ##########################################

        if config.stats:
            startClock = getTotalCpuTime()

        #Make a temporary file directory for the jobWrapper
        #localTempDir = makePublicDir(os.path.join(localWorkerTempDir, "localTempDir"))

        startTime = time.time()
        while True:
            ##########################################
            #Run the jobWrapper, if there is one
            ##########################################

            if jobWrapper.command is not None:
                assert jobWrapper.command.startswith( "_toil " )
                logger.debug("Got a command to run: %s" % jobWrapper.command)
                #Load the job
                job = Job._loadJob(jobWrapper.command, jobStore)
                # If it is a checkpoint job, save the command
                if job.checkpoint:
                    jobWrapper.checkpoint = jobWrapper.command

                # Need to fix all this for non shared cache runs
                if config.disableSharedCache:
                    #Cleanup the cache from the previous job
                    cleanCacheFn(job.effectiveRequirements(jobStore.config).cache)

                # Create a fileStore object for the job
                fileStore = FileStore(jobStore, jobWrapper, localWorkerTempDir, blockFn)
                with job._executor(jobWrapper=jobWrapper,
                                   stats=statsDict if config.stats else None,
                                   fileStore=fileStore):
                    with fileStore.open(job):
                        # Get the next block function and list that will contain any messages
                        blockFn = fileStore._blockFn

                        job._runner(jobWrapper=jobWrapper, jobStore=jobStore, fileStore=fileStore)

                # Accumulate messages from this job & any subsequent chained jobs
                statsDict.workers.logsToMaster += fileStore.loggingMessages

                if config.disableSharedCache:
                    #Set the clean cache function
                    cleanCacheFn = fileStore._cleanLocalTempDir

            else:
                #The command may be none, in which case
                #the jobWrapper is either a shell ready to be deleted or has
                #been scheduled after a failure to cleanup
                break

            if FileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobWrapper within the worker
            ##########################################

            #If no more jobs to run or services not finished, quit
            if len(jobWrapper.stack) == 0 or len(jobWrapper.services) > 0 or jobWrapper.checkpoint is not None:
                logger.debug("Stopping running chain of jobs: length of stack: %s, services: %s, checkpoint: %s",
                             len(jobWrapper.stack), len(jobWrapper.services), jobWrapper.checkpoint is not None)
                break

            #Get the next set of jobs to run
            jobs = jobWrapper.stack[-1]
            assert len(jobs) > 0

            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug("No more jobs can run in series by this worker,"
                             " it's got %i children", len(jobs)-1)
                break

            #We check the requirements of the jobWrapper to see if we can run it
            #within the current worker
            successorJobStoreID, successorMemory, successorCores, successorsDisk, successorsPreemptable, successorPredecessorID = jobs[0]
            if successorMemory > jobWrapper.memory:
                logger.debug("We need more memory for the next jobWrapper, so finishing")
                break
            if successorCores > jobWrapper.cores:
                logger.debug("We need more cores for the next jobWrapper, so finishing")
                break
            if successorsDisk > jobWrapper.disk:
                logger.debug("We need more disk for the next jobWrapper, so finishing")
                break
            if successorPredecessorID is not None:
                logger.debug("The jobWrapper has multiple predecessors, we must return to the leader.")
                break

            # Load the successor jobWrapper
            successorJobWrapper = jobStore.load(successorJobStoreID)

            # Somewhat ugly, but check if job is a checkpoint job and quit if
            # so
            if successorJobWrapper.command.startswith( "_toil " ):
                #Load the job
                successorJob = Job._loadJob(successorJobWrapper.command, jobStore)

                # Check it is not a checkpoint
                if successorJob.checkpoint:
                    logger.debug("Next job is checkpoint, so finishing")
                    break

            ##########################################
            #We have a single successor job that is not a checkpoint job.
            #We transplant the successor jobWrappers command and stack
            #into the current jobWrapper so that it can be run
            #as if it were a command that were part of the current jobWrapper.
            #We can then delete the successor jobWrapper in the jobStore, as it is
            #wholly incorporated into the current jobWrapper.
            ##########################################

            #Clone the jobWrapper and its stack
            jobWrapper = copy.deepcopy(jobWrapper)

            #Remove the successor jobWrapper
            jobWrapper.stack.pop()

            #These should all match up
            assert successorJobWrapper.memory == successorMemory
            assert successorJobWrapper.cores == successorCores
            assert successorJobWrapper.predecessorsFinished == set()
            assert successorJobWrapper.predecessorNumber == 1
            assert successorJobWrapper.command is not None
            assert successorJobStoreID == successorJobWrapper.jobStoreID

            #Transplant the command and stack to the current jobWrapper
            jobWrapper.command = successorJobWrapper.command
            jobWrapper.stack += successorJobWrapper.stack
            assert jobWrapper.memory >= successorJobWrapper.memory
            assert jobWrapper.cores >= successorJobWrapper.cores

            #Build a fileStore to update the job
            fileStore = Job.FileStore(jobStore, jobWrapper, localWorkerTempDir, blockFn)

            #Update blockFn
            blockFn = fileStore._blockFn

            #Add successorJobWrapper to those to be deleted
            fileStore.jobsToDelete.add(successorJobWrapper.jobStoreID)

            #This will update the job once the previous job is done
            fileStore._updateJobWhenDone()

            #Clone the jobWrapper and its stack again, so that updates to it do
            #not interfere with this update
            jobWrapper = copy.deepcopy(jobWrapper)

            logger.debug("Starting the next jobWrapper")

        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)

        # log the worker log path here so that if the file is truncated the path can still be found
        logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", localWorkerTempDir)
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)

    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except: #Case that something goes wrong in worker
        # Deliberately catches everything: any failure is recorded via the
        # terminate event so the cleanup below still runs.
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed jobWrapper on host %s", socket.gethostname())
        FileStore._terminateEvent.set()

    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ##########################################

    blockFn()

    ##########################################
    #All the asynchronous worker/update threads must be finished now,
    #so safe to test if they completed okay
    ##########################################

    if FileStore._terminateEvent.isSet():
        jobWrapper = jobStore.load(jobStoreID)
        jobWrapper.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################

    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)

    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)

    #Close redirected stderr and replace with the original standard error.
    #(This must be the saved stderr descriptor, not the saved stdout one, or
    #stderr ends up pointing at whatever stdout was.)
    os.dup2(origStdErr, 2)

    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr

    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)

    #Now our file handles are in exactly the state they were in before.

    #Copy back the log file to the global dir, if needed
    if workerFailed:
        jobWrapper.logJobStoreFileID = jobStore.getEmptyFileStoreID(jobWrapper.jobStoreID)
        with jobStore.updateFileStream(jobWrapper.logJobStoreFileID) as w:
            with open(tempWorkerLogPath, "r") as f:
                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit:
                    f.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
                w.write(f.read())
        jobStore.update(jobWrapper)

    elif debugging: # write log messages
        with open(tempWorkerLogPath, 'r') as logFile:
            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit:
                logFile.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
            logMessages = logFile.read().splitlines()
        statsDict.logs = [Expando(jobStoreID=jobStoreID, text=logMessage) for logMessage in logMessages]

    if (debugging or config.stats or statsDict.workers.logsToMaster) and not workerFailed:
        # We have stats/logging to report back
        jobStore.writeStatsAndLogging(json.dumps(statsDict))

    #Remove the temp dir
    cleanUp = config.cleanWorkDir
    if cleanUp == 'always' or (cleanUp == 'onSuccess' and not workerFailed) or (cleanUp == 'onError' and workerFailed):
        shutil.rmtree(localWorkerTempDir)

    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and jobWrapper.command is None and len(jobWrapper.stack) == 0 and len(jobWrapper.services) == 0:
        # We can now safely get rid of the jobWrapper
        jobStore.delete(jobWrapper.jobStoreID)
def main():
    """Reports the state of the toil.

    Loads the job store named on the command line, walks the job graph from the
    root job and logs counts of unfinished, parent, service and failed jobs.
    With --verbose the logs of the failed jobs are printed; with
    --failIfNotComplete the process exits with status 1 while any job remains.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser()

    parser.add_argument("jobStore", type=str,
                        help="The location of a job store that holds the information about the "
                             "workflow whose status is to be reported on." + jobStoreLocatorHelp)

    parser.add_argument("--verbose", dest="verbose", action="store_true",
                        help="Print loads of information, particularly all the log files of "
                             "jobs that failed. default=%(default)s",
                        default=False)

    parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true",
                        help="Return exit value of 1 if toil jobs not all completed. default=%(default)s",
                        default=False)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for Toil")
    assert options.jobStore is not None
    config = Config()
    config.setOptions(options)

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    jobStore = Toil.resumeJobStore(config.jobStore)
    try:
        rootJob = jobStore.loadRootJob()
    except JobException:
        print('The root job of the job store is absent, the workflow completed successfully.',
              file=sys.stderr)
        sys.exit(0)

    def traverseGraph(jobGraph):
        # Collect every job reachable from jobGraph that still exists in the job store.
        foundJobStoreIDs = set()
        totalJobs = []

        def inner(jobGraph):
            if jobGraph.jobStoreID in foundJobStoreIDs:
                return
            foundJobStoreIDs.add(jobGraph.jobStoreID)
            totalJobs.append(jobGraph)
            # Traverse jobs in stack
            for jobs in jobGraph.stack:
                for successorJobStoreID in [x.jobStoreID for x in jobs]:
                    if (successorJobStoreID not in foundJobStoreIDs and
                            jobStore.exists(successorJobStoreID)):
                        inner(jobStore.load(successorJobStoreID))

            # Traverse service jobs (recorded, but not descended into)
            for jobs in jobGraph.services:
                for serviceJobStoreID in [x.jobStoreID for x in jobs]:
                    if jobStore.exists(serviceJobStoreID):
                        assert serviceJobStoreID not in foundJobStoreIDs
                        foundJobStoreIDs.add(serviceJobStoreID)
                        totalJobs.append(jobStore.load(serviceJobStoreID))

        inner(jobGraph)
        return totalJobs

    logger.info('Traversing the job graph. This may take a couple minutes.')
    totalJobs = traverseGraph(rootJob)

    failedJobs = []
    hasChildren = []
    hasServices = []
    services = []
    currentlyRunning = []

    for job in totalJobs:
        # A job with a log file attached is counted as failed.
        if job.logJobStoreFileID is not None:
            failedJobs.append(job)
        if job.stack:
            hasChildren.append(job)
        # NOTE(review): `logJobStoreFileID != 0` is kept as written; if the ID is only
        # ever None or a non-zero value this term is always true - confirm the intent.
        elif job.remainingRetryCount != 0 and job.logJobStoreFileID != 0 and job.command:
            # The job has no children, hasn't failed, and has a command to run. This indicates that the job is
            # likely currently running, or at least could be run.
            currentlyRunning.append(job)
        if job.services:
            hasServices.append(job)
        if job.startJobStoreID or job.terminateJobStoreID or job.errorJobStoreID:
            # these attributes are only set in service jobs
            services.append(job)

    logger.info('There are %i unfinished jobs, %i parent jobs with children, %i jobs with services, %i services, '
                'and %i totally failed jobs currently in %s.',
                len(totalJobs), len(hasChildren), len(hasServices), len(services), len(failedJobs),
                config.jobStore)

    if currentlyRunning:
        logger.info('These %i jobs are currently active: %s',
                    len(currentlyRunning), ' \n'.join(map(str, currentlyRunning)))

    if options.verbose: #Verbose currently means outputting the files that have failed.
        if failedJobs:
            # Gather the pieces and join once instead of growing a string repeatedly.
            parts = ["Outputting logs for the %i failed jobs" % (len(failedJobs)),
                     ": %s" % ", ".join(str(failedJob) for failedJob in failedJobs)]
            for jobNode in failedJobs:
                job = jobStore.load(jobNode.jobStoreID)
                parts.append("\n=========> Failed job %s \n" % jobNode)
                with job.getLogFileHandle(jobStore) as fH:
                    parts.append(fH.read())
                parts.append("<=========\n")
            print("".join(parts))
        else:
            print('There are no failed jobs to report.', file=sys.stderr)

    if totalJobs and options.failIfNotComplete:
        # sys.exit rather than the interactive `exit` helper, which is only
        # present when the site module has installed it.
        sys.exit(1)  # when the workflow is complete, all jobs will have been removed from job store
def main():
    """Reports the state of the toil.

    Resumes the job store named on the command line, builds a ToilState from its
    root job and prints counts of active, parent and totally failed jobs to
    stderr. With --verbose the log of each failed job is streamed through the
    logger; with --failIfNotComplete the process exits with status 1 while any
    job remains.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser()

    parser.add_argument("jobStore", type=str,
                        help="The location of a job store that holds the information about the "
                             "workflow whose status is to be reported on." + jobStoreLocatorHelp)

    parser.add_argument("--verbose", dest="verbose", action="store_true",
                        help="Print loads of information, particularly all the log files of "
                             "jobs that failed. default=%(default)s",
                        default=False)

    parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true",
                        help="Return exit value of 1 if toil jobs not all completed. default=%(default)s",
                        default=False)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for Toil")
    assert options.jobStore is not None
    config = Config()
    config.setOptions(options)

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    jobStore = Toil.resumeJobStore(config.jobStore)
    try:
        rootJob = jobStore.loadRootJob()
    except JobException:
        print('The root job of the job store is absent, the workflow completed successfully.',
              file=sys.stderr)
        sys.exit(0)

    toilState = ToilState(jobStore, rootJob)

    # The first element of the toilState.updatedJobs tuple is the jobGraph we want to inspect
    totalJobs = set(toilState.successorCounts.keys()) | \
                {jobTuple[0] for jobTuple in toilState.updatedJobs}

    # A job with no retries left is treated as totally failed.
    failedJobs = [job for job in totalJobs if job.remainingRetryCount == 0]

    print('There are %i active jobs, %i parent jobs with children, and %i totally failed jobs '
          'currently in %s.' % (len(toilState.updatedJobs), len(toilState.successorCounts),
                                len(failedJobs), config.jobStore), file=sys.stderr)

    if options.verbose: #Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if job.logJobStoreFileID is not None:
                with job.getLogFileHandle(jobStore) as logFileHandle:
                    # Logger.warning is used because Logger.warn is a deprecated alias.
                    logStream(logFileHandle, job.jobStoreID, logger.warning)
            else:
                print('Log file for job %s is absent.' % job.jobStoreID, file=sys.stderr)
        if not failedJobs:
            print('There are no failed jobs to report.', file=sys.stderr)

    if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \
            options.failIfNotComplete:
        sys.exit(1)
def main():
    """Reports the state of the toil.

    Loads the job store named on the command line, builds a ToilState from its
    root job and prints counts of active, parent and totally failed jobs to
    stderr. With --verbose the log of each failed job is streamed through the
    logger; with --failIfNotComplete the process exits with status 1 while any
    job remains.
    """

    ##########################################
    #Construct the arguments.
    ##########################################

    parser = getBasicOptionParser()

    parser.add_argument("jobStore", type=str,
                        help=("Store in which to place job management files "
                              "and the global accessed temporary files"
                              "(If this is a file path this needs to be globally accessible "
                              "by all machines running jobs).\n"
                              "If the store already exists and restart is false an"
                              " JobStoreCreationException exception will be thrown."))

    parser.add_argument("--verbose", dest="verbose", action="store_true",
                        help="Print loads of information, particularly all the log files of "
                             "jobs that failed. default=%(default)s",
                        default=False)

    parser.add_argument("--failIfNotComplete", dest="failIfNotComplete", action="store_true",
                        help="Return exit value of 1 if toil jobs not all completed. default=%(default)s",
                        default=False)
    parser.add_argument("--version", action='version', version=version)
    options = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    ##########################################
    #Do some checks.
    ##########################################

    logger.info("Checking if we have files for Toil")
    assert options.jobStore is not None

    ##########################################
    #Survey the status of the job and report.
    ##########################################

    jobStore = Toil.loadOrCreateJobStore(options.jobStore)
    try:
        rootJob = jobStore.loadRootJob()
    except JobException:
        print('The root job of the job store is absent, the workflow completed successfully.',
              file=sys.stderr)
        sys.exit(0)

    toilState = ToilState(jobStore, rootJob)

    # The first element of the toilState.updatedJobs tuple is the jobWrapper we want to inspect
    totalJobs = set(toilState.successorCounts.keys()) | \
                {jobTuple[0] for jobTuple in toilState.updatedJobs}

    # A job with no retries left is treated as totally failed.
    failedJobs = [job for job in totalJobs if job.remainingRetryCount == 0]

    print('There are %i active jobs, %i parent jobs with children, and %i totally failed jobs '
          'currently in %s.' % (len(toilState.updatedJobs), len(toilState.successorCounts),
                                len(failedJobs), options.jobStore), file=sys.stderr)

    if options.verbose: #Verbose currently means outputting the files that have failed.
        for job in failedJobs:
            if job.logJobStoreFileID is not None:
                with job.getLogFileHandle(jobStore) as logFileHandle:
                    # Logger.warning is used because Logger.warn is a deprecated alias.
                    logStream(logFileHandle, job.jobStoreID, logger.warning)
            else:
                print('Log file for job %s is absent.' % job.jobStoreID, file=sys.stderr)
        if not failedJobs:
            print('There are no failed jobs to report.', file=sys.stderr)

    if (len(toilState.updatedJobs) + len(toilState.successorCounts)) != 0 and \
            options.failIfNotComplete:
        sys.exit(1)
def physicalDisk(config, toilWorkflowDir=None):
    """
    Return the number of bytes available on the filesystem holding the workflow dir.

    :param config: only consulted (workflowID and workDir) when toilWorkflowDir is
           not supplied, to look the directory up through Toil.getWorkflowDir
    :param toilWorkflowDir: directory to inspect; looked up from config when None
    :return: fragment size multiplied by the available block count, as reported
             by os.statvfs
    """
    workflowDir = toilWorkflowDir
    if workflowDir is None:
        # Imported only on this path, when the directory has to be looked up.
        from toil.common import Toil
        workflowDir = Toil.getWorkflowDir(config.workflowID, config.workDir)
    fsInfo = os.statvfs(workflowDir)
    blockSize = fsInfo.f_frsize
    availableBlocks = fsInfo.f_bavail
    return blockSize * availableBlocks