def __init__(self, tree, event, sleepTime, startTime, cpu):
    """
    Initialise the job with a CPU requirement and keep the given inputs.

    :param tree: stored unchanged as ``self.tree``
    :param event: stored unchanged as ``self.event``
    :param sleepTime: stored unchanged as ``self.sleepTime``
    :param startTime: stored unchanged as ``self.startTime``
    :param cpu: forwarded to ``Job.__init__`` as ``cpu`` and stored as ``self.cpu``
    """
    Job.__init__(self, cpu=cpu)
    self.tree, self.event = tree, event
    self.sleepTime, self.startTime = sleepTime, startTime
    self.cpu = cpu
def testJobConcurrency(self):
    """
    Tests that the batch system is allocating core resources properly for concurrent tasks.

    For every core requirement in ``self.allocated_cores`` a fresh workflow of
    ``self.cpu_count`` follow-on jobs is run with ``maxCores`` capped at
    ``self.cpu_count``. The maximum recorded in the counter file afterwards must
    equal the number of jobs of that size that fit into the core budget.
    """
    for cores_per_job in self.allocated_cores:
        temp_dir = self._createTempDir('testFiles')
        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.workDir = temp_dir
        options.maxCores = self.cpu_count
        options.batchSystem = self.batchSystemName

        # Start every round from a zeroed counter file.
        counter_path = os.path.join(temp_dir, 'counter')
        resetCounters(counter_path)
        value, max_value = getCounters(counter_path)
        assert (value, max_value) == (0, 0)

        root = Job()
        for _ in range(self.cpu_count):
            root.addFollowOn(Job.wrapFn(measureConcurrency, counter_path, self.sleep_time,
                                        cores=cores_per_job, memory='1M', disk='1Mi'))
        Job.Runner.startToil(root, options)

        _, max_value = getCounters(counter_path)
        # Only whole jobs can run side by side, so the expectation is the floor of
        # the ratio. True division would give a non-integral expectation whenever
        # cores_per_job does not divide cpu_count evenly.
        self.assertEqual(max_value, self.cpu_count // cores_per_job)
def makeWorkflow():
    """
    Build a root job carrying three ``TestServiceSerialization`` services.

    The three values returned by ``addService`` are handed, as a list, to a
    child running ``fnTest`` together with ``outFile`` from the enclosing scope.

    :return: the root job of the workflow
    """
    root = Job()
    service_values = [root.addService(TestServiceSerialization(label))
                      for label in ("woot1", "woot2", "woot3")]
    root.addChildFn(fnTest, service_values, outFile)
    return root
def runNewCheckpointIsLeafVertexTest(self, createWorkflowFn):
    """
    Check the "checkpoint job must be a leaf vertex" verification with one valid
    and three invalid workflows.

    :param createWorkflowFn: function that creates a new workflow and returns a tuple of:

                             0) the workflow root job
                             1) a checkpoint job to test within the workflow
    """
    # Valid case: the checkpoint job has no successors at all.
    logger.info('Test checkpoint job that is a leaf vertex')
    workflow = createWorkflowFn()
    self.runCheckpointVertexTest(*workflow, expectedException=None)

    # Invalid case: a service hangs off the checkpoint job.
    logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a service')
    workflow = createWorkflowFn()
    service = TrivialService("LeafTestService")
    self.runCheckpointVertexTest(*workflow,
                                 checkpointJobService=service,
                                 expectedException=JobGraphDeadlockException)

    # Invalid case: the checkpoint job has a child.
    logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a child job')
    workflow = createWorkflowFn()
    child = Job.wrapJobFn(simpleJobFn, "LeafTestChild")
    self.runCheckpointVertexTest(*workflow,
                                 checkpointJobChild=child,
                                 expectedException=JobGraphDeadlockException)

    # Invalid case: the checkpoint job has a follow-on.
    logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a follow-on job')
    workflow = createWorkflowFn()
    follow_on = Job.wrapJobFn(simpleJobFn, "LeafTestFollowOn")
    self.runCheckpointVertexTest(*workflow,
                                 checkpointJobFollowOn=follow_on,
                                 expectedException=JobGraphDeadlockException)
def __init__(self, tree, event, sleepTime, startTime, cores):
    """
    Initialise the job with a core requirement and keep the given inputs.

    :param tree: stored unchanged as ``self.tree``
    :param event: stored unchanged as ``self.event``
    :param sleepTime: stored unchanged as ``self.sleepTime``
    :param startTime: stored unchanged as ``self.startTime``
    :param cores: forwarded to ``Job.__init__`` as ``cores`` and stored as ``self.cores``
    """
    Job.__init__(self, cores=cores)
    self.tree, self.event = tree, event
    self.sleepTime, self.startTime = sleepTime, startTime
    self.cores = cores
def testEncapsulation(self):
    """
    Tests the Job.encapsulation method, which uses the EncapsulationJob class.

    An encapsulated job graph gets a child and a follow-on whose inputs are
    promises of earlier jobs; the first line of the shared output file must
    read "ABCDE" once the workflow has run.
    """
    # Temporary file
    outFile = getTempFile(rootDir=self._createTempDir())
    try:
        # Encapsulate a job graph
        a = T.wrapJobFn(encapsulatedJobFn, "A", outFile)
        a = a.encapsulate()
        # Now add children/follow to the encapsulated graph
        d = T.wrapFn(f, a.rv(), outFile)
        e = T.wrapFn(f, d.rv(), outFile)
        a.addChild(d)
        a.addFollowOn(e)
        # Create the runner for the workflow.
        options = T.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.logLevel = "INFO"
        # Run the workflow
        T.Runner.startToil(a, options)
        # Check output. The context manager closes the handle, and assertEqual
        # replaces the assertEquals alias that was removed in Python 3.12.
        with open(outFile, 'r') as fileHandle:
            self.assertEqual(fileHandle.readline(), "ABCDE")
    finally:
        os.remove(outFile)
def testCacheEjection(self):
    """
    Test cache always ejects least recently created file

    Three chained jobs each receive one of three file sizes and a fourth job
    receives their return values plus the path of a log file. The sizes are
    chosen so that any two fit into ``cacheSize`` (at most 90000 < 100000) while
    all three do not (120000 > 100000), so one file always has to be ejected.
    The test passes when every line of the log file reads ``True``.
    """
    # Make a temp directory for the test
    test_dir = self._createTempDir()
    for _ in range(10):
        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.logLevel = "DEBUG"
        options.cacheSize = 100000
        options.retryCount = 100
        options.badWorker = 0.5
        options.badWorkerFailInterval = 1.0
        # Create a temp file to write the test results
        handle, logfile = tempfile.mkstemp(dir=test_dir)
        os.close(handle)
        file_sizes = [50000, 40000, 30000]
        # Randomize to (potentially) test all combinations
        random.shuffle(file_sizes)
        # Run the workflow. A, B and C are given one size each, and D is given
        # their results plus the log file. Previously all three jobs used
        # file_sizes[0], which made the shuffle pointless and could put two
        # 50000-byte files at exactly cacheSize.
        A = Job.wrapJobFn(fileTestJob, file_sizes[0])
        B = Job.wrapJobFn(fileTestJob, file_sizes[1])
        C = Job.wrapJobFn(fileTestJob, file_sizes[2])
        D = Job.wrapJobFn(fileTestCache, A.rv(), B.rv(), C.rv(), logfile)
        A.addChild(B)
        B.addChild(C)
        C.addChild(D)
        Job.Runner.startToil(A, options)
        # Assert jobs passed by reading test results from the log file
        with open(logfile, 'r') as outfile:
            for test_status in outfile:
                assert test_status.strip() == 'True'
def _deleteLocallyReadFilesFn(self, readAsMutable):
    """
    Run a two-job workflow with retries disabled: a job wrapping
    ``_writeFileToJobStore`` followed by a child wrapping ``_removeReadFileFn``
    that receives the first job's return value.

    :param readAsMutable: forwarded unchanged to ``_removeReadFileFn``
    """
    self.options.retryCount = 0
    writer = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True, memory='10M')
    remover = Job.wrapJobFn(self._removeReadFileFn, writer.rv(),
                            readAsMutable=readAsMutable, memory='20M')
    writer.addChild(remover)
    Job.Runner.startToil(writer, self.options)
def main():
    """Restarts a toil workflow.

    Parses the command line (``--version`` and a positional ``jobStore``),
    forces ``options.restart`` to ``True`` and hands the root job loaded from
    the job store to the leader's main loop.
    """
    ##########################################
    #Construct the arguments.
    ##########################################
    parser = getBasicOptionParser()
    parser.add_argument("--version", action='version', version=version)
    # The help text is assembled from adjacent literals only. The previous
    # backslash continuation inside the literal embedded source indentation in
    # the message, and a space was missing before "(If this is ...".
    parser.add_argument("jobStore", type=str,
                        help=("Store in which to place job management files "
                              "and the global accessed temporary files "
                              "(If this is a file path this needs to be globally accessible "
                              "by all machines running jobs).\n"
                              "If the store already exists and restart is false an"
                              " ExistingJobStoreException exception will be thrown."))
    options = parseBasicOptions(parser)

    ##########################################
    #Now run the toil construction/leader
    ##########################################
    setLoggingFromOptions(options)
    options.restart = True
    with setupToil(options) as (config, batchSystem, jobStore):
        # The root job is deliberately loaded twice: once for cleaning and once,
        # afterwards, for the main loop.
        jobStore.clean(Job._loadRootJob(jobStore))
        mainLoop(config, batchSystem, jobStore, Job._loadRootJob(jobStore))
def testEncapsulation(self):
    """
    Tests the Job.encapsulation method, which uses the EncapsulationJob class.

    A small job graph (A with child B and follow-on C) is encapsulated and then
    given a further child D and follow-on E, each fed by a promise of the
    previous job. The first line of the shared output file must read "ABCDE".
    """
    #Temporary file
    outFile = getTempFile(rootDir=os.getcwd())
    try:
        #Make a job graph
        a = T.wrapFn(f, "A", outFile)
        b = a.addChildFn(f, a.rv(), outFile)
        c = a.addFollowOnFn(f, b.rv(), outFile)
        #Encapsulate it
        a = a.encapsulate()
        #Now add children/follow to the encapsulated graph
        d = T.wrapFn(f, c.rv(), outFile)
        e = T.wrapFn(f, d.rv(), outFile)
        a.addChild(d)
        a.addFollowOn(e)
        #Create the runner for the workflow.
        options = T.Runner.getDefaultOptions()
        options.logLevel = "INFO"
        #Run the workflow, the return value being the number of failed jobs
        self.assertEqual(T.Runner.startToil(a, options), 0)
        T.Runner.cleanup(options) #This removes the jobStore
        #Check output. The context manager closes the handle, and assertEqual
        #replaces the assertEquals alias that was removed in Python 3.12.
        with open(outFile, 'r') as fileHandle:
            self.assertEqual(fileHandle.readline(), "ABCDE")
    finally:
        #Cleanup, also when the workflow or an assertion fails
        os.remove(outFile)
def testPromiseRequirementRaceStatic(self):
    """
    Checks for a race condition when using promised requirements and child job functions.

    Job 'A' is created with ``sleep=5``; its child 'B' takes its disk
    requirement from a promise on A's return value plus 1024.
    """
    first = Job.wrapJobFn(logDiskUsage, 'A', sleep=5, disk=PromisedRequirement(1024))
    promised_disk = PromisedRequirement(lambda x: x + 1024, first.rv())
    second = Job.wrapJobFn(logDiskUsage, 'B', disk=promised_disk)
    first.addChild(second)
    options = self.getOptions(self._createTempDir('testFiles'))
    Job.Runner.startToil(first, options)
def testAddChildEncapsulate(self):
    """
    Make sure that the encapsulated child does not have two parents
    with unique roots.
    """
    a = T.wrapFn(noOp)
    b = T.wrapFn(noOp)
    # Encapsulate the job returned by addChild.
    a.addChild(b).encapsulate()
    # assertEqual replaces the assertEquals alias that was removed in Python 3.12.
    self.assertEqual(len(a.getRootJobs()), 1)
def testReadCachHitFileFromJobStore(self):
    """
    Read a file from the file store that has a corresponding cached copy. Ensure the number
    of links on the file are appropriate.
    """
    # The writer's return value is passed to the reader as ``fsID``.
    writer = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True)
    reader = Job.wrapJobFn(self._readFromJobStore,
                           isCachedFile=True,
                           cacheReadFile=None,
                           fsID=writer.rv())
    writer.addChild(reader)
    Job.Runner.startToil(writer, self.options)
def testControlledFailedWorkerRetry(self):
    """
    Conduct a couple of job store operations. Then die. Ensure that the restarted job is
    tracking values in the cache state file appropriately.
    """
    scratch_dir = self._createTempDir(purpose='nonLocalDir')
    # Allow exactly one retry.
    self.options.retryCount = 1
    two_gib = 2 * 1024 * 1024 * 1024
    failing = Job.wrapJobFn(self._controlledFailTestFn, jobDisk=two_gib, testDir=scratch_dir,
                            disk='2G')
    probe = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk='100M')
    failing.addChild(probe)
    Job.Runner.startToil(failing, self.options)
def _deleteLocallyReadFilesFn(self, readAsMutable):
    """
    Run a write job followed by a child wrapping ``_removeReadFileFn`` with
    retries disabled.

    If the workflow raises ``FailedJobsException``, exactly two jobs must have
    failed and the message parsed from the log file must contain 'explicitly'.

    :param readAsMutable: forwarded unchanged to ``_removeReadFileFn``
    """
    self.options.retryCount = 0
    writer = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True, memory='10M')
    remover = Job.wrapJobFn(self._removeReadFileFn, writer.rv(),
                            readAsMutable=readAsMutable, memory='20M')
    writer.addChild(remover)
    try:
        Job.Runner.startToil(writer, self.options)
    except FailedJobsException as failure:
        self.assertEqual(failure.numberOfFailedJobs, 2)
        message = self._parseAssertionError(self.options.logFile)
        if not ('explicitly' in message):
            self.fail("Shouldn't see this")
def _testCacheMissFunction(self, cacheReadFile):
    """
    Shared body of the two cache miss tests: write a non-local file to the job
    store, then read it back in a child job.

    :param cacheReadFile: Does the read file need to be cached(T) or not(F)
    """
    non_local_dir = self._createTempDir(purpose='nonLocalDir')
    writer = Job.wrapJobFn(self._writeFileToJobStore,
                           isLocalFile=False,
                           nonLocalDir=non_local_dir)
    reader = Job.wrapJobFn(self._readFromJobStore,
                           isCachedFile=False,
                           cacheReadFile=cacheReadFile,
                           fsID=writer.rv())
    writer.addChild(reader)
    Job.Runner.startToil(writer, self.options)
def testToilIsNotBroken(self):
    """
    Runs a simple diamond-shaped DAG (A -> B, A -> C, B -> D, C -> D) to test
    whether any features other than caching were broken.
    """
    top, left, right, bottom = [Job.wrapJobFn(self._uselessFunc) for _ in range(4)]
    for parent, child in ((top, left), (top, right), (left, bottom), (right, bottom)):
        parent.addChild(child)
    Job.Runner.startToil(top, self.options)
def testServiceSerialization(self):
    """
    Tests that a service can receive a promise without producing a serialization error.
    """
    root = Job()
    # Add a first service to the job and keep the value returned by addService.
    parent_service = TestServiceSerialization("woot")
    start_value = root.addService(parent_service)
    # Create a second service from that start value and register it as a
    # child of the first one.
    child_service = TestServiceSerialization(start_value)
    root.addService(child_service, parentService=parent_service)
    # This should work if serialization on services is working correctly.
    self.runToil(root)
def test_star(self):
    """
    Test the functionality of align_rna

    Builds a four-job chain: fetch the STAR test files, fetch all tools from
    the CI config, pick the "star" tool out of them, then run ``align_rna`` on
    the results.
    """
    univ_options = self._getTestUnivOptions()
    config_file = os.path.join(self._projectRootPath(),
                               "src/protect/test/test_inputs/ci_parameters.yaml")
    # The unused test_src_folder local was dropped; nothing in this test read it.
    a = Job.wrapJobFn(self._get_test_star_files)
    b = Job.wrapJobFn(self._get_all_tools, config_file).encapsulate()
    c = Job.wrapJobFn(self._get_tool, b.rv(), "star")
    d = Job.wrapJobFn(align_rna, a.rv(), univ_options, c.rv()).encapsulate()
    a.addChild(b)
    b.addChild(c)
    c.addChild(d)
    Job.Runner.startToil(a, self.options)
def testJobFileStoreWithSmallCache(self, retryCount=0, badWorker=0.0, stringNo=1,
                                   stringLength=1000000, cacheSize=10000, testNo=2):
    """
    Creates a chain of jobs, each reading and writing files using the
    Job.FileStore interface. Verifies the files written are always what we expect.
    The chain tests the caching behavior.

    :param retryCount: value assigned to ``options.retryCount``
    :param badWorker: value assigned to ``options.badWorker``
    :param stringNo: number of random strings generated per round
    :param stringLength: length of every random string
    :param cacheSize: value assigned to ``options.cacheSize``
    :param testNo: number of rounds to run
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def randomString():
        # Returns (prefix, full string); the prefix is the first
        # PREFIX_LENGTH characters and serves as the dictionary key.
        text = "".join(random.choice(alphabet) for _ in xrange(stringLength))
        return text[:PREFIX_LENGTH], text

    for _ in xrange(testNo):
        # stringNo random strings, keyed by their prefix
        testStrings = dict(randomString() for _ in xrange(stringNo))

        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.logLevel = "INFO"
        options.cacheSize = cacheSize
        options.retryCount = retryCount
        options.badWorker = badWorker
        options.badWorkerFailInterval = 1.0
        chainLength = 10

        # Run the workflow
        head = Job.wrapJobFn(fileTestJob, [], testStrings, chainLength)
        Job.Runner.startToil(head, options)
def testWriteLocalFileToJobStore(self):
    """
    Write a file from the localTempDir to the job store. Such a file will be cached by
    default. Ensure the file is cached.
    """
    # Single-job workflow; the wrapped function is asked to treat the file as local.
    Job.Runner.startToil(Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True),
                         self.options)
def test(self):
    """
    Tests that a toil workflow that fails once can be resumed without a NoSuchJobException.
    """
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.logLevel = "INFO"
    root = Job.wrapJobFn(parent)

    # First run: this one is intended to fail.
    with self.assertRaises(FailedJobsException):
        Job.Runner.startToil(root, options)

    # Resume the workflow. Unfortunately, we have to check for
    # this bug using the logging output, since although the
    # NoSuchJobException causes the worker to fail, the batch
    # system code notices that the job has been deleted despite
    # the failure and avoids the failure.
    options.restart = True
    log_path = os.path.join(self._createTempDir(), "log.txt")
    options.logFile = log_path
    Job.Runner.startToil(root, options)

    with open(options.logFile) as log_handle:
        log_text = log_handle.read()
    # We are looking for e.g. "Batch system is reporting that
    # the jobGraph with batch system ID: 1 and jobGraph
    # store ID: n/t/jobwbijqL failed with exit value 1"
    self.assertFalse("failed with exit value" in log_text)
def main():
    """
    Generate a file of random lines, sort it with a Toil workflow and export
    the sorted result over the generated file.

    Command line options (besides the Toil ones): ``--num-lines``,
    ``--line-length`` and ``--N``.

    :raises RuntimeError: if ``--N`` is not a positive integer
    """
    arg_parser = ArgumentParser()
    Job.Runner.addToilOptions(arg_parser)

    arg_parser.add_argument('--num-lines', default=1000,
                            help='Number of lines in file to sort.', type=int)
    arg_parser.add_argument('--line-length', default=50,
                            help='Length of lines in file to sort.', type=int)
    arg_parser.add_argument("--N",
                            help="The threshold below which a serial sort function is used to sort file. "
                                 "All lines must of length less than or equal to N or program will fail",
                            default=10000)

    options = arg_parser.parse_args()

    # --N has no ``type``, so it may arrive as a string; convert once up front.
    threshold = int(options.N)
    if threshold <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    file_name = 'file_to_sort.txt'
    make_file_to_sort(file_name=file_name, lines=options.num_lines,
                      line_length=options.line_length)

    with Toil(options) as toil:
        sort_file_url = 'file://' + os.path.abspath(file_name)
        if toil.options.restart:
            sorted_file_id = toil.restart()
        else:
            sort_file_id = toil.importFile(sort_file_url)
            root = Job.wrapJobFn(setup, sort_file_id, threshold, False, memory='1000M')
            sorted_file_id = toil.start(root)
        toil.exportFile(sorted_file_id, sort_file_url)
def userScript():
    # NOTE(review): this function tests ``__name__`` itself, so presumably its
    # body is extracted and executed as a stand-alone script - confirm against
    # the caller. Only ``#`` comments are added here for that reason.
    from toil.job import Job
    from toil.common import Toil

    # A user-defined type, i.e. a type defined in the user script
    class X(object):
        pass

    # noinspection PyUnusedLocal
    def job(job, x, disk='10M', cores=1, memory='10M'):
        # Identity job function: returns its ``x`` argument unchanged.
        return x

    if __name__ == '__main__':
        options = Job.Runner.getDefaultArgumentParser().parse_args()
        x = X()
        with Toil(options) as toil:
            # Run a single encapsulated job that returns the X instance.
            r = toil.start(Job.wrapJobFn(job, x).encapsulate())
        # Assert that the return value is of type X, but not X from the __main__
        # module but X from foo.bar, the canonical name for the user module. The
        # translation from __main__ to foo.bar is a side effect of hot-deployment.
        assert r.__class__ is not X
        import foo.bar
        assert r.__class__ is foo.bar.X
        # Assert that a copy was made. This is a side effect of pickling/unpickling.
        assert x is not r
def main(args):
    """
    Parses command line arguments and do the work of the program.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.

    With the single argument ``--test`` the module doctests are run instead and
    the result is 0 when none failed, 1 otherwise.

    :raises Exception: if the workflow reports one or more failed jobs
    """
    if len(args) == 2 and args[1] == "--test":
        # Run the tests. doctest.testmod() returns a (failed, attempted) tuple,
        # and passing that tuple to sys.exit() would exit with status 1 even
        # when every test passed, so it is reduced to a real exit status.
        results = doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
        return 1 if results.failed else 0

    options = parse_args(args) # This holds the nicely-parsed options object

    RealTimeLogger.start_master()
    try:
        # Make a root job
        root_job = Job.wrapJobFn(collate_all, options,
                                 cores=1, memory="1G", disk="1G")

        # Run it and see how many jobs fail
        failed_jobs = Job.Runner.startToil(root_job, options)

        if failed_jobs > 0:
            raise Exception("{} jobs failed!".format(failed_jobs))

        print("All jobs completed successfully")
    finally:
        # Stop the logging master on failure as well as on success.
        RealTimeLogger.stop_master()
def main():
    """
    This is a Toil pipeline for the UNC best practice RNA-Seq analysis.
    RNA-seq fastqs are combined, aligned, sorted, filtered, and quantified.

    Please read the README.md located in the same directory.
    """
    # Build the argument parser and let Toil add its own options
    arg_parser = build_parser()
    Job.Runner.addToilOptions(arg_parser)
    args = arg_parser.parse_args()

    # Collect the pipeline inputs; the last three entries are placeholders
    # that start out as None.
    inputs = {
        'config': args.config,
        'config_fastq': args.config_fastq,
        'input': args.input,
        'unc.bed': args.unc,
        'hg19.transcripts.fa': args.fasta,
        'composite_exons.bed': args.composite_exons,
        'normalize.pl': args.normalize,
        'output_dir': args.output_dir,
        'rsem_ref.zip': args.rsem_ref,
        'chromosomes.zip': args.chromosomes,
        'ebwt.zip': args.ebwt,
        'ssec': args.ssec,
        's3_dir': args.s3_dir,
        'sudo': args.sudo,
        'single_end_reads': args.single_end_reads,
        'upload_bam_to_s3': args.upload_bam_to_s3,
        'uuid': None,
        'sample.tar': None,
        'cpu_count': None,
    }

    # Launch jobs
    root = Job.wrapJobFn(download_shared_files, inputs)
    Job.Runner.startToil(root, args)
def main():
    """
    This is a Toil pipeline to transfer TCGA data into an S3 Bucket

    Data is pulled down with Genetorrent and transferred to S3 via S3AM.

    :raises ValueError: if ``ssec``, ``genetorrent`` or ``genetorrent_key`` is
                        given but does not point to an existing file
    """
    # Define Parser object and add to toil
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()
    # Store inputs from argparse
    inputs = {'genetorrent': args.genetorrent,
              'genetorrent_key': args.genetorrent_key,
              'ssec': args.ssec,
              's3_dir': args.s3_dir}
    # Sanity checks. These are explicit raises rather than asserts because
    # asserts are stripped when Python runs with -O, which would silently skip
    # the validation.
    for option_name in ('ssec', 'genetorrent', 'genetorrent_key'):
        path = getattr(args, option_name)
        if path and not os.path.isfile(path):
            raise ValueError("Argument '{}' does not point to an existing file: {}".format(
                option_name, path))
    samples = parse_genetorrent(args.genetorrent)
    # Start pipeline
    # map_job accepts a function, an iterable, and *args. The function is launched as a child
    # process with one element from the iterable and *args, which in turn spawns a tree of child jobs.
    Job.Runner.startToil(Job.wrapJobFn(map_job, download_and_transfer_sample, samples, inputs), args)
def main(args):
    """
    Filter the input GAM list, validate it and run the variant calling workflow.

    GAM paths containing any non-empty word from the comma-separated
    ``options.skip`` are dropped. Every remaining path must have at least three
    "/"-separated components and end in ``.gam``.

    :param args: program arguments, handed to ``parse_args``
    :raises RuntimeError: if a remaining GAM path has the wrong form
    :raises Exception: if the workflow reports one or more failed jobs
    """
    options = parse_args(args)

    RealTimeLogger.start_master()
    try:
        # Empty words (e.g. from an empty --skip value) never match anything.
        skip_words = [word for word in options.skip.split(",") if word]
        options.in_gams = [gam for gam in options.in_gams
                           if not any(word in gam for word in skip_words)]

        for gam in options.in_gams:
            if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
                raise RuntimeError("Input gam paths must be of the form "
                                   ".../<alg>/<reads>/<filename>.gam")

        # Make a root job
        root_job = Job.wrapJobFn(call_variants, options,
                                 cores=1, memory="2G", disk="2G")

        # Run it and see how many jobs fail
        failed_jobs = Job.Runner.startToil(root_job, options)

        if failed_jobs > 0:
            raise Exception("{} jobs failed!".format(failed_jobs))
    finally:
        # Stop the logging master on validation or workflow failure too.
        RealTimeLogger.stop_master()
def main():
    """
    Sort the file given with ``--fileToSort`` using a Toil workflow.

    :raises RuntimeError: if no file is given, the file does not exist, or
                          ``--N`` is not a positive integer
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)

    parser.add_argument("--fileToSort", dest="fileToSort",
                        help="The file you wish to sort")

    # A trailing space was added after "is": the adjacent literals used to
    # concatenate to "...function isused to sort file".
    parser.add_argument("--N", dest="N",
                        help="The threshold below which a serial sort function is "
                             "used to sort file. All lines must of length less than or equal to N or program will fail",
                        default=10000)

    options = parser.parse_args()

    if options.fileToSort is None:
        raise RuntimeError("No file to sort given")

    if not os.path.exists(options.fileToSort):
        raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)

    # --N has no ``type``, so it may arrive as a string.
    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    #Now we are ready to run
    Job.Runner.startToil(Job.wrapJobFn(setup, options.fileToSort, int(options.N),
                                       memory=sortMemory), options)
def align_transcripts(args, toil_options):
    """
    Main entry function for transcript alignment toil pipeline

    On a fresh run the two genome FASTAs, the annotation genePred, the
    reference database and one genePred per transcript mode are imported into
    the file store and the ``setup`` job is started with a disk requirement
    derived from those files. On a restart the existing workflow is resumed.
    Either way, each resulting file ID is exported to its file path.

    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.ref_genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.ref_genome_fasta)
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            input_file_ids.ref_db = FileID.forPath(t.importFile('file://' + args.ref_db_path), args.ref_db_path)
            input_file_ids.modes = {}
            # Every imported file counts towards the disk requirement below.
            file_ids = [input_file_ids.ref_genome_fasta, input_file_ids.genome_fasta,
                        input_file_ids.annotation_gp, input_file_ids.ref_db]
            for mode in args.transcript_modes:
                input_file_ids.modes[mode] = t.importFile('file://' + args.transcript_modes[mode]['gp'])
                file_ids.append(input_file_ids.modes[mode])
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk=disk_usage)
            results_file_ids = t.start(job)
        else:
            results_file_ids = t.restart()
        # items() replaces the Python-2-only iteritems(); it iterates the same
        # pairs on both Python 2 and Python 3.
        for file_path, file_id in results_file_ids.items():
            tools.fileOps.ensure_file_dir(file_path)
            t.exportFile(file_id, 'file://' + file_path)
def run_whole_alignment(job, context, fastq, gam_input_reads, bam_input_reads, sample_name, interleaved, mapper,
                        indexes, reads_chunk_ids,
                        bam_output=False, surject=False, gbwt_penalty=None, validate=False):
    """
    align all fastq chunks in parallel

    Takes a dict from index type to index file ID. Some indexes are extra and
    specifying them will change mapping behavior.

    Returns a list of per-contig GAMs, the total allignment runtime, and a list
    of per-contig BAM file IDs (which is only nonempty when surject is true).

    All three returned values are promises (or containers of promises) of jobs
    scheduled here, not concrete results. When ``bam_output`` is set, the GAM
    list is empty and the runtime is ``None``.

    NOTE(review): with ``bam_output`` set, the BAM list returned below is
    non-empty even when ``surject`` is false, which contradicts the sentence
    above - confirm which is intended.
    """

    # this will be a list of lists.
    # gam_chunk_file_ids[i][j], will correspond to the jth path (from id_ranges)
    # for the ith gam chunk (generated from fastq shard i)
    gam_chunk_file_ids = []
    gam_chunk_running_times = []
    # depending on bam_output and surject options, we can make bam_output too
    bam_chunk_file_ids = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    # zip(*...) regroups the per-file chunk lists so that each iteration sees
    # the chunk IDs with the same index from every list.
    for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
        #Run graph alignment on each fastq chunk
        chunk_alignment_job = child_job.addChildJobFn(
            run_chunk_alignment, context, gam_input_reads, bam_input_reads,
            sample_name, interleaved, mapper, chunk_filename_ids, chunk_id,
            indexes, bam_output=bam_output, gbwt_penalty=gbwt_penalty,
            validate=validate, cores=context.config.alignment_cores,
            memory=context.config.alignment_mem,
            disk=context.config.alignment_disk)
        # Return value 0 is filed as GAM or BAM depending on bam_output;
        # return value 1 is always filed as a running time.
        if not bam_output:
            gam_chunk_file_ids.append(chunk_alignment_job.rv(0))
        else:
            bam_chunk_file_ids.append(chunk_alignment_job.rv(0))
        gam_chunk_running_times.append(chunk_alignment_job.rv(1))

    if not bam_output:
        # Merge the per-chunk GAMs once all chunk alignments are done.
        merge_gams_job = child_job.addFollowOnJobFn(
            run_merge_gams, context, sample_name, indexes.get('id_ranges'),
            gam_chunk_file_ids, gam_chunk_running_times,
            cores=context.config.misc_cores, memory=context.config.misc_mem,
            disk=context.config.misc_disk)
        gam_chrom_ids = merge_gams_job.rv(0)
        gam_chunk_time = merge_gams_job.rv(1)
        bam_chrom_ids = []
    else:
        gam_chrom_ids = []
        gam_chunk_time = None
        merge_bams_job = child_job.addFollowOnJobFn(run_merge_bams, context, sample_name, bam_chunk_file_ids)
        bam_chrom_ids = [merge_bams_job.rv()]

    if surject:
        # Surjection is treated as interleaved when the flag is set or when
        # exactly two fastq files were given.
        interleaved_surject = interleaved or (fastq and len(fastq) == 2)
        zip_job = child_job.addFollowOnJobFn(run_zip_surject_input, context, gam_chunk_file_ids)
        # Prefer the dedicated surjection index when one is present.
        xg_id = indexes['xg-surject'] if 'xg-surject' in indexes else indexes[
            'xg']
        # This replaces whatever bam_chrom_ids was set to above.
        bam_chrom_ids = [
            zip_job.addFollowOnJobFn(run_whole_surject, context, zip_job.rv(),
                                     sample_name + '-surject',
                                     interleaved_surject, xg_id, []).rv()
        ]

    return gam_chrom_ids, gam_chunk_time, bam_chrom_ids
def __init__(self, magma_bin, batch_results):
    """
    Initialise the job with fixed requirements (100M memory, 1 core, 100M disk).

    :param magma_bin: stored unchanged as ``self.magma_bin``
    :param batch_results: list of dicts containing file IDs from all gene tests
    """
    Job.__init__(self, memory="100M", cores=1, disk="100M")
    self.magma_bin, self.batch_results = magma_bin, batch_results
def testNestedResourcesDoNotBlock(self):
    """
    Resources are requested in the order Memory > Cpu > Disk.
    Test that inavailability of cpus for one job that is scheduled does not block another job
    that can run.

    Every job appends its ``writeVal`` to a shared file; the test then checks
    the order in which the values were written.
    """
    tempDir = self._createTempDir('testFiles')

    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.workDir = tempDir
    options.maxCores = 4
    from toil import physicalMemory
    availableMemory = physicalMemory()
    options.batchSystem = self.batchSystemName

    # Start from an empty output file.
    outFile = os.path.join(tempDir, 'counter')
    open(outFile, 'w').close()

    root = Job()

    # Holds 2 of the 4 cores for 30 seconds.
    blocker = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=30, writeVal='b',
                         cores=2, memory='1M', disk='1M')
    firstJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5, writeVal='fJ',
                          cores=1, memory='1M', disk='1M')
    secondJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=10,
                           writeVal='sJ', cores=1, memory='1M', disk='1M')

    # Requests half of the memory and 3 cores, so it cannot start while the
    # blocker still holds 2 of the 4 cores.
    firstJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=0,
                               writeVal='fJC', cores=3, memory=int(availableMemory/2), disk='1M')
    # These two each request two thirds of the memory. They can only run if
    # firstJobChild is not holding on to its half while it waits for cores.
    secondJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                writeVal='sJC', cores=2, memory=int(availableMemory/1.5),
                                disk='1M')
    secondJobGrandChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                     writeVal='sJGC', cores=2, memory=int(availableMemory/1.5),
                                     disk='1M')

    root.addChild(blocker)
    root.addChild(firstJob)
    root.addChild(secondJob)

    firstJob.addChild(firstJobChild)
    secondJob.addChild(secondJobChild)

    secondJobChild.addChild(secondJobGrandChild)
    # The tree is:
    #
    #            root
    #          /  |  \
    #         b   fJ  sJ
    #             |    |
    #            fJC  sJC
    #                  |
    #                 sJGC
    #
    # But the order of execution should be
    #     root > b, fJ, sJ > sJC > sJGC > fJC
    # since fJC cannot run till b finishes (it needs 3 cores while b holds 2 of
    # the 4) but sJC and sJGC can. If the resource acquisition is written
    # properly, then fJC, which is scheduled before sJC and sJGC, should not
    # block them and should only run after they finish.
    #
    # This diagram used to live in a bare string literal whose trailing
    # backslash garbled it; as a comment it has no effect on the code.
    Job.Runner.startToil(root, options)
    with open(outFile) as oFH:
        outString = oFH.read()
    # The ordering of b, fJ and sJ is non-deterministic since they are scheduled at the same
    # time. We look for all possible permutations.
    possibleStarts = tuple([''.join(x) for x in itertools.permutations(['b', 'fJ', 'sJ'])])
    assert outString.startswith(possibleStarts)
    assert outString.endswith('sJCsJGCfJC')
def __init__(self, fileId):
    """
    Initialise the job with default requirements.

    :param fileId: stored unchanged as ``self.fileId``
    """
    Job.__init__(self)
    # Kept on the instance for later use.
    self.fileId = fileId
def run_cactus_align(job, configWrapper, cactusWorkflowArguments, project, checkpointInfo, doRenaming, pafInput,
                     pafSecondaries, doVG, doGFA, delay=0, eventNameAsID=False, referenceEvent=None,
                     pafMaskFilter=None):
    """
    Schedule the chain of follow-on jobs that turns the inputs into a HAL file
    and, optionally, VG/GFA exports.

    The chain is: preprocess input sequences -> prepend unique ids ->
    (if pafInput) mask and convert PAF -> (if no ingroup coverage was supplied)
    recompute ingroup coverage -> setup phase -> prepare HAL export -> export
    HAL -> (if doVG or doGFA) export VG.

    :param delay: seconds to sleep before scheduling anything
    :return: tuple of (HAL export promise, VG file promise or None,
             GFA file promise or None)
    """
    # this option (--stagger) can be used in batch mode to avoid starting all the alignment jobs at the same time
    time.sleep(delay)

    # Everything below hangs off this empty job.
    head_job = Job()
    job.addChild(head_job)

    # unzip the input sequences if necessary, and also extract paf masking beds
    preprocess_job = head_job.addChildJobFn(preprocess_input_sequences, configWrapper, project,
                                            cactusWorkflowArguments, pafMaskFilter, referenceEvent)
    # Must be read here: the next line rebinds cactusWorkflowArguments to a
    # promise, which has no ingroupCoverageIDs attribute to inspect.
    no_ingroup_coverage = not cactusWorkflowArguments.ingroupCoverageIDs
    cactusWorkflowArguments = preprocess_job.rv(0)
    mask_beds = preprocess_job.rv(1)

    # do the name mangling cactus expects, where every fasta sequence starts with id=0|, id=1| etc
    # and the cigar files match up.  If reading cactus-blast output, the cigars are fine, just need
    # the fastas (todo: make this less hacky somehow)
    cur_job = head_job.addFollowOnJobFn(run_prepend_unique_ids, cactusWorkflowArguments, project, doRenaming,
                                        eventNameAsID, mask_beds
                                        #todo disk=
                                        )
    cactusWorkflowArguments = cur_job.rv(0)
    mask_bed_id = cur_job.rv(1)

    # allow for input in paf format:
    if pafInput:
        # convert the paf input to lastz format, splitting out into primary and secondary files
        # optionally apply the masking
        cur_job = cur_job.addFollowOnJobFn(mask_and_convert_paf, cactusWorkflowArguments, pafSecondaries,
                                           mask_bed_id)
        cactusWorkflowArguments = cur_job.rv()

    if no_ingroup_coverage:
        # if we're not taking cactus_blast input, then we need to recompute the ingroup coverage
        cur_job = cur_job.addFollowOnJobFn(run_ingroup_coverage, cactusWorkflowArguments, project)
        cactusWorkflowArguments = cur_job.rv()

    # run cactus setup all the way through cactus2hal generation
    setup_job = cur_job.addFollowOnJobFn(run_setup_phase, cactusWorkflowArguments)

    # set up the project
    prepare_hal_export_job = setup_job.addFollowOnJobFn(
        run_prepare_hal_export, project, setup_job.rv())

    # create the hal
    hal_export_job = prepare_hal_export_job.addFollowOnJobFn(
        exportHal, prepare_hal_export_job.rv(0),
        event=prepare_hal_export_job.rv(1),
        checkpointInfo=checkpointInfo,
        acyclicEvent=referenceEvent,
        memory=configWrapper.getDefaultMemory(),
        disk=configWrapper.getExportHalDisk(),
        preemptable=False)

    # optionally create the VG
    if doVG or doGFA:
        vg_export_job = hal_export_job.addFollowOnJobFn(
            export_vg, hal_export_job.rv(), configWrapper, doVG, doGFA,
            checkpointInfo=checkpointInfo)
        vg_file_id, gfa_file_id = vg_export_job.rv(0), vg_export_job.rv(1)
    else:
        vg_file_id, gfa_file_id = None, None

    return hal_export_job.rv(), vg_file_id, gfa_file_id
refDict_og = "human_g1k_b37_20.dict" refFasta = toil.importFile( "file:///home/lifeisaboutfishtacos/Desktop/wdl-tutorials/data/ref/human_g1k_b37_20.fasta" ) refFasta_og = "human_g1k_b37_20.fasta" gatk = toil.importFile( "file:///home/lifeisaboutfishtacos/Desktop/wdl-tutorials/GenomeAnalysisTK.jar" ) gatk_og = "GenomeAnalysisTK.jar" # Output Variables GVCF2_og = "GVCF2_rawLikelihoods.g.vcf" GVCF3_og = "GVCF3_rawLikelihoods.g.vcf" GVCF1_og = "GVCF1_rawLikelihoods.g.vcf" job0 = Job.wrapJobFn(initialize_jobs) job1 = Job.wrapJobFn(HaplotypeCallerERC, gatk, gatk_og, refFasta, refFasta_og, refIndex, refIndex_og, refDict, refDict_og, sample0_1, sample1_1, sample1_1_og, sample2_1, sample2_1_og) job2 = Job.wrapJobFn(HaplotypeCallerERC, gatk, gatk_og, refFasta, refFasta_og, refIndex, refIndex_og, refDict, refDict_og, sample0_2, sample1_2, sample1_2_og, sample2_2, sample2_2_og) job3 = Job.wrapJobFn(HaplotypeCallerERC, gatk, gatk_og, refFasta, refFasta_og, refIndex, refIndex_og, refDict, refDict_og, sample0_3, sample1_3, sample1_3_og, sample2_3, sample2_3_og) job4 = Job.wrapJobFn(GenotypeGVCFs, gatk, gatk_og, refFasta, refFasta_og, refIndex, refIndex_og, refDict, refDict_og, 'CEUtrio', job2.rv(), GVCF2_og,
def testNonCachingFileStore(self):
    """
    Run ``simpleFileStoreJob`` as a one-job workflow with caching disabled.
    """
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.disableCaching = True
    root = Job.wrapJobFn(simpleFileStoreJob)
    with Toil(options) as workflow:
        workflow.start(root)
def __init__(self):
    """Initialise the job with the default requirements of ``Job``."""
    # No requirements are passed to the base initialiser.
    Job.__init__(self)
from toil.common import Toil
from toil.job import Job


def helloWorld(message, memory="1G", cores=1, disk="1G"):
    """
    Return a greeting that embeds ``message``.

    ``memory``, ``cores`` and ``disk`` are not read in the body.
    """
    greeting = "Hello, world!, here's a message: %s" % message
    return greeting


if __name__ == "__main__":
    # Take the options from the command line and always clean up afterwards.
    options = Job.Runner.getDefaultArgumentParser().parse_args()
    options.clean = "always"
    hello_job = Job.wrapFn(helloWorld, "You did it!")
    with Toil(options) as toil:
        output = toil.start(hello_job)
    print(output)
from toil.common import Toil
from toil.job import Job


def helloWorld(message, memory="2G", cores=2, disk="3G"):
    """
    Return a greeting that embeds ``message``.

    ``memory``, ``cores`` and ``disk`` are not read in the body.
    """
    greeting = f"Hello, world!, here's a message: {message}"
    return greeting


if __name__ == "__main__":
    # Fixed job store location, logging switched off, always clean up.
    options = Job.Runner.getDefaultOptions("./toilWorkflowRun")
    options.logLevel = "OFF"
    options.clean = "always"

    hello_job = Job.wrapFn(helloWorld, "Woot")

    with Toil(options) as toil:
        result = toil.start(hello_job)
        print(result)  # prints "Hello, world!, ..."
# Write another file using a stream; fileID2 is the # key for this second file. with job.fileStore.writeGlobalFileStream(cleanup=True) as (fH, fileID2): fH.write(b"Out brief candle") # Now read the first file; scratchFile2 is a local copy of the file that is read-only by default. scratchFile2 = job.fileStore.readGlobalFile(fileID) # Read the second file to a desired location: scratchFile3. scratchFile3 = os.path.join(job.tempDir, "foo.txt") job.fileStore.readGlobalFile(fileID2, userPath=scratchFile3) # Read the second file again using a stream. with job.fileStore.readGlobalFileStream(fileID2) as fH: print(fH.read()) # This prints "Out brief candle" # Delete the first file from the global file-store. job.fileStore.deleteGlobalFile(fileID) # It is unnecessary to delete the file keyed by fileID2 because we used the cleanup flag, # which removes the file after this job and all its successors have run (if the file still exists) if __name__ == "__main__": options = Job.Runner.getDefaultOptions("./toilWorkflowRun") options.logLevel = "INFO" options.clean = "always" with Toil(options) as toil: toil.start(Job.wrapJobFn(globalFileStoreJobFn))
def runCactusGraphMapSplit(options):
    """Run (or restart) the graphmap-split workflow and export its results.

    On a fresh run the cactus config, reference contigs, minigraph GFA, PAF
    and the leaf sequences named in the seqfile are loaded/imported before
    ``graphmap_split_workflow`` is started.  The split data is exported
    afterwards.
    """
    with Toil(options) as toil:
        importSingularityImage(options)

        if not options.restart:
            options.cactusDir = getTempDirectory()

            # load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # load up the contigs if any
            ref_contigs = set(options.refContigs)
            # todo: use import?
            if options.refContigsFile:
                with open(options.refContigsFile, 'r') as contigs_handle:
                    for raw_line in contigs_handle:
                        fields = raw_line.split()
                        # blank lines produce no fields and are skipped
                        if fields:
                            ref_contigs.add(fields[0])

            if options.otherContig:
                assert options.otherContig not in ref_contigs

            # get the minigraph "virtual" assembly name
            graphmap_node = findRequiredNode(configNode, "graphmap")
            graph_event = getOptionalAttrib(graphmap_node,
                                            "assemblyName",
                                            default="_MINIGRAPH_")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            # import the graph and the paf
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))
            paf_id = toil.importFile(makeURL(options.graphmapPAF))

            # import the sequences (that we need to align for the given
            # event, ie leaves and outgroups)
            seqIDMap = {}
            leaves = {
                seqFile.tree.getName(node)
                for node in seqFile.tree.getLeaves()
            }

            if graph_event not in leaves:
                raise RuntimeError(
                    "Minigraph name {} not found in seqfile".format(
                        graph_event))
            if options.reference and options.reference not in leaves:
                raise RuntimeError(
                    "Name given with --reference {} not found in seqfile".
                    format(options.reference))

            for genome, seq in seqFile.pathMap.items():
                if genome not in leaves:
                    continue
                if os.path.isdir(seq):
                    # a directory of sequences is concatenated into one file
                    merged_path = getTempFile()
                    parts = [
                        os.path.join(seq, subSeq)
                        for subSeq in os.listdir(seq)
                    ]
                    catFiles(parts, merged_path)
                    seq = merged_path
                seq = makeURL(seq)
                logger.info("Importing {}".format(seq))
                seqIDMap[genome] = (seq, toil.importFile(seq))

            # run the workflow
            root_job = Job.wrapJobFn(graphmap_split_workflow, options, config,
                                     seqIDMap, gfa_id, options.minigraphGFA,
                                     paf_id, options.graphmapPAF, ref_contigs,
                                     options.otherContig)
            split_id_map = toil.start(root_job)
        else:
            split_id_map = toil.restart()

        # export the split data
        # NOTE(review): on the restart path `seqIDMap` and `config` are never
        # bound in this function -- confirm how restarts are meant to export.
        export_split_data(toil, seqIDMap, split_id_map, options.outDir, config)
import argparse
import os

from toil.job import Job


def f0(job):
    """Raise ``RuntimeError`` when the ``FAIL`` environment variable is set."""
    if os.environ.get('FAIL') is not None:
        raise RuntimeError('failed on purpose')


if __name__ == '__main__':
    cli = argparse.ArgumentParser()
    Job.Runner.addToilOptions(cli)
    options = cli.parse_args()

    # Resource requirements requested for the root job.
    requirements = dict(cores=0.5, memory='50 M', disk='50 M')
    rootJob = Job.wrapJobFn(f0, **requirements)
    Job.Runner.startToil(rootJob, options)
def createWorkflow():
    """Build a parent job with one checkpointed child.

    :return: tuple ``(root job, checkpoint child job)``.
    """
    parent = Job.wrapJobFn(simpleJobFn, "Parent")
    checkpointed_child = parent.addChildJobFn(simpleJobFn,
                                              "Child",
                                              checkpoint=True)
    return parent, checkpointed_child
def testDockerClean(self,
                    disableCaching=True,
                    detached=True,
                    rm=True,
                    deferParam=None):
    """
    Run the test container that creates a file in the work dir, and sleeps
    for 5 minutes.
    Ensure that the calling job gets SIGKILLed after a minute, leaving
    behind the spooky/ghost/zombie container. Ensure that the container is
    killed on batch system shutdown (through the deferParam mechanism).

    :param disableCaching: value assigned to ``options.disableCaching``.
    :param detached: forwarded to ``_testDockerCleanFn``.
    :param rm: forwarded to ``_testDockerCleanFn``; also selects which
        container state is asserted afterwards.
    :param deferParam: forwarded to ``_testDockerCleanFn``; compared against
        ``FORGO``, ``STOP`` and ``RM`` to select the expected container state.
    """
    # We need to test the behaviour of `deferParam` with `rm` and
    # `detached`. We do not look at the case where `rm` and `detached` are
    # both True.  This is the truth table for the different combinations at
    # the end of the test. R = Running, X = Does not exist, E = Exists but
    # not running.
    #              None     FORGO     STOP    RM
    #    rm        X         R         X      X
    # detached     R         R         E      X
    #  Neither     R         R         E      X

    data_dir = os.path.join(self.tempDir, 'data')
    working_dir = os.path.join(self.tempDir, 'working')
    test_file = os.path.join(working_dir, 'test.txt')

    mkdir_p(data_dir)
    mkdir_p(working_dir)

    options = Job.Runner.getDefaultOptions(
        os.path.join(self.tempDir, 'jobstore'))
    options.logLevel = self.dockerTestLogLevel
    options.workDir = working_dir
    options.clean = 'always'
    options.disableCaching = disableCaching

    # No base64 logic since it might create a name starting with a `-`.
    container_name = uuid.uuid4().hex
    A = Job.wrapJobFn(_testDockerCleanFn, working_dir, detached, rm,
                      deferParam, container_name)
    try:
        Job.Runner.startToil(A, options)
    except FailedJobsException:
        # The file created by spooky_container would remain in the directory
        # and since it was created inside the container, it would have had
        # uid and gid == 0 (root) which may cause problems when docker
        # attempts to clean up the jobstore.
        file_stats = os.stat(test_file)
        assert file_stats.st_gid != 0
        assert file_stats.st_uid != 0

        if (rm and (deferParam != FORGO)) or deferParam == RM:
            # These containers should not exist
            assert containerIsRunning(container_name) is None, \
                'Container was not removed.'

        elif deferParam == STOP:
            # These containers should exist but be non-running
            assert containerIsRunning(container_name) == False, \
                'Container was not stopped.'

        else:
            # These containers will be running
            assert containerIsRunning(container_name) == True, \
                'Container was not running.'
            client = docker.from_env(version='auto')
            dockerKill(container_name, client)
            try:
                os.remove(test_file)
            except OSError:
                # Best-effort cleanup: only filesystem errors are ignored, so
                # KeyboardInterrupt/SystemExit still propagate.
                pass
def test(self):
    """Run job function ``d`` as the workflow root and check its return value."""
    job_store = self._getTestJobStorePath()
    options = Job.Runner.getDefaultOptions(job_store)
    options.logLevel = 'INFO'

    outcome = Job.Runner.startToil(Job.wrapJobFn(d), options)
    self.assertEqual(outcome, ('b', 43, 3))
def userScript():
    # Self-contained workflow script: it carries its own imports, job
    # functions and a ``__main__`` section that starts the workflow and
    # asserts that the deferred function removed the marker file.
    import os
    import time
    from toil.job import Job
    from toil.common import Toil
    from toil.leader import FailedJobsException

    # Upper bound, in seconds, for each polling loop below.
    TIMEOUT = 10

    def root(rootJob):
        # Root job: creates two marker files in the job store and wires up
        # the `deferring` child plus the encapsulated `encapsulated` job,
        # which has two `dummy` children and a `trigger` follow-on.
        def nullFile():
            return rootJob.fileStore.jobStore.importFile(
                'file:///dev/null')

        startFile = nullFile()
        endFile = nullFile()
        rootJob.addChildJobFn(deferring, startFile, endFile)
        encapsulatedJob = Job.wrapJobFn(encapsulated, startFile)
        encapsulatedJob.addChildFn(dummy)
        encapsulatedJob.addChildFn(dummy)
        encapsulatedJob.addFollowOnJobFn(trigger, endFile)
        encapsulatingJob = encapsulatedJob.encapsulate()
        rootJob.addChild(encapsulatingJob)

    def dummy():
        pass

    def deferredFile(config):
        """
        Return path to a file at the root of the job store, exploiting the fact that
        the job store is shared between leader and worker container.
        """
        prefix = 'file:'
        locator = config.jobStore
        assert locator.startswith(prefix)
        return os.path.join(locator[len(prefix):], 'testDeferredFile')

    def deferred(deferredFilePath):
        """
        The deferred function that is supposed to run.
        """
        os.unlink(deferredFilePath)

    # noinspection PyUnusedLocal
    def deferring(job, startFile, endFile):
        """
        A job that adds the deferred function and then crashes once the `trigger` job
        tells it to.
        """
        job.defer(deferred, deferredFile(job._config))
        jobStore = job.fileStore.jobStore
        # Deleting startFile ends the polling loop in `encapsulated`.
        jobStore.deleteFile(startFile)
        # Publish this process's PID through endFile for `trigger` to read.
        with jobStore.updateFileStream(endFile) as fH:
            fH.write(str(os.getpid()))
        timeout = time.time() + TIMEOUT
        # Poll until endFile disappears (`trigger` deletes it).
        while jobStore.fileExists(endFile):
            assert time.time() < timeout
            time.sleep(1)
        # Send signal 9 to this very process.
        os.kill(os.getpid(), 9)

    def encapsulated(job, startFile):
        """
        A job that waits until the `deferring` job is running and waiting to be crashed.
        """
        timeout = time.time() + TIMEOUT
        while job.fileStore.jobStore.fileExists(startFile):
            assert time.time() < timeout
            time.sleep(1)

    def trigger(job, endFile):
        """
        A job that determines the PID of the worker running the `deferring` job,
        tells the `deferring` job to crash and then waits for the corresponding
        worker process to end. By waiting we can be sure that the `follow-on` job
        finds the left-overs of the `deferring` job.
        """
        import errno
        jobStore = job.fileStore.jobStore
        with jobStore.readFileStream(endFile) as fH:
            pid = int(fH.read())
        # Signal 0 is used as a probe here: it raises OSError on failure.
        os.kill(pid, 0)
        jobStore.deleteFile(endFile)
        timeout = time.time() + TIMEOUT
        while True:
            try:
                os.kill(pid, 0)
            except OSError as e:
                # ESRCH ends the wait; any other error is re-raised.
                if e.errno == errno.ESRCH:
                    break
                else:
                    raise
            else:
                assert time.time() < timeout
                time.sleep(1)

    def tryUnlink(deferredFilePath):
        # Remove the file, tolerating only "no such file" (ENOENT).
        # `errno` is bound by the import in the `__main__` section below.
        try:
            os.unlink(deferredFilePath)
        except OSError as e:
            if e.errno == errno.ENOENT:
                pass
            else:
                raise

    if __name__ == '__main__':
        import errno
        options = Job.Runner.getDefaultArgumentParser().parse_args()
        with Toil(options) as toil:
            deferredFilePath = deferredFile(toil.config)
            # Create the empty marker file that `deferred` is expected to remove.
            open(deferredFilePath, 'w').close()
            try:
                assert os.path.exists(deferredFilePath)
                try:
                    toil.start(Job.wrapJobFn(root))
                except FailedJobsException as e:
                    assert e.numberOfFailedJobs == 2  # `root` and `deferring`
                    assert not os.path.exists(deferredFilePath), \
                        'Apparently, the deferred function did not run.'
                else:
                    assert False, 'Workflow should not have succeeded.'
            finally:
                tryUnlink(deferredFilePath)
def createWorkflow():
    """Build a single-job workflow whose root is itself the checkpoint job.

    :return: tuple ``(root job, checkpoint job)``; both entries are the same
        object.
    """
    checkpointed_root = Job.wrapJobFn(simpleJobFn, "Root", checkpoint=True)
    return (checkpointed_root, checkpointed_root)
def main():
    """Command-line entry point that runs the alignment workflow.

    Parses the command line, validates the option combinations, prepares the
    output location (including an early S3 write check), runs or restarts the
    Toil workflow and, for non-S3 outputs, exports the HAL (and optionally
    VG / GFA) results.

    :raises RuntimeError: on mismatched ``--pathOverrides`` /
        ``--pathOverrideNames``, fewer than two usable cores on the
        ``singleMachine`` batch system, or an S3 output without S3 support.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument(
        "cigarsFile",
        nargs="*",
        help=
        "Pairiwse aliginments (from cactus-blast, cactus-refmap or cactus-graphmap)"
    )
    parser.add_argument("outHal",
                        type=str,
                        help="Output HAL file (or directory in --batch mode)")
    parser.add_argument(
        "--pathOverrides",
        nargs="*",
        help="paths (multiple allowd) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames",
        nargs="*",
        help="names (must be same number as --paths) of path overrides")

    #Pangenome Options
    parser.add_argument(
        "--pangenome",
        action="store_true",
        help=
        "Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings."
        " The overridden configuration will be saved in <outHal>.pg-conf.xml")
    parser.add_argument(
        "--pafInput",
        action="store_true",
        help="'cigarsFile' arugment is in PAF format, rather than lastz cigars."
    )
    parser.add_argument(
        "--usePafSecondaries",
        action="store_true",
        help=
        "use the secondary alignments from the PAF input. They are ignored by default."
    )
    parser.add_argument("--singleCopySpecies",
                        type=str,
                        help="Filter out all self-alignments in given species")
    parser.add_argument(
        "--barMaskFilter",
        type=int,
        default=None,
        help=
        "BAR's POA aligner will ignore softmasked regions greater than this length. (overrides partialOrderAlignmentMaskFilter in config)"
    )
    parser.add_argument(
        "--pafMaskFilter",
        type=int,
        default=None,
        help=
        "softmasked (query) regions greather than this length will be removed from the input PAF before it is processed"
    )
    parser.add_argument(
        "--outVG",
        action="store_true",
        help="export pangenome graph in VG (.vg) in addition to HAL")
    parser.add_argument(
        "--outGFA",
        action="store_true",
        help="export pangenome grpah in GFA (.gfa.gz) in addition to HAL")
    parser.add_argument(
        "--batch",
        action="store_true",
        help=
        "Launch batch of alignments. Input seqfile is expected to be chromfile as generated by cactus-graphmap-slit"
    )
    parser.add_argument(
        "--stagger",
        type=int,
        help=
        "Stagger alignment jobs in batch mode by this many seconds (to avoid starting all at once)",
        default=0)
    parser.add_argument(
        "--reference",
        type=str,
        help=
        "Ensure that given genome is acyclic by deleting all paralogy edges in postprocessing, also do not mask its PAF mappings"
    )

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment. Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output. If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)
    parser.add_argument(
        "--nonCactusInput",
        action="store_true",
        help=
        "Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars"
    )
    parser.add_argument("--database",
                        choices=["kyoto_tycoon", "redis"],
                        help="The type of database",
                        default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Both override lists must be given together and have equal length.
    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError(
                'same number of values must be passed to --pathOverrides and --pathOverrideNames'
            )

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected. Cactus requires at least 2')

    options.buildHal = True
    options.buildFasta = True

    if options.outHal.startswith('s3://'):
        if not has_s3:
            raise RuntimeError(
                "S3 support requires toil to be installed with [aws]")
        # write a little something to the bucket now to catch any glaring problems asap
        test_file = os.path.join(getTempDirectory(), 'check')
        with open(test_file, 'w') as test_o:
            test_o.write("\n")
        region = get_aws_region(
            options.jobStore) if options.jobStore.startswith('aws:') else None
        write_s3(test_file,
                 options.outHal if options.outHal.endswith('.hal') else
                 os.path.join(options.outHal, 'test'),
                 region=region)
        options.checkpointInfo = (get_aws_region(options.jobStore),
                                  options.outHal)
    else:
        options.checkpointInfo = None

    if options.batch:
        # the output hal is a directory, make sure it's there
        if not os.path.isdir(options.outHal):
            os.makedirs(options.outHal)
        assert len(options.cigarsFile) == 0
    else:
        assert len(options.cigarsFile) > 0

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # We set which type of unique ids to expect.  Numeric (from cactus-blast) or Eventname (cactus-refmap or cactus-grpahmap)
    # This is a bit ugly, since we don't have a good way to differentiate refmap from blast, and use --pangenome as a proxy
    # But I don't think there's a real use case yet of making a separate parameter
    #
    # The environment value is kept in its own local: the previous code
    # stored it on `options` and then read an unbound name `eventName`,
    # raising NameError whenever the variable was set.
    eventName = os.environ.get('CACTUS_EVENT_NAME_AS_UNIQUE_ID')
    if eventName is not None:
        # An empty string or '0' disables the feature; anything else enables it.
        options.eventNameAsID = bool(eventName) and eventName != '0'
    else:
        options.eventNameAsID = options.pangenome or options.pafInput
    os.environ['CACTUS_EVENT_NAME_AS_UNIQUE_ID'] = str(
        int(options.eventNameAsID))

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            align_jobs = make_batch_align_jobs(options, toil)
            results_dict = toil.start(
                Job.wrapJobFn(run_batch_align_jobs, align_jobs))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement
        if not options.outHal.startswith('s3://'):
            if options.batch:
                # one set of outputs per chromosome, named after the chromosome
                for chrom, results in results_dict.items():
                    toil.exportFile(
                        results[0],
                        makeURL(
                            os.path.join(options.outHal,
                                         '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(
                            results[1],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(
                            results[2],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.gfa.gz'.format(chrom))))
            else:
                assert len(results_dict) == 1 and None in results_dict
                halID, vgID, gfaID = results_dict[None][0], results_dict[None][
                    1], results_dict[None][2]
                # export the hal
                toil.exportFile(halID, makeURL(options.outHal))
                # export the vg
                if options.outVG:
                    toil.exportFile(
                        vgID,
                        makeURL(os.path.splitext(options.outHal)[0] + '.vg'))
                if options.outGFA:
                    toil.exportFile(
                        gfaID,
                        makeURL(
                            os.path.splitext(options.outHal)[0] + '.gfa.gz'))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
def make_align_job(options, toil):
    """Import all inputs and build the job that runs ``run_cactus_align``.

    Mutates ``options`` (``cactusDir``, and possibly ``seqFile`` and
    ``root``), builds the project/experiment files, imports the sequences,
    alignment files and config through ``toil`` and applies the
    command-line configuration overrides.

    :param options: parsed command-line options.
    :param toil: the running Toil context used for ``importFile``.
    :return: the wrapped ``run_cactus_align`` job (not yet started).
    :raises RuntimeError: if ``--reference`` is not a leaf of the tree, or
        ``--pafMaskFilter`` is given without ``--pafInput``.
    """
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    # default the alignment root to the root of the (labelled) tree
    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.reference:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.reference not in leaves:
            raise RuntimeError(
                "Genome specified with --reference, {}, not found in tree leaves"
                .format(options.reference))

    if options.pafMaskFilter and not options.pafInput:
        raise RuntimeError("--pafMaskFilter can only be run with --pafInput")

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            # an explicitly listed file with this suffix wins outright
            if suffix and input_path.endswith(suffix):
                return input_path
            # otherwise track the entry whose basename prefixes the current base
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except Exception:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            # (Exception rather than a bare except, so that
            # KeyboardInterrupt/SystemExit are not swallowed here.)
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except Exception:
            # The secondary alignments are optional: a failed import leaves
            # the ID as None.  Interrupts and exits still propagate.
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              referenceEvent=options.reference,
                              pafMaskFilter=options.pafMaskFilter)
    return align_job
def nextChainableJobGraph(jobGraph, jobStore):
    """Returns the next chainable jobGraph after this jobGraph if one
    exists, or None if the chain must terminate.

    The chain terminates when the current jobGraph has an empty stack,
    unfinished services or a checkpoint; when it has more than one
    successor; when the single successor needs more memory, cores or disk
    than the current jobGraph, differs in preemptability or has several
    predecessors; or when the successor is itself a checkpoint job.

    :param jobGraph: the jobGraph that has just been run.
    :param jobStore: job store used to load the successor (``load``).
    :return: the successor jobGraph loaded from ``jobStore``, or None.
    """
    # Identity test: None is a singleton, and `!= None` would defer to any
    # custom __ne__ defined on the checkpoint value.
    hasCheckpoint = jobGraph.checkpoint is not None

    #If no more jobs to run or services not finished, quit
    if len(jobGraph.stack) == 0 or len(
            jobGraph.services) > 0 or hasCheckpoint:
        logger.debug(
            "Stopping running chain of jobs: length of stack: %s, services: %s, checkpoint: %s",
            len(jobGraph.stack), len(jobGraph.services), hasCheckpoint)
        return None

    #Get the next set of jobs to run
    jobs = jobGraph.stack[-1]
    assert len(jobs) > 0

    #If there are 2 or more jobs to run in parallel we quit
    if len(jobs) >= 2:
        logger.debug(
            "No more jobs can run in series by this worker,"
            " it's got %i children",
            len(jobs) - 1)
        return None

    #We check the requirements of the jobGraph to see if we can run it
    #within the current worker
    successorJobNode = jobs[0]
    if successorJobNode.memory > jobGraph.memory:
        logger.debug("We need more memory for the next job, so finishing")
        return None
    if successorJobNode.cores > jobGraph.cores:
        logger.debug("We need more cores for the next job, so finishing")
        return None
    if successorJobNode.disk > jobGraph.disk:
        logger.debug("We need more disk for the next job, so finishing")
        return None
    if successorJobNode.preemptable != jobGraph.preemptable:
        logger.debug(
            "Preemptability is different for the next job, returning to the leader"
        )
        return None
    if successorJobNode.predecessorNumber > 1:
        logger.debug(
            "The jobGraph has multiple predecessors, we must return to the leader."
        )
        return None

    # Load the successor jobGraph
    successorJobGraph = jobStore.load(successorJobNode.jobStoreID)

    # Somewhat ugly, but check if job is a checkpoint job and quit if
    # so
    if successorJobGraph.command.startswith("_toil "):
        #Load the job
        successorJob = Job._loadJob(successorJobGraph.command, jobStore)

        # Check it is not a checkpoint
        if successorJob.checkpoint:
            logger.debug("Next job is checkpoint, so finishing")
            return None

    # Made it through! This job is chainable.
    return successorJobGraph
def __init__(self, daner_file):
    """Create the job with fixed requirements and remember ``daner_file``.

    :param daner_file: value stored on the instance as ``self.daner_file``.
    """
    requirements = {"memory": "100M", "cores": 1, "disk": "100M"}
    Job.__init__(self, **requirements)
    self.daner_file = daner_file
def map_main(context, options):
    """
    Command-line driver for mapping: validate the options, import the
    indexes and reads, then start (or restart) the Toil workflow and log
    how long the whole pipeline took.
    """
    validate_map_options(context, options)

    # Mark when we start the pipeline
    pipeline_started = timeit.default_timer()

    with context.get_toil(options.jobStore) as toil:
        if toil.options.restart:
            toil.restart()
        else:
            importer = AsyncImporter(toil)

            # Upload each index we have, in a fixed order.  The LCP file
            # sits next to the GCSA index and is loaded together with it.
            index_options = (
                ('xg', 'xg_index'),
                ('gcsa', 'gcsa_index'),
                ('gbwt', 'gbwt_index'),
                ('distance', 'distance_index'),
                ('minimizer', 'minimizer_index'),
                ('snarls', 'snarls_index'),
                ('id_ranges', 'id_ranges'),
            )
            indexes = {}
            for index_type, option_name in index_options:
                location = getattr(options, option_name)
                if location is None:
                    continue
                indexes[index_type] = importer.load(location)
                if index_type == 'gcsa':
                    indexes['lcp'] = importer.load(location + ".lcp")

            # Upload other local files to the remote IO Store
            if options.fastq:
                inputReadsFileIDs = [
                    importer.load(sample_reads)
                    for sample_reads in options.fastq
                ]
            elif options.gam_input_reads:
                inputReadsFileIDs = [importer.load(options.gam_input_reads)]
            else:
                assert options.bam_input_reads
                inputReadsFileIDs = [importer.load(options.bam_input_reads)]

            importer.wait()

            # Make a root job
            root_job = Job.wrapJobFn(
                run_mapping,
                context,
                options.fastq,
                options.gam_input_reads,
                options.bam_input_reads,
                options.sample_name,
                options.interleaved,
                options.mapper,
                importer.resolve(indexes),
                reads_file_ids=importer.resolve(inputReadsFileIDs),
                bam_output=options.bam_output,
                surject=options.surject,
                validate=options.validate,
                cores=context.config.misc_cores,
                memory=context.config.misc_mem,
                disk=context.config.misc_disk)

            # Init the outstore
            init_job = Job.wrapJobFn(run_write_info_to_outstore,
                                     context,
                                     sys.argv,
                                     memory=context.config.misc_mem,
                                     disk=context.config.misc_disk)
            init_job.addFollowOn(root_job)

            # Run the job and store the returned list of output files to download
            toil.start(init_job)

    # How long did it take to run the entire pipeline, in seconds?
    run_time_pipeline = timeit.default_timer() - pipeline_started

    logger.info(
        "All jobs completed successfully. Pipeline took {} seconds.".format(
            run_time_pipeline))
def run_mapping(job,
                context,
                fastq,
                gam_input_reads,
                bam_input_reads,
                sample_name,
                interleaved,
                mapper,
                indexes,
                reads_file_ids=None,
                reads_chunk_ids=None,
                bam_output=False,
                surject=False,
                gbwt_penalty=None,
                validate=False):
    """
    Chunk the reads if needed, then schedule the alignment of the chunks.

    Exactly one of fastq, gam_input_reads and bam_input_reads must be
    truthy; it says what kind of data the given file IDs refer to.

    Exactly one of reads_file_ids and reads_chunk_ids must be given.
    reads_file_ids lists the IDs of un-chunked read files, which are chunked
    here by a child job.  reads_chunk_ids holds, per read file, the chunk
    IDs as produced by run_split_reads_if_needed.

    indexes maps an index type ('xg', 'gcsa', 'lcp', 'id_ranges', 'gbwt',
    'minimizer', 'distance', 'snarls') to its file ID.  Some indexes are
    optional extras that change mapping behaviour, others are mandatory for
    particular mappers.

    mapper is one of 'map', 'mpmap' or 'gaffe'.  'map' and 'mpmap' need the
    'gcsa' and 'lcp' indexes; 'gaffe' needs 'gbwt', 'minimizer' and
    'distance'.  Every mapper needs 'xg'.

    bam_output requests BAM output and surject requests surjection onto
    paths.

    When a 'gbwt' index is present and gbwt_penalty is given, it replaces
    the default recombination penalty.

    Returns the promised result of the alignment job: output gams, one per
    chromosome, the total mapping time (excluding toil-vg overhead such as
    transferring and splitting files), and output BAMs, one per chromosome,
    if computed.
    """
    # Make sure we have exactly one type of input
    assert sum(map(bool, (fastq, gam_input_reads, bam_input_reads))) == 1

    # Make sure we have exactly one kind of file IDs
    assert sum(map(bool, (reads_file_ids, reads_chunk_ids))) == 1

    # Requirements shared by the helper jobs scheduled below.
    misc_resources = dict(cores=context.config.misc_cores,
                          memory=context.config.misc_mem,
                          disk=context.config.misc_disk)

    # If the reads are not pre-chunked for us, we have to chunk them.
    needs_chunking = reads_chunk_ids is None
    if needs_chunking:
        splitter = job.addChildJobFn(run_split_reads_if_needed, context,
                                     fastq, gam_input_reads, bam_input_reads,
                                     reads_file_ids, **misc_resources)
        reads_chunk_ids = splitter.rv()

    # We need a job to do the alignment
    aligner = Job.wrapJobFn(run_whole_alignment,
                            context,
                            fastq,
                            gam_input_reads,
                            bam_input_reads,
                            sample_name,
                            interleaved,
                            mapper,
                            indexes,
                            reads_chunk_ids,
                            bam_output=bam_output,
                            surject=surject,
                            gbwt_penalty=gbwt_penalty,
                            validate=validate,
                            **misc_resources)

    if needs_chunking:
        # Alignment must happen after chunking
        splitter.addFollowOn(aligner)
    else:
        # Alignment can happen now
        job.addChild(aligner)

    return aligner.rv()
def __init__(self, inputFileID, failFileID):
    """Create the job with fixed requirements and store both file IDs.

    :param inputFileID: stored as ``self.inputFileID``.
    :param failFileID: stored as ``self.failFileID``.
    """
    resources = dict(memory=100000, cores=1, disk="1M")
    Job.__init__(self, **resources)
    self.inputFileID, self.failFileID = inputFileID, failFileID
def main_batch():
    """
    Command-line entry point that aligns every chromosome listed in a chroms file.

    This is a bit like cactus-align --batch except it will use toil-in-toil to
    assign each chromosome to a machine.

    pros: much less chance of a problem with one chromosome affecting anything else
          more forgiving for inexact resource specs
          could be ported to Terra
    cons: less efficient use of resources

    :raises RuntimeError: if an --alignCoresOverrides entry is not of the form
        ``chrom,cores`` or a non-empty chroms file line does not have exactly
        three whitespace-separated columns.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("chromFile", help="chroms file")
    parser.add_argument("outHal", type=str, help="Output directory (can be s3://)")
    parser.add_argument("--alignOptions", type=str,
                        help="Options to pass through to cactus-align (don't forget to wrap in quotes)")
    parser.add_argument("--alignCores", type=int, help="Number of cores per align job")
    parser.add_argument("--alignCoresOverrides", nargs="*",
                        help="Override align job cores for a chromosome. Space-separated list of chrom,cores pairse epxected")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))

    options = parser.parse_args()

    # These attributes are not exposed on this command line; set them to fixed
    # values before the shared setup helpers below are called.
    options.containerImage = None
    options.binariesMode = None
    options.root = None
    options.latest = None
    options.database = "kyoto_tycoon"

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # Turn the overrides into a dict
    cores_overrides = {}
    if options.alignCoresOverrides:
        for o in options.alignCoresOverrides:
            try:
                chrom, cores = o.split(',')
                cores_overrides[chrom] = int(cores)
            except ValueError:
                # Both a wrong number of comma-separated fields and a
                # non-integer core count surface as ValueError. Anything else
                # (e.g. KeyboardInterrupt) must not be masked as a parse error.
                raise RuntimeError(
                    "Error parsing alignCoresOverrides \"{}\"".format(o))
    options.alignCoresOverrides = cores_overrides

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            config_id = toil.importFile(makeURL(options.configFile))

            # load the chromfile into memory
            chrom_dict = {}
            with open(options.chromFile, 'r') as chrom_file:
                for line in chrom_file:
                    toks = line.strip().split()
                    if not toks:
                        # Blank lines are allowed and ignored.
                        continue
                    if len(toks) != 3:
                        # Explicit check rather than assert, so malformed
                        # input is still rejected when running with -O.
                        raise RuntimeError(
                            "Error parsing chromFile line \"{}\": expected 3 columns, got {}".format(
                                line.strip(), len(toks)))
                    chrom, seqfile, alnFile = toks[0], toks[1], toks[2]
                    chrom_dict[chrom] = toil.importFile(makeURL(seqfile)), toil.importFile(makeURL(alnFile))

            results_dict = toil.start(Job.wrapJobFn(align_toil_batch, chrom_dict, config_id, options))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement
        if not options.outHal.startswith('s3://'):
            # NOTE(review): options.batch, options.outVG and options.outGFA are
            # not added to the parser in this function; confirm they are
            # provided by addCactusWorkflowOptions or the Toil options.
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(results[0], makeURL(os.path.join(options.outHal, '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(results[1], makeURL(os.path.join(options.outHal, '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(results[2], makeURL(os.path.join(options.outHal, '{}.gfa.gz'.format(chrom))))
                    toil.exportFile(results[3], makeURL(os.path.join(options.outHal, '{}.hal.log'.format(chrom))))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align-batch has finished after {} seconds".format(run_time))
# NOTE(review): the two statements below are the tail of a function whose
# ``def`` line lies outside this chunk; they are reproduced unchanged.
    # Read the output file out of the job's file store to a path under
    # outputs_dir named "sample_<output_num>_<output_filename>", then hand the
    # file ID back to the caller.
    job.fileStore.readGlobalFile(output_file, userPath=os.path.join(outputs_dir, "sample_" + output_num + "_" + output_filename))
    return output_file


# Script entry point: build one root job with one runQC child per yml input
# file, all sharing the same imported cwl file, and run the workflow.
if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./toilWorkflowRun")
    options.logLevel = "INFO"
    options.clean = "always"
    with Toil(options) as toil:

        # specify the folder where the cwl and yml files live
        inputs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cwlExampleFiles")
        # specify where you wish the outputs to be written
        outputs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cwlExampleFiles")

        # Root job that every per-input job below is attached to as a child.
        job0 = Job.wrapJobFn(initialize_jobs)

        # The cwl file is imported once and its file ID is reused by every child job.
        cwl_filename = "hello.cwl"
        cwl_file = toil.importFile("file://" + os.path.abspath(os.path.join(inputs_dir, cwl_filename)))

        # add list of yml config inputs here or import and construct from file
        yml_files = ["hello1.yml", "hello2.yml", "hello3.yml"]
        # 1-based counter, passed to runQC as the string output_num.
        i = 0
        for yml in yml_files:
            i = i + 1
            yml_file = toil.importFile("file://" + os.path.abspath(os.path.join(inputs_dir, yml)))
            yml_filename = yml
            job = Job.wrapJobFn(runQC, cwl_file, cwl_filename, yml_file, yml_filename, outputs_dir, output_num=str(i))
            job0.addChild(job)

        toil.start(job0)
def workerScript(jobStore, config, jobName, jobStoreID, redirectOutputToLogFile=True):
    """
    Worker process script, runs a job.

    Loads the job graph identified by jobStoreID, runs its command and then any
    chainable successor jobs in this same process, and finally reports logs and
    stats back through the job store.

    :param jobStore: The job store to load the job from and write results to
    :param config: The workflow configuration (log level, stats, clean-up policy, ...)
    :param str jobName: The "job name" (a user friendly name) of the job to be run
    :param str jobStoreID: The job store ID of the job to be run
    :param bool redirectOutputToLogFile: If True (and the config does not disable
           worker output capture), point file descriptors 1 and 2 at a worker
           log file for the duration of the run

    :return int: 1 if a job failed, or 0 if all jobs succeeded
    """

    logging.basicConfig()
    setLogLevel(config.logLevel)

    ##########################################
    #Create the worker killer, if requested
    ##########################################

    logFileByteReportLimit = config.maxLogFileSize

    if config.badWorker > 0 and random.random() < config.badWorker:
        # We need to kill the process we are currently in, to simulate worker
        # failure. We don't want to just send SIGKILL, because we can't tell
        # that from a legitimate OOM on our CI runner. We're going to send
        # SIGUSR1 so our terminations are distinctive, and then SIGKILL if that
        # didn't stick. We definitely don't want to do this from *within* the
        # process we are trying to kill, so we fork off. TODO: We can still
        # leave the killing code running after the main Toil flow is done, but
        # since it's now in a process instead of a thread, the main Python
        # process won't wait around for its timeout to expire. I think this is
        # better than the old thread-based way where all of Toil would wait
        # around to be killed.

        killTarget = os.getpid()
        sleepTime = config.badWorkerFailInterval * random.random()
        if os.fork() == 0:
            # We are the child
            # Let the parent run some amount of time
            time.sleep(sleepTime)
            # Kill it gently
            os.kill(killTarget, signal.SIGUSR1)
            # Wait for that to stick
            time.sleep(0.01)
            try:
                # Kill it harder. Hope the PID hasn't already been reused.
                # If we succeeded the first time, this will OSError
                os.kill(killTarget, signal.SIGKILL)
            except OSError:
                pass
            # Exit without doing any of Toil's cleanup
            os._exit(0)

        # We don't need to reap the child. Either it kills us, or we finish
        # before it does. Either way, init will have to clean it up for us.

    ##########################################
    #Load the environment for the jobGraph
    ##########################################

    #First load the environment for the jobGraph.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = safeUnpickleFromStream(fileHandle)
    env_blacklist = {
        "TMPDIR",
        "TMP",
        "HOSTNAME",
        "HOSTTYPE",
        "HOME",
        "LOGNAME",
        "USER",
        "DISPLAY",
        "JAVA_HOME"
    }
    for i in environment:
        if i == "PATH":
            # Handle path specially. Sometimes e.g. leader may not include
            # /bin, but the Toil appliance needs it.
            if i in os.environ and os.environ[i] != '':
                # Use the provided PATH and then the local system's PATH
                os.environ[i] = environment[i] + ':' + os.environ[i]
            else:
                # Use the provided PATH only
                os.environ[i] = environment[i]
        elif i not in env_blacklist:
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################

    # Dir to put all this worker's temp files in.
    localWorkerTempDir = tempfile.mkdtemp(dir=toilWorkflowDir)
    os.chmod(localWorkerTempDir, 0o755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>

    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    # Do we even want to redirect output? Let the config make us not do it.
    redirectOutputToLogFile = redirectOutputToLogFile and not config.disableWorkerOutputCapture

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")

    if redirectOutputToLogFile:
        # Announce that we are redirecting logging, and where it will now go.
        # This is important if we are trying to manually trace a faulty worker invocation.
        logger.info("Redirecting logging to %s", tempWorkerLogPath)
        sys.stdout.flush()
        sys.stderr.flush()

        # Save the original stdout and stderr (by opening new file descriptors
        # to the same files)
        origStdOut = os.dup(1)
        origStdErr = os.dup(2)

        # Open the file to send stdout/stderr to.
        logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

        # Replace standard output with a descriptor for the log file
        os.dup2(logFh, 1)

        # Replace standard error with a descriptor for the log file
        os.dup2(logFh, 2)

        # Since we only opened the file once, all the descriptors duped from
        # the original will share offset information, and won't clobber each
        # others' writes. See <http://stackoverflow.com/a/5284108/402891>. This
        # shouldn't matter, since O_APPEND seeks to the end of the file before
        # every write, but maybe there's something odd going on...

        # Close the descriptor we used to open the file
        os.close(logFh)

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    jobAttemptFailed = False
    statsDict = MagicExpando()
    statsDict.jobs = []
    statsDict.workers.logsToMaster = []
    # Until a file store exists there is nothing asynchronous to wait for.
    blockFn = lambda: True
    listOfJobs = [jobName]
    job = None
    try:

        #Put a message at the top of the log, just to make sure it's working.
        logger.info("---TOIL WORKER OUTPUT LOG---")
        sys.stdout.flush()

        logProcessContext(config)

        ##########################################
        #Connect to the deferred function system
        ##########################################
        deferredFunctionManager = DeferredFunctionManager(toilWorkflowDir)

        ##########################################
        #Load the jobGraph
        ##########################################

        jobGraph = jobStore.load(jobStoreID)
        listOfJobs[0] = str(jobGraph)
        logger.debug("Parsed job wrapper")

        ##########################################
        #Cleanup from any earlier invocation of the jobGraph
        ##########################################

        if jobGraph.command is None:
            logger.debug("Wrapper has no user job to run.")

            # Cleanup jobs already finished
            def dropFinished(jobs):
                # Keep only jobs that still exist in the job store, then
                # discard any level that has become empty.
                stillPresent = [[y for y in x if jobStore.exists(y.jobStoreID)] for x in jobs]
                return [z for z in stillPresent if len(z) > 0]

            jobGraph.stack = dropFinished(jobGraph.stack)
            jobGraph.services = dropFinished(jobGraph.services)
            logger.debug("Cleaned up any references to completed successor jobs")

        #This cleans the old log file which may
        #have been left if the job is being retried after a job failure.
        oldLogFile = jobGraph.logJobStoreFileID
        if oldLogFile is not None:
            jobGraph.logJobStoreFileID = None
            jobStore.update(jobGraph) #Update first, before deleting any files
            jobStore.deleteFile(oldLogFile)

        ##########################################
        # If a checkpoint exists, restart from the checkpoint
        ##########################################

        # The job is a checkpoint, and is being restarted after previously completing
        if jobGraph.checkpoint is not None:
            logger.debug("Job is a checkpoint")
            # If the checkpoint still has extant jobs in its
            # (flattened) stack and services, its subtree didn't
            # complete properly. We handle the restart of the
            # checkpoint here, removing its previous subtree.
            if len([i for l in jobGraph.stack for i in l]) > 0 or len(jobGraph.services) > 0:
                logger.debug("Checkpoint has failed.")
                # Reduce the retry count
                assert jobGraph.remainingRetryCount >= 0
                jobGraph.remainingRetryCount = max(0, jobGraph.remainingRetryCount - 1)
                jobGraph.restartCheckpoint(jobStore)
            # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
            # because of the job being a checkpoint
            else:
                logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
                #Delete any remnant files. All existence checks are done
                #before the first deletion, as before.
                remnantFiles = [fileID for fileID in jobGraph.checkpointFilesToDelete
                                if jobStore.fileExists(fileID)]
                for fileID in remnantFiles:
                    jobStore.deleteFile(fileID)

        ##########################################
        #Setup the stats, if requested
        ##########################################

        if config.stats:
            startClock = getTotalCpuTime()

        startTime = time.time()
        while True:
            ##########################################
            #Run the jobGraph, if there is one
            ##########################################

            if jobGraph.command is not None:
                assert jobGraph.command.startswith("_toil ")
                logger.debug("Got a command to run: %s" % jobGraph.command)
                #Load the job
                job = Job._loadJob(jobGraph.command, jobStore)
                # If it is a checkpoint job, save the command
                if job.checkpoint:
                    jobGraph.checkpoint = jobGraph.command

                # Create a fileStore object for the job
                fileStore = AbstractFileStore.createFileStore(jobStore, jobGraph, localWorkerTempDir, blockFn,
                                                              caching=not config.disableCaching)
                with job._executor(jobGraph=jobGraph,
                                   stats=statsDict if config.stats else None,
                                   fileStore=fileStore):
                    with deferredFunctionManager.open() as defer:
                        with fileStore.open(job):
                            # Get the next block function to wait on committing this job
                            blockFn = fileStore.waitForCommit

                            job._runner(jobGraph=jobGraph, jobStore=jobStore, fileStore=fileStore, defer=defer)

                            # When the job succeeds, start committing files immediately.
                            fileStore.startCommit(jobState=False)

                # Accumulate messages from this job & any subsequent chained jobs
                statsDict.workers.logsToMaster += fileStore.loggingMessages

            else:
                #The command may be none, in which case
                #the jobGraph is either a shell ready to be deleted or has
                #been scheduled after a failure to cleanup
                logger.debug("No user job to run, so finishing")
                break

            if AbstractFileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobGraph within the worker
            ##########################################
            successorJobGraph = nextChainableJobGraph(jobGraph, jobStore)
            if successorJobGraph is None or config.disableChaining:
                # Can't chain any more jobs.
                # TODO: why don't we commit the last job's file store? Won't
                # its async uploads never necessarily finish?
                # If we do call startCommit here it messes with the job
                # itself and Toil thinks the job needs to run again.
                break

            ##########################################
            #We have a single successor job that is not a checkpoint job.
            #We transplant the successor jobGraph command and stack
            #into the current jobGraph object so that it can be run
            #as if it were a command that were part of the current jobGraph.
            #We can then delete the successor jobGraph in the jobStore, as it is
            #wholly incorporated into the current jobGraph.
            ##########################################

            # add the successor to the list of jobs run
            listOfJobs.append(str(successorJobGraph))

            #Clone the jobGraph and its stack
            jobGraph = copy.deepcopy(jobGraph)

            #Remove the successor jobGraph
            jobGraph.stack.pop()

            #Transplant the command and stack to the current jobGraph
            jobGraph.command = successorJobGraph.command
            jobGraph.stack += successorJobGraph.stack
            # include some attributes for better identification of chained jobs in
            # logging output
            jobGraph.unitName = successorJobGraph.unitName
            jobGraph.jobName = successorJobGraph.jobName
            assert jobGraph.memory >= successorJobGraph.memory
            assert jobGraph.cores >= successorJobGraph.cores

            #Build a fileStore to update the job
            fileStore = AbstractFileStore.createFileStore(jobStore, jobGraph, localWorkerTempDir, blockFn,
                                                          caching=not config.disableCaching)

            #Update blockFn
            blockFn = fileStore.waitForCommit

            #Add successorJobGraph to those to be deleted
            fileStore.jobsToDelete.add(successorJobGraph.jobStoreID)

            #This will update the job once the previous job is done
            fileStore.startCommit(jobState=True)

            #Clone the jobGraph and its stack again, so that updates to it do
            #not interfere with this update
            jobGraph = copy.deepcopy(jobGraph)

            logger.debug("Starting the next job")

        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)

        # log the worker log path here so that if the file is truncated the path can still be found
        if redirectOutputToLogFile:
            logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", localWorkerTempDir)

        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)

    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except: #Case that something goes wrong in worker
        # Deliberately catches everything: whatever stopped the job, the
        # failure must be recorded and the clean-up below must still run.
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname())
        AbstractFileStore._terminateEvent.set()

    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ##########################################

    blockFn()

    ##########################################
    #All the asynchronous worker/update threads must be finished now,
    #so safe to test if they completed okay
    ##########################################

    if AbstractFileStore._terminateEvent.isSet():
        jobGraph = jobStore.load(jobStoreID)
        jobGraph.setupJobAfterFailure(config)
        jobAttemptFailed = True
        if job and jobGraph.remainingRetryCount == 0:
            job._succeeded = False

    ##########################################
    #Cleanup
    ##########################################

    # Close the worker logging
    # Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    if redirectOutputToLogFile:
        # Flush at the OS level
        os.fsync(1)
        os.fsync(2)

        # Close redirected stdout and replace with the original standard output.
        os.dup2(origStdOut, 1)

        # Close redirected stderr and replace with the original standard error.
        os.dup2(origStdErr, 2)

        # sys.stdout and sys.stderr don't need to be modified at all. We don't
        # need to call redirectLoggerStreamHandlers since they still log to
        # sys.stderr

        # Close our extra handles to the original standard output and standard
        # error streams, so we don't leak file handles.
        os.close(origStdOut)
        os.close(origStdErr)

    # Now our file handles are in exactly the state they were in before.

    # Copy back the log file to the global dir, if needed.
    # Note that we work with bytes instead of characters so we can seek
    # relative to the end (since Python won't decode Unicode backward, or even
    # interpret seek offsets in characters for us). TODO: We may get invalid or
    # just different Unicode by breaking up a character at the boundary!
    if jobAttemptFailed and redirectOutputToLogFile:
        jobGraph.logJobStoreFileID = jobStore.getEmptyFileStoreID(jobGraph.jobStoreID, cleanup=True)
        jobGraph.chainedJobs = listOfJobs
        with jobStore.updateFileStream(jobGraph.logJobStoreFileID) as w:
            # Dump the possibly-invalid-Unicode bytes into the log file
            w.write(_readTruncatedWorkerLog(tempWorkerLogPath, logFileByteReportLimit)) # TODO load file using a buffer
        jobStore.update(jobGraph)

    elif ((debugging or (config.writeLogsFromAllJobs and not jobName.startswith(CWL_INTERNAL_JOBS)))
          and redirectOutputToLogFile):  # write log messages
        logBytes = _readTruncatedWorkerLog(tempWorkerLogPath, logFileByteReportLimit)
        # Make sure lines are Unicode so they can be JSON serialized as part of the dict.
        # We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters.
        # 'ignore' is the codec error handler that drops undecodable bytes;
        # 'skip' is not a registered handler and raises LookupError as soon as
        # an invalid byte is met.
        logMessages = [line.decode('utf-8', 'ignore') for line in logBytes.splitlines()]
        statsDict.logs.names = listOfJobs
        statsDict.logs.messages = logMessages

    if (debugging or config.stats or statsDict.workers.logsToMaster) and not jobAttemptFailed:  # We have stats/logging to report back
        if USING_PYTHON2:
            jobStore.writeStatsAndLogging(json.dumps(statsDict, ensure_ascii=True))
        else:
            jobStore.writeStatsAndLogging(json.dumps(statsDict, ensure_ascii=True).encode())

    #Remove the temp dir
    cleanUp = config.cleanWorkDir
    if cleanUp == 'always' or (cleanUp == 'onSuccess' and not jobAttemptFailed) or (cleanUp == 'onError' and jobAttemptFailed):
        shutil.rmtree(localWorkerTempDir)

    #This must happen after the log file is done with, else there is no place to put the log
    if (not jobAttemptFailed) and jobGraph.command is None and len(jobGraph.stack) == 0 and len(jobGraph.services) == 0:
        # We can now safely get rid of the jobGraph
        jobStore.delete(jobGraph.jobStoreID)

    return 1 if jobAttemptFailed else 0


def _readTruncatedWorkerLog(logPath, byteLimit):
    """
    Read the worker log as bytes, keeping at most abs(byteLimit) bytes of it.

    :param str logPath: Path of the worker log file
    :param int byteLimit: Positive keeps the last byteLimit bytes, negative
           keeps the first -byteLimit bytes, zero disables truncation.
           NOTE(review): keeping the *head* for negative values follows the
           original "seek to first tooBig bytes" comment; confirm against the
           documented meaning of maxLogFileSize.

    :return bytes: The (possibly truncated) log contents
    """
    with open(logPath, 'rb') as logFile:
        if byteLimit < 0:
            # A negative offset from the start of the file is not a valid
            # seek; bounding the read is what actually keeps the head.
            return logFile.read(-byteLimit)
        if byteLimit > 0 and os.path.getsize(logPath) > byteLimit:
            # Seek relative to the end so only the tail is read.
            logFile.seek(-byteLimit, 2)
        return logFile.read()
def main(options=None):
    """
    Run the sort example workflow.

    :param options: Pre-built options object. When falsey, options are parsed
           from the command line instead. A supplied object must carry the
           attributes read below (outputFile, overwriteOutput, fileToSort, N,
           numLines, lineLength, downCheckpoints) as well as the Toil options.

    :raises RuntimeError: if the file to sort does not exist or N is not positive
    """

    def parseBool(text):
        # argparse passes the raw string; without a converter any non-empty
        # value, including "False", would be truthy.
        lowered = text.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        # argparse turns ValueError into a normal usage error.
        raise ValueError("not a boolean: %r" % text)

    if not options:
        # deal with command line arguments
        parser = ArgumentParser()
        Job.Runner.addToilOptions(parser)
        parser.add_argument('--numLines', default=defaultLines, help='Number of lines in file to sort.', type=int)
        parser.add_argument('--lineLength', default=defaultLineLen, help='Length of lines in file to sort.', type=int)
        parser.add_argument("--fileToSort", help="The file you wish to sort")
        parser.add_argument("--outputFile", help="Where the sorted output will go")
        parser.add_argument("--overwriteOutput", help="Write over the output file if it already exists.",
                            default=True, type=parseBool)
        parser.add_argument("--N", dest="N",
                            help="The threshold below which a serial sort function is used to sort file. "
                                 "All lines must of length less than or equal to N or program will fail",
                            default=10000)
        parser.add_argument('--downCheckpoints', action='store_true',
                            help='If this option is set, the workflow will make checkpoints on its way through'
                                 'the recursive "down" part of the sort')
        parser.add_argument("--sortMemory", dest="sortMemory",
                            help="Memory for jobs that sort chunks of the file.",
                            default=None)
        parser.add_argument("--mergeMemory", dest="mergeMemory",
                            help="Memory for jobs that collate results.",
                            default=None)

        options = parser.parse_args()

    # Fill in memory defaults for both parsed and caller-supplied options.
    if not hasattr(options, "sortMemory") or not options.sortMemory:
        options.sortMemory = sortMemory
    if not hasattr(options, "mergeMemory") or not options.mergeMemory:
        options.mergeMemory = sortMemory

    # do some input verification
    sortedFileName = options.outputFile or "sortedFile.txt"
    if not options.overwriteOutput and os.path.exists(sortedFileName):
        print("the output file {} already exists. Delete it to run the sort example again or use --overwriteOutput=True".format(sortedFileName))
        exit()

    fileName = options.fileToSort
    if options.fileToSort is None:
        # make the file ourselves
        fileName = 'fileToSort.txt'
        if os.path.exists(fileName):
            print("Sorting existing file: {}".format(fileName))
        else:
            print('No sort file specified. Generating one automatically called: {}.'.format(fileName))
            makeFileToSort(fileName=fileName, lines=options.numLines, lineLen=options.lineLength)
    else:
        if not os.path.exists(options.fileToSort):
            raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)

    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    # Now we are ready to run
    with Toil(options) as workflow:
        sortedFileURL = 'file://' + os.path.abspath(sortedFileName)
        if not workflow.options.restart:
            sortFileURL = 'file://' + os.path.abspath(fileName)
            sortFileID = workflow.importFile(sortFileURL)
            sortedFileID = workflow.start(Job.wrapJobFn(setup,
                                                        sortFileID,
                                                        int(options.N),
                                                        options.downCheckpoints,
                                                        options=options,
                                                        memory=sortMemory))
        else:
            sortedFileID = workflow.restart()
        workflow.exportFile(sortedFileID, sortedFileURL)
def testUnicodeSupport(self):
    """
    Run a workflow whose only job is printUnicodeCharacter, at debug log level.
    """
    jobStorePath = self._getTestJobStorePath()
    toilOptions = Job.Runner.getDefaultOptions(jobStorePath)
    toilOptions.logLevel = 'debug'
    toilOptions.clean = 'always'
    rootJob = Job.wrapFn(printUnicodeCharacter)
    Job.Runner.startToil(rootJob, toilOptions)