def _deleteLocallyReadFilesFn(self, readAsMutable):
    """
    Run a two-job workflow: the first job writes a file to the job store and the
    second receives that job's return value through ``_removeReadFileFn``.

    :param readAsMutable: forwarded unchanged to ``_removeReadFileFn``.
    """
    # Retries are switched off for this workflow.
    self.options.retryCount = 0
    writeJob = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True,
                             memory='10M')
    removeJob = Job.wrapJobFn(self._removeReadFileFn, writeJob.rv(),
                              readAsMutable=readAsMutable, memory='20M')
    writeJob.addChild(removeJob)
    Job.Runner.startToil(writeJob, self.options)
def runNewCheckpointIsLeafVertexTest(self, createWorkflowFn):
    """
    Exercise the check that a checkpoint job is a leaf vertex, using one valid
    workflow and three invalid ones.

    :param createWorkflowFn: function that creates a new workflow and returns a
           tuple of:

           0) the workflow root job
           1) a checkpoint job to test within the workflow
    """
    # A fresh workflow is created for every case, always before the extra
    # service/child/follow-on object is constructed.
    logger.info('Test checkpoint job that is a leaf vertex')
    workflow = createWorkflowFn()
    self.runCheckpointVertexTest(*workflow, expectedException=None)

    logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a service')
    workflow = createWorkflowFn()
    service = TrivialService("LeafTestService")
    self.runCheckpointVertexTest(*workflow,
                                 checkpointJobService=service,
                                 expectedException=JobGraphDeadlockException)

    logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a child job')
    workflow = createWorkflowFn()
    child = Job.wrapJobFn(simpleJobFn, "LeafTestChild")
    self.runCheckpointVertexTest(*workflow,
                                 checkpointJobChild=child,
                                 expectedException=JobGraphDeadlockException)

    logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a follow-on job')
    workflow = createWorkflowFn()
    followOn = Job.wrapJobFn(simpleJobFn, "LeafTestFollowOn")
    self.runCheckpointVertexTest(*workflow,
                                 checkpointJobFollowOn=followOn,
                                 expectedException=JobGraphDeadlockException)
def testCacheEjection(self):
    """
    Test that the cache always ejects the least recently created file.

    Three chained jobs (A, B and C) each write one output file to the file store.
    The three sizes are chosen so that any two files together are smaller than
    ``cacheSize`` while all three together are larger, so one file always has to
    be ejected. The sizes are shuffled on every round so that A is expected to be
    the ejected file regardless of its size. Job D records the outcome in a log
    file, which is then checked line by line.
    """
    # Make a temp directory for the test
    test_dir = self._createTempDir()
    for _ in xrange(10):
        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.logLevel = "DEBUG"
        options.cacheSize = 100000
        options.retryCount = 100
        options.badWorker = 0.5
        options.badWorkerFailInterval = 1.0
        # Create a temp file to write the test results to
        handle, logfile = tempfile.mkstemp(dir=test_dir)
        os.close(handle)
        file_sizes = [50000, 40000, 30000]
        # Randomize to (potentially) test all combinations
        random.shuffle(file_sizes)
        # Run the workflow. A, B and C do the cache operations, and D prints the
        # test status to the log file. Each job gets its OWN size: giving all
        # three file_sizes[0] would make the shuffle pointless and would break
        # the "two fit, three do not" premise described above.
        A = Job.wrapJobFn(fileTestJob, file_sizes[0])
        B = Job.wrapJobFn(fileTestJob, file_sizes[1])
        C = Job.wrapJobFn(fileTestJob, file_sizes[2])
        D = Job.wrapJobFn(fileTestCache, A.rv(), B.rv(), C.rv(), logfile)
        A.addChild(B)
        B.addChild(C)
        C.addChild(D)
        Job.Runner.startToil(A, options)
        # Assert jobs passed by reading the test results from the log file
        with open(logfile, 'r') as outfile:
            for test_status in outfile:
                assert test_status.strip() == 'True'
def testPromiseRequirementRaceStatic(self):
    """
    Look for a race condition between promised requirements and child job
    functions: the child's disk requirement is computed from the return value
    of its parent.
    """
    parent = Job.wrapJobFn(logDiskUsage, 'A', sleep=5,
                           disk=PromisedRequirement(1024))
    childDisk = PromisedRequirement(lambda x: x + 1024, parent.rv())
    child = Job.wrapJobFn(logDiskUsage, 'B', disk=childDisk)
    parent.addChild(child)
    options = self.getOptions(self._createTempDir('testFiles'))
    Job.Runner.startToil(parent, options)
def testReadCachHitFileFromJobStore(self):
    """
    Read a file from the file store for which a cached copy exists, and let the
    reading job verify that the number of links on the file is appropriate.
    """
    writer = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True)
    reader = Job.wrapJobFn(self._readFromJobStore,
                           isCachedFile=True,
                           cacheReadFile=None,
                           fsID=writer.rv())
    writer.addChild(reader)
    Job.Runner.startToil(writer, self.options)
def _testCacheMissFunction(self, cacheReadFile):
    """
    Shared implementation behind the two cache-miss tests: write a file from a
    non-local directory, then read it back in a child job.

    :param cacheReadFile: Does the read file need to be cached (T) or not (F)
    """
    nonLocalDir = self._createTempDir(purpose='nonLocalDir')
    writer = Job.wrapJobFn(self._writeFileToJobStore,
                           isLocalFile=False,
                           nonLocalDir=nonLocalDir)
    reader = Job.wrapJobFn(self._readFromJobStore,
                           isCachedFile=False,
                           cacheReadFile=cacheReadFile,
                           fsID=writer.rv())
    writer.addChild(reader)
    Job.Runner.startToil(writer, self.options)
def _deleteLocallyReadFilesFn(self, readAsMutable):
    """
    Run a write-then-remove workflow and, if it fails, check that the failure
    is the expected one.

    When ``FailedJobsException`` is raised, exactly two jobs must have failed
    and the assertion error parsed from the log file must contain the word
    'explicitly'; otherwise the test fails.

    :param readAsMutable: forwarded unchanged to ``_removeReadFileFn``.
    """
    self.options.retryCount = 0
    writeJob = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True,
                             memory='10M')
    removeJob = Job.wrapJobFn(self._removeReadFileFn, writeJob.rv(),
                              readAsMutable=readAsMutable, memory='20M')
    writeJob.addChild(removeJob)
    # NOTE(review): a run that raises nothing passes silently here - confirm
    # that this is intended.
    try:
        Job.Runner.startToil(writeJob, self.options)
    except FailedJobsException as failure:
        self.assertEqual(failure.numberOfFailedJobs, 2)
        message = self._parseAssertionError(self.options.logFile)
        if 'explicitly' in message:
            return
        self.fail('Shouldn\'t see this')
def testControlledFailedWorkerRetry(self):
    """
    Perform a couple of job store operations and then die. The restarted job
    must be tracking the values in the cache state file correctly.
    """
    nonLocalDir = self._createTempDir(purpose='nonLocalDir')
    # One retry, so the deliberately failing job gets a second attempt.
    self.options.retryCount = 1
    twoGiB = 2 * 1024 * 1024 * 1024
    failingJob = Job.wrapJobFn(self._controlledFailTestFn,
                               jobDisk=twoGiB,
                               testDir=nonLocalDir,
                               disk='2G')
    probeJob = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk='100M')
    failingJob.addChild(probeJob)
    Job.Runner.startToil(failingJob, self.options)
def testToilIsNotBroken(self):
    """
    Run a simple diamond-shaped DAG to check that no feature other than
    caching was broken.
    """
    top, left, right, bottom = [Job.wrapJobFn(self._uselessFunc)
                                for _ in range(4)]
    # top -> (left, right) -> bottom
    for middle in (left, right):
        top.addChild(middle)
    for middle in (left, right):
        middle.addChild(bottom)
    Job.Runner.startToil(top, self.options)
def test_star(self):
    """
    Test the functionality of align_rna (the STAR alignment step).

    The workflow is a chain: fetch the STAR test files, load all tool options
    from the CI parameter file, pick the "star" entry out of them, and finally
    run ``align_rna`` on the result.
    """
    univ_options = self._getTestUnivOptions()
    config_file = os.path.join(self._projectRootPath(),
                               "src/protect/test/test_inputs/ci_parameters.yaml")
    a = Job.wrapJobFn(self._get_test_star_files)
    b = Job.wrapJobFn(self._get_all_tools, config_file).encapsulate()
    c = Job.wrapJobFn(self._get_tool, b.rv(), "star")
    d = Job.wrapJobFn(align_rna, a.rv(), univ_options, c.rv()).encapsulate()
    a.addChild(b)
    b.addChild(c)
    c.addChild(d)
    Job.Runner.startToil(a, self.options)
def main(args):
    """
    Filter and validate the input GAM paths, then run the variant-calling
    workflow.

    GAMs whose path contains any non-empty word from the comma-separated
    ``options.skip`` list are dropped. Every remaining path must have at least
    three '/'-separated components and a ``.gam`` extension.

    :param args: program arguments, passed to ``parse_args``.
    :raises RuntimeError: if an input GAM path is malformed.
    :raises Exception: if the workflow reports failed jobs.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()

    # Empty words (e.g. from an empty --skip value) never match anything.
    skip_words = [word for word in options.skip.split(",") if len(word) > 0]
    options.in_gams = [gam for gam in options.in_gams
                       if not any(word in gam for word in skip_words)]

    for gam in options.in_gams:
        too_shallow = len(gam.split("/")) < 3
        if too_shallow or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")

    # Make a root job
    root_job = Job.wrapJobFn(call_variants, options,
                             cores=1, memory="2G", disk="2G")

    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job, options)
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    RealTimeLogger.stop_master()
def test(self):
    """
    Check that a toil workflow which fails once can be resumed without a
    NoSuchJobException.
    """
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.logLevel = "INFO"
    root = Job.wrapJobFn(parent)

    # First run: this one is intended to fail.
    with self.assertRaises(FailedJobsException):
        Job.Runner.startToil(root, options)

    # Second run: resume the workflow. The bug has to be detected through the
    # logging output: although the NoSuchJobException makes the worker fail,
    # the batch system code notices that the job has been deleted despite the
    # failure and avoids the failure.
    options.restart = True
    logDir = self._createTempDir()
    options.logFile = os.path.join(logDir, "log.txt")
    Job.Runner.startToil(root, options)

    with open(options.logFile) as logHandle:
        logString = logHandle.read()

    # The message we must NOT find looks like: "Batch system is reporting that
    # the jobGraph with batch system ID: 1 and jobGraph store ID: n/t/jobwbijqL
    # failed with exit value 1"
    self.assertTrue("failed with exit value" not in logString)
def main():
    """
    Parse the command line and run the sort workflow on ``--fileToSort``.

    :raises RuntimeError: if no file was given, the file does not exist, N is
            not positive, unexpected positional arguments were passed, or the
            workflow reports failed jobs.
    """
    parser = OptionParser()
    Job.Runner.addToilOptions(parser)

    parser.add_option("--fileToSort", dest="fileToSort",
                      help="The file you wish to sort")

    # The two help fragments are joined by implicit concatenation; the trailing
    # space after "is" keeps the words from running together ("isused").
    parser.add_option("--N", dest="N",
                      help="The threshold below which a serial sort function is "
                           "used to sort file. All lines must of length less than or equal to N or program will fail",
                      default=10000)

    options, args = parser.parse_args()

    if options.fileToSort is None:
        raise RuntimeError("No file to sort given")

    if not os.path.exists(options.fileToSort):
        raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)

    # N arrives as a string when given on the command line, hence the int().
    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    if len(args) != 0:
        raise RuntimeError("Unrecognised input arguments: %s" % " ".join(args))

    # Now we are ready to run
    i = Job.Runner.startToil(Job.wrapJobFn(setup, options.fileToSort, int(options.N)), options)

    if i:
        raise RuntimeError("The toil contained %i failed jobs" % i)
def main(args):
    """
    Parse the command line and launch the ADAM preprocessing workflow.

    Exactly one of ``--master_ip`` and ``--num_nodes`` must be provided. When
    ``--num_nodes`` is used it must be greater than 1, because one node becomes
    the Spark/HDFS master and the remaining n-1 nodes become workers.

    :param args: not consulted in this function; ``parse_args()`` is called
           without arguments.
    :raises ValueError: if both or neither of ``--master_ip`` / ``--num_nodes``
            are given, or if ``--num_nodes`` is not greater than 1.
    """
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    options = parser.parse_args()

    if bool(options.master_ip) == bool(options.num_nodes):
        # %s for both values: num_nodes may be None here, which %d cannot
        # format (it would raise TypeError instead of this ValueError).
        raise ValueError("Only one of --master_ip (%s) and --num_nodes (%s) can be provided." %
                         (options.master_ip, options.num_nodes))

    if options.num_nodes:
        if options.num_nodes <= 1:
            raise ValueError("--num_nodes allocates one Spark/HDFS master and n-1 workers, and thus must be greater "
                             "than 1. %d was passed." % options.num_nodes)
        num_workers = options.num_nodes - 1
    else:
        # Only --master_ip was given. Previously this path always failed on
        # the num_nodes check or subtraction.
        # NOTE(review): 0 workers assumes no nodes are spawned for an existing
        # master - confirm against static_adam_preprocessing_dag.
        num_workers = 0

    inputs = {'numWorkers': num_workers,
              'outDir': options.output_directory,
              'bamName': options.input_file_name,
              'knownSNPs': options.known_SNPs,
              'driverMemory': options.driver_memory,
              'executorMemory': options.executor_memory,
              'sudo': options.sudo,
              'suffix': None,
              'masterIP': options.master_ip}

    Job.Runner.startToil(Job.wrapJobFn(static_adam_preprocessing_dag, inputs), options)
def main(args):
    """
    Validate the input GAM paths, optionally run the index-computation
    workflow, and then write the summary tables.

    :param args: program arguments, passed to ``parse_args``.
    :raises RuntimeError: if an input GAM path is malformed.
    :raises Exception: if the workflow reports failed jobs.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()

    for gam in options.in_gams:
        extension = os.path.splitext(gam)[1]
        if len(gam.split("/")) < 3 or extension != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")

    robust_makedirs(json_out_path(options))
    robust_makedirs(compare_out_path(options))

    # Make a root job
    root_job = Job.wrapJobFn(compute_all_indexes, options,
                             cores=1, memory="2G", disk=0)

    # Run it and see how many jobs fail; with only_summary set the workflow is
    # skipped and nothing can have failed.
    failed_jobs = 0
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job, options)

    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    RealTimeLogger.stop_master()

    # Make some tables from the json comparison output (the dist_table and
    # acc_table calls were already disabled).
    snp_count_table(options)
    graph_size_table(options)
def main():
    """
    This is a Toil pipeline used to perform variant analysis (usually on exomes)
    from Tumor/Normal BAMs. All samples are co-cleaned (GATK Indel Realignment
    (IR) and Base Quality Score Recalibration (BQSR)) before variant analysis is
    performed by MuTect. The final output of this pipeline is a tarball
    containing the output of MuTect (.vcf, .cov, .out).

    Please see the associated README.md for an overview and quickstart
    walkthrough.
    """
    # Build the parser and let Toil add its own options to it
    argparser = build_parser()
    Job.Runner.addToilOptions(argparser)
    pargs = argparser.parse_args()

    # Values taken from the command line
    inputs = {
        "ref.fasta": pargs.reference,
        "config": pargs.config,
        "phase.vcf": pargs.phase,
        "mills.vcf": pargs.mills,
        "dbsnp.vcf": pargs.dbsnp,
        "cosmic.vcf": pargs.cosmic,
        "output_dir": pargs.output_dir,
        "ssec": pargs.ssec,
        "s3_dir": pargs.s3_dir,
        "sudo": pargs.sudo,
    }
    # Entries that start out as None
    for key in ("uuid", "normal.bam", "tumor.bam", "cpu_count"):
        inputs[key] = None

    # Launch Pipeline
    root = Job.wrapJobFn(download_shared_files, inputs)
    Job.Runner.startToil(root, pargs)
def align_transcripts(args, toil_options):
    """
    Main entry function for the transcript alignment toil pipeline.

    On a fresh run the input files are imported into the file store and the
    workflow is started; on a restart the previous workflow is resumed. Either
    way, every resulting file is exported to its local path afterwards.

    :param args: arguments from CAT (read through attribute access)
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if t.options.restart:
            results_file_ids = t.restart()
        else:
            def import_with_size(path):
                # Import a local file and wrap the result in a FileID built
                # from the same path.
                return FileID.forPath(t.importFile('file://' + path), path)

            input_file_ids = argparse.Namespace()
            input_file_ids.ref_genome_fasta = tools.toilInterface.write_fasta_to_filestore(
                t, args.ref_genome_fasta)
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(
                t, args.genome_fasta)
            input_file_ids.annotation_gp = import_with_size(args.annotation_gp)
            input_file_ids.ref_db = import_with_size(args.ref_db_path)
            input_file_ids.modes = {}

            file_ids = [input_file_ids.ref_genome_fasta,
                        input_file_ids.genome_fasta,
                        input_file_ids.annotation_gp,
                        input_file_ids.ref_db]
            for mode in args.transcript_modes:
                mode_gp = args.transcript_modes[mode]['gp']
                input_file_ids.modes[mode] = t.importFile('file://' + mode_gp)
                file_ids.append(input_file_ids.modes[mode])

            # The root job's disk requirement comes from the imported inputs.
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids,
                                memory='16G', disk=disk_usage)
            results_file_ids = t.start(job)

        for file_path, file_id in results_file_ids.iteritems():
            tools.fileOps.ensure_file_dir(file_path)
            t.exportFile(file_id, 'file://' + file_path)
def test_mhc_assessment(self):
    """
    Test the functionality of assess_mhc_genes.

    Three jobs fetch the RSEM file, the MHC file and the haplotype file; their
    results feed ``assess_mhc_genes``, whose output is handed to
    ``_test_output``. The jobs run as one linear chain.
    """
    univ_options = self._getTestUnivOptions()
    test_src_folder = os.path.join(self._projectRootPath(), 'src', 'protect', 'test')

    rsem_job = Job.wrapJobFn(self._get_test_rsem_file, test_src_folder)
    mhc_job = Job.wrapJobFn(self._get_MHC_file)
    haplotype_job = Job.wrapJobFn(self._get_test_haplotype_file, test_src_folder)
    assess_job = Job.wrapJobFn(assess_mhc_genes, rsem_job.rv(), haplotype_job.rv(),
                               univ_options, mhc_job.rv())
    check_job = Job.wrapJobFn(self._test_output, assess_job.rv(), univ_options)

    chain = [rsem_job, mhc_job, haplotype_job, assess_job, check_job]
    for predecessor, successor in zip(chain, chain[1:]):
        predecessor.addChild(successor)

    Job.Runner.startToil(rsem_job, self.options)
def main():
    """
    This is a Toil pipeline to transfer TCGA data into an S3 Bucket

    Data is pulled down with Genetorrent and transferred to S3 via S3AM.
    """
    # Build the parser and let Toil add its own options to it
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()

    # Collect the pipeline inputs from the parsed arguments
    inputs = dict()
    inputs['genetorrent'] = args.genetorrent
    inputs['genetorrent_key'] = args.genetorrent_key
    inputs['ssec'] = args.ssec
    inputs['s3_dir'] = args.s3_dir
    inputs['sudo'] = args.sudo

    # Sanity checks: any path that was supplied must point at a real file
    for path in (args.ssec, args.genetorrent, args.genetorrent_key):
        if path:
            assert os.path.isfile(path)

    # Start Pipeline
    root = Job.wrapJobFn(start_batch, inputs)
    Job.Runner.startToil(root, args)
def main():
    """
    Generate a file of random lines, sort it with a Toil workflow and export
    the sorted result over the generated file.

    :raises RuntimeError: if N is not positive.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)

    parser.add_argument('--num-lines', default=1000, type=int,
                        help='Number of lines in file to sort.')
    parser.add_argument('--line-length', default=50, type=int,
                        help='Length of lines in file to sort.')
    parser.add_argument("--N", default=10000,
                        help="The threshold below which a serial sort function is used to sort file. "
                             "All lines must of length less than or equal to N or program will fail")

    options = parser.parse_args()

    threshold = int(options.N)
    if threshold <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    file_name = 'file_to_sort.txt'
    make_file_to_sort(file_name=file_name,
                      lines=options.num_lines,
                      line_length=options.line_length)

    with Toil(options) as toil:
        sort_file_url = 'file://' + os.path.abspath(file_name)
        if toil.options.restart:
            sorted_file_id = toil.restart()
        else:
            sort_file_id = toil.importFile(sort_file_url)
            root = Job.wrapJobFn(setup, sort_file_id, int(options.N), False,
                                 memory='1000M')
            sorted_file_id = toil.start(root)
        # The sorted output replaces the generated input file.
        toil.exportFile(sorted_file_id, sort_file_url)
def main(args):
    """
    Validate the input graphs, optionally run the kmer-index workflow, and
    then cluster the comparisons.

    :param args: program arguments, passed to ``parse_args``.
    :raises RuntimeError: if an input graph does not have a ``.vg`` extension.
    :raises Exception: if the workflow reports failed jobs.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()

    for graph in options.graphs:
        extension = os.path.splitext(graph)[1]
        if extension != ".vg":
            raise RuntimeError("Input graphs expected to have .vg extension")

    # Make a root job
    root_job = Job.wrapJobFn(compute_kmer_indexes, options,
                             cores=1, memory="2G", disk=0)

    # Run it and see how many jobs fail; with only_summary set the workflow is
    # skipped and nothing can have failed.
    failed_jobs = 0
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job, options)

    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    RealTimeLogger.stop_master()

    # Do the drawing outside toil to get around weird import problems
    cluster_comparisons(options)
def main():
    """
    This is a Toil pipeline for the UNC best practice RNA-Seq analysis.
    RNA-seq fastqs are combined, aligned, sorted, filtered, and quantified.

    Please read the README.md located in the same directory.
    """
    # Build the parser and let Toil add its own options to it
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()

    # Values taken from the command line
    inputs = {
        'config': args.config,
        'config_fastq': args.config_fastq,
        'input': args.input,
        'unc.bed': args.unc,
        'hg19.transcripts.fa': args.fasta,
        'composite_exons.bed': args.composite_exons,
        'normalize.pl': args.normalize,
        'output_dir': args.output_dir,
        'rsem_ref.zip': args.rsem_ref,
        'chromosomes.zip': args.chromosomes,
        'ebwt.zip': args.ebwt,
        'ssec': args.ssec,
        's3_dir': args.s3_dir,
        'sudo': args.sudo,
        'single_end_reads': args.single_end_reads,
        'upload_bam_to_s3': args.upload_bam_to_s3,
    }
    # Entries that start out as None
    for key in ('uuid', 'sample.tar', 'cpu_count'):
        inputs[key] = None

    # Launch jobs
    root = Job.wrapJobFn(download_shared_files, inputs)
    Job.Runner.startToil(root, args)
def testJobFileStoreWithSmallCache(self, retryCount=0, badWorker=0.0, stringNo=1,
                                   stringLength=1000000, cacheSize=10000, testNo=2):
    """
    Creates a chain of jobs, each reading and writing files using the
    Job.FileStore interface. Verifies the files written are always what we
    expect. The chain tests the caching behavior.

    :param retryCount: value for ``options.retryCount``.
    :param badWorker: value for ``options.badWorker``.
    :param stringNo: number of random strings generated per round.
    :param stringLength: length of each random string, in characters.
    :param cacheSize: value for ``options.cacheSize``.
    :param testNo: number of rounds (workflows) to run.
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def randomEntry():
        # One random upper-case string, keyed by its first PREFIX_LENGTH chars.
        text = "".join(random.choice(alphabet) for _ in xrange(stringLength))
        return text[:PREFIX_LENGTH], text

    for _ in xrange(testNo):
        # stringNo strings of stringLength characters each, as prefix -> string
        testStrings = dict(randomEntry() for _ in xrange(stringNo))

        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.logLevel = "INFO"
        options.cacheSize = cacheSize
        options.retryCount = retryCount
        options.badWorker = badWorker
        options.badWorkerFailInterval = 1.0

        chainLength = 10
        # Run the workflow; the return value (ignored here) is the number of
        # failed jobs.
        root = Job.wrapJobFn(fileTestJob, [], testStrings, chainLength)
        Job.Runner.startToil(root, options)
def main():
    """
    This is a Toil pipeline to transfer TCGA data into an S3 Bucket

    Data is pulled down with Genetorrent and transferred to S3 via S3AM.
    """
    # Build the parser and let Toil add its own options to it
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()

    # Collect the pipeline inputs from the parsed arguments
    inputs = dict()
    inputs['genetorrent'] = args.genetorrent
    inputs['genetorrent_key'] = args.genetorrent_key
    inputs['ssec'] = args.ssec
    inputs['s3_dir'] = args.s3_dir

    # Sanity checks: any path that was supplied must point at a real file
    for path in (args.ssec, args.genetorrent, args.genetorrent_key):
        if path:
            assert os.path.isfile(path)

    samples = parse_genetorrent(args.genetorrent)

    # Start pipeline
    # map_job accepts a function, an iterable, and *args. The function is
    # launched as a child process with one element from the iterable and *args,
    # which in turn spawns a tree of child jobs.
    root = Job.wrapJobFn(map_job, download_and_transfer_sample, samples, inputs)
    Job.Runner.startToil(root, args)
def main():
    """
    Parse the command line and run the sort workflow on ``--fileToSort``.

    :raises RuntimeError: if no file was given, the file does not exist, or N
            is not positive.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)

    parser.add_argument("--fileToSort", dest="fileToSort",
                        help="The file you wish to sort")

    # The two help fragments are joined by implicit concatenation; the trailing
    # space after "is" keeps the words from running together ("isused").
    parser.add_argument("--N", dest="N",
                        help="The threshold below which a serial sort function is "
                             "used to sort file. All lines must of length less than or equal to N or program will fail",
                        default=10000)

    options = parser.parse_args()

    if options.fileToSort is None:
        raise RuntimeError("No file to sort given")

    if not os.path.exists(options.fileToSort):
        raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)

    # N arrives as a string when given on the command line, hence the int().
    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    # Now we are ready to run
    Job.Runner.startToil(Job.wrapJobFn(setup, options.fileToSort, int(options.N),
                                       memory=sortMemory), options)
def main(args):
    """
    Parses command line arguments and do the work of the program.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.

    When called with the single argument ``--test`` the module's doctests are
    run and the result of ``doctest.testmod`` is returned instead.
    """
    run_tests = len(args) == 2 and args[1] == "--test"
    if run_tests:
        # Run the tests
        return doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)

    # This holds the nicely-parsed options object
    options = parse_args(args)

    RealTimeLogger.start_master()

    # Make a root job
    root_job = Job.wrapJobFn(collate_all, options,
                             cores=1, memory="1G", disk="1G")

    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job, options)
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    print("All jobs completed successfully")

    RealTimeLogger.stop_master()
def userScript():
    from toil.common import Toil
    from toil.job import Job

    # A type that only exists in the user script
    class X(object):
        pass

    # Job function that hands its argument straight back.
    # noinspection PyUnusedLocal
    def job(job, x, disk='10M', cores=1, memory='10M'):
        return x

    if __name__ == '__main__':
        options = Job.Runner.getDefaultArgumentParser().parse_args()
        original = X()
        with Toil(options) as toil:
            returned = toil.start(Job.wrapJobFn(job, original).encapsulate())
        # The returned object must be an X, though not the X of the __main__
        # module: it has to be the X of foo.bar, the canonical name for the user
        # module. The translation from __main__ to foo.bar is a side effect of
        # hot-deployment.
        assert returned.__class__ is not X
        import foo.bar
        assert returned.__class__ is foo.bar.X
        # A copy must have been made; this is a side effect of
        # pickling/unpickling.
        assert original is not returned
def testEncapsulation(self):
    """
    Tests the Job.encapsulation method, which uses the EncapsulationJob class.

    A job graph is encapsulated, a child and a follow-on are attached to the
    encapsulating job, and the workflow output file is expected to read
    "ABCDE".
    """
    # Temporary file
    outFile = getTempFile(rootDir=self._createTempDir())
    try:
        # Encapsulate a job graph
        a = T.wrapJobFn(encapsulatedJobFn, "A", outFile)
        a = a.encapsulate()
        # Now add children/follow to the encapsulated graph
        d = T.wrapFn(f, a.rv(), outFile)
        e = T.wrapFn(f, d.rv(), outFile)
        a.addChild(d)
        a.addFollowOn(e)
        # Create the runner for the workflow.
        options = T.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.logLevel = "INFO"
        # Run the workflow, the return value being the number of failed jobs
        T.Runner.startToil(a, options)
        # Check output. The file is closed explicitly (rather than left to the
        # garbage collector) so it is no longer open when it is removed below.
        with open(outFile, 'r') as outHandle:
            firstLine = outHandle.readline()
        # assertEqual instead of the deprecated assertEquals alias
        self.assertEqual(firstLine, "ABCDE")
    finally:
        os.remove(outFile)
def testWriteLocalFileToJobStore(self):
    """
    Write a file that lives in the localTempDir to the job store. Such a file
    is cached by default; the job checks that it really is cached.
    """
    writer = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True)
    Job.Runner.startToil(writer, self.options)
def testDeleteLocalFile(self):
    """
    Exercise the deletion capabilities of deleteLocalFile.
    """
    # Retries are switched off for this workflow.
    self.options.retryCount = 0
    nonLocalDir = self._createTempDir(purpose='nonLocalDir')
    deleter = Job.wrapJobFn(self._deleteLocalFileFn, nonLocalDir=nonLocalDir)
    Job.Runner.startToil(deleter, self.options)
def testTrivialDAGConsistency(self):
    """
    Start a workflow rooted at ``trivialParent`` and require it to end in a
    FailedJobsException; the test fails if the workflow completes.
    """
    jobStore = self._createTempDir() + '/jobStore'
    options = Job.Runner.getDefaultOptions(jobStore)
    options.clean = 'always'
    options.logLevel = 'debug'
    rootJob = Job.wrapJobFn(trivialParent)
    with Toil(options) as toil:
        # we expect this exception to be raised
        with self.assertRaises(FailedJobsException):
            toil.start(rootJob)
def augustus(args, coding_gp, toil_options):
    """
    Main entry function for Augustus toil pipeline

    On a fresh run the inputs are imported into the file store and the workflow
    is started; on a restart the previous workflow is resumed. The TM result is
    always exported, and the TMR result only when the workflow produced one.

    :param args: arguments from CAT (read through attribute access)
    :param coding_gp: genePred with only coding transcripts
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(
                t, args.genome_fasta)
            input_file_ids.tm_cfg = FileID.forPath(
                t.importFile('file://' + args.tm_cfg), args.tm_cfg)
            input_file_ids.coding_gp = FileID.forPath(
                t.importFile('file://' + coding_gp), coding_gp)
            input_file_ids.ref_psl = FileID.forPath(
                t.importFile('file://' + args.ref_psl), args.ref_psl)
            input_file_ids.tm_psl = FileID.forPath(
                t.importFile('file://' + args.filtered_tm_psl), args.filtered_tm_psl)
            input_file_ids.annotation_gp = FileID.forPath(
                t.importFile('file://' + args.annotation_gp), args.annotation_gp)
            # File IDs whose sizes feed the disk-usage estimate below
            file_ids = [
                input_file_ids.genome_fasta, input_file_ids.coding_gp,
                input_file_ids.ref_psl, input_file_ids.tm_psl,
                input_file_ids.annotation_gp
            ]
            if args.augustus_tmr:
                input_file_ids.augustus_hints_db = FileID.forPath(
                    t.importFile('file://' + args.augustus_hints_db), args.augustus_hints_db)
                input_file_ids.tmr_cfg = FileID.forPath(
                    t.importFile('file://' + args.tmr_cfg), args.tmr_cfg)
                # Append the imported FileID, not the path string
                # args.augustus_hints_db: every other entry is a file ID, and a
                # bare path carries no size for the estimate.
                file_ids.append(input_file_ids.augustus_hints_db)
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            # disk_usage is passed both as an argument to setup and as the root
            # job's own disk requirement.
            job = Job.wrapJobFn(setup, args, input_file_ids, disk_usage, disk=disk_usage)
            tm_file_id, tmr_file_id = t.start(job)
        else:
            tm_file_id, tmr_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_tm_gtf)
        t.exportFile(tm_file_id, 'file://' + args.augustus_tm_gtf)
        if tmr_file_id is not None:
            tools.fileOps.ensure_file_dir(args.augustus_tmr_gtf)
            t.exportFile(tmr_file_id, 'file://' + args.augustus_tmr_gtf)
def main():
    """
    Parse the command line, run the workflow rooted at ``setup`` and write the
    workflow's return value to the ``--out-psl`` file.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument('--reference', required=True)
    parser.add_argument('--target', required=True)
    parser.add_argument('--chunk-size', default=500, type=int)
    parser.add_argument('--out-psl', required=True)
    parser.add_argument('--ooc')
    args = parser.parse_args()

    # Both inputs are handed to the workflow as absolute paths.
    reference = os.path.abspath(args.reference)
    target = os.path.abspath(args.target)
    root = Job.wrapJobFn(setup, reference, target, args.chunk_size, args.ooc,
                         memory='4G')
    result = Job.Runner.startToil(root, args)

    with open(args.out_psl, 'w') as outf:
        outf.write(result)
def _testMultipleJobsReadGlobalFileFunction(self, cacheHit):
    """
    Shared implementation behind the two "multiple jobs read one file" tests.

    One job writes a file, ten reader jobs are attached as its children, and a
    probe job is a child of every reader. A counter file initialised to 0 is
    given to the readers as ``maxWriteFile``; after the run its value must be
    greater than 2.

    :param bool cacheHit: Is the test for the CacheHit case(T) or cacheMiss
           case(F)
    """
    if cacheHit:
        dirPurpose = 'tempWriteDir'
    else:
        dirPurpose = 'nonLocalDir'
    workdir = self._createTempDir(purpose=dirPurpose)

    counterFile = os.path.join(workdir, 'test')
    with open(counterFile, 'w') as handle:
        handle.write(str(0))

    writer = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=cacheHit,
                           nonLocalDir=workdir, fileMB=256)
    probe = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk='100M')

    readers = []
    for _ in xrange(0, 10):
        reader = Job.wrapJobFn(self._multipleFileReader, diskMB=1024,
                               fsID=writer.rv(),
                               maxWriteFile=os.path.abspath(counterFile),
                               disk='1G', memory='10M', cores=1)
        readers.append(reader)
        writer.addChild(reader)
        reader.addChild(probe)

    Job.Runner.startToil(writer, self.options)

    with open(counterFile, 'r') as handle:
        assert int(handle.read()) > 2
def testConcurrencyDynamic(self):
    """
    Check that promised core resources are allocated properly in a dynamic
    Toil workflow: for every tested cores-per-job value, the largest value
    returned by the workflow must equal cpuCount divided by cores-per-job.
    """
    for coresPerJob in self.allocatedCores:
        log.debug('Testing %d cores per job with CPU count %d',
                  coresPerJob, self.cpuCount)
        workDir = self._createTempDir('testFiles')
        counterPath = self.getCounterPath(workDir)

        rootJob = Job.wrapJobFn(maxConcurrency, self.cpuCount, counterPath,
                                coresPerJob, cores=1, memory='1M', disk='1M')
        observed = Job.Runner.startToil(rootJob, self.getOptions(workDir))

        self.assertEqual(max(observed), old_div(self.cpuCount, coresPerJob))
def root(rootJob):
    """
    Root job function: import two empty marker files and build the job graph
    around them.

    The graph is a ``deferring`` child that receives both markers, plus an
    encapsulated job (given the start marker, with two ``dummy`` children)
    whose encapsulating job gets a ``last`` child that receives the end marker.

    :param rootJob: the job this function runs as.
    """
    # Two separate imports of /dev/null: one marker per phase.
    startFile, endFile = [
        rootJob.fileStore.jobStore.importFile('file:///dev/null')
        for _ in range(2)
    ]

    rootJob.addChildJobFn(deferring, startFile, endFile)

    inner = Job.wrapJobFn(encapsulated, startFile)
    for _ in range(2):
        inner.addChildFn(dummy)

    outer = inner.encapsulate()
    rootJob.addChild(outer)
    outer.addChildJobFn(last, endFile)
def userScript():
    # Imported locally, like the Toil names below, so that this function's
    # body does not depend on any import made outside of it: deferring() and
    # encapsulated() call time.time() and time.sleep().
    import time

    from toil.common import Toil
    from toil.job import Job

    def root(rootJob):
        # Import two empty marker files and build the job graph around them.
        def nullFile():
            return rootJob.fileStore.jobStore.import_file(
                'file:///dev/null')

        startFile = nullFile()
        endFile = nullFile()
        rootJob.addChildJobFn(deferring, startFile, endFile)
        encapsulatedJob = Job.wrapJobFn(encapsulated, startFile)
        encapsulatedJob.addChildFn(dummy)
        encapsulatedJob.addChildFn(dummy)
        encapsulatingJob = encapsulatedJob.encapsulate()
        rootJob.addChild(encapsulatingJob)
        encapsulatingJob.addChildJobFn(last, endFile)

    def dummy():
        pass

    def deferred():
        pass

    # noinspection PyUnusedLocal
    def deferring(job, startFile, endFile):
        # Register a deferred function, delete the start marker, then wait
        # (for at most 10 seconds) until the end marker is gone.
        job.defer(deferred)
        job.fileStore.jobStore.delete_file(startFile)
        timeout = time.time() + 10
        while job.fileStore.jobStore.file_exists(endFile):
            assert time.time() < timeout
            time.sleep(1)

    def encapsulated(job, startFile):
        # Wait (for at most 10 seconds) until the start marker is gone.
        timeout = time.time() + 10
        while job.fileStore.jobStore.file_exists(startFile):
            assert time.time() < timeout
            time.sleep(1)

    def last(job, endFile):
        # Delete the end marker, which lets deferring() finish.
        job.fileStore.jobStore.delete_file(endFile)

    if __name__ == '__main__':
        options = Job.Runner.getDefaultArgumentParser().parse_args()
        with Toil(options) as toil:
            rootJob = Job.wrapJobFn(root)
            toil.start(rootJob)
def userScript():
    from toil.common import Toil
    from toil.job import Job

    # Job function that always fails.
    # noinspection PyUnusedLocal
    def job(job, disk='10M', cores=1, memory='10M'):
        assert False

    if __name__ == '__main__':
        options = Job.Runner.getDefaultArgumentParser().parse_args()
        with Toil(options) as toil:
            # Start a fresh workflow unless a restart was requested.
            if not toil.config.restart:
                toil.start(Job.wrapJobFn(job))
            else:
                toil.restart()
def restartScript():
    import argparse
    import os

    from toil.job import Job

    def f0(job):
        # Fail for as long as FAIL is present in the environment.
        if 'FAIL' in os.environ:
            raise RuntimeError('failed on purpose')

    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        Job.Runner.addToilOptions(parser)
        options = parser.parse_args()
        requirements = dict(cores=0.5, memory='50 M', disk='50 M')
        rootJob = Job.wrapJobFn(f0, **requirements)
        Job.Runner.startToil(rootJob, options)
def testPromisesWithJobStoreFileObjects(self, caching=True):
    """
    Check whether FileID objects are being pickled properly when used as return
    values of functions. Then ensure that lambdas of promised FileID objects
    can be used to describe the requirements of a subsequent job. This type of
    operation will be used commonly in Toil scripts.

    :param caching: forwarded to ``getOptions`` as ``caching``.
    :return: None
    """
    firstSize = 1024
    secondSize = 512

    firstWriter = Job.wrapJobFn(_writer, firstSize)
    secondWriter = Job.wrapJobFn(_writer, secondSize)

    # The follower's disk requirement is the combined size of both files.
    combinedDisk = PromisedRequirement(lambda x, y: x.size + y.size,
                                       firstWriter.rv(), secondWriter.rv())
    follower = Job.wrapJobFn(_follower, firstSize + secondSize,
                             disk=combinedDisk)

    firstWriter.addChild(secondWriter)
    secondWriter.addChild(follower)

    options = self.getOptions(self._createTempDir('testFiles'), caching=caching)
    Job.Runner.startToil(firstWriter, options)
def stageWorkflow(outputSequenceDir, configFile, inputSequences, toil, restart=False, outputSequences = [], maskAlpha=False, clipAlpha=None, maskPAF=None, inputEventNames=None, brnnCores=None):
    """
    Configure the preprocessor from ``configFile``, run (or restart) the
    preprocessing workflow and export the resulting sequences.

    :param outputSequenceDir: directory used to derive output paths when
           ``outputSequences`` is empty.
    :param configFile: path of the XML config file; it is parsed twice, once
           for the working config node and once for ``loadDnaBrnnModel``.
    :param inputSequences: input sequence paths; each is imported into the
           file store on a fresh run.
    :param toil: object providing ``importFile``, ``start``, ``restart`` and
           ``exportFile``.
    :param restart: if true, resume with ``toil.restart()`` instead of
           importing inputs and starting a new workflow.
    :param outputSequences: output paths, one per input sequence. The default
           is a shared list literal, but it is only rebound here, never mutated.
    :param maskAlpha: with ``clipAlpha``, decides whether "lastzRepeatMask" is
           switched off and "dna-brnn" switched on.
    :param clipAlpha: if truthy, the "dna-brnn" preprocessor's action is set
           to "clip".
    :param maskPAF: optional path imported and handed to the workflow.
    :param inputEventNames: passed through to ``unzip_then_pp``.
    :param brnnCores: if not None, written to the "cpu" attribute of every
           "dna-brnn" preprocessor node.
    """
    #Replace any constants
    configNode = ET.parse(configFile).getroot()
    if not outputSequences:
        outputSequences = CactusPreprocessor.getOutputSequenceFiles(inputSequences, outputSequenceDir)
    else:
        assert len(outputSequences) == len(inputSequences)

    # Make sure we have the dna-brnn model in the filestore if we need it
    # (this call receives its own freshly parsed copy of the config, not configNode)
    loadDnaBrnnModel(toil, ET.parse(configFile).getroot(), maskAlpha = maskAlpha)

    if configNode.find("constants") != None:
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals()
    if maskAlpha or clipAlpha:
        # Swap the lastz repeat masker for dna-brnn
        ConfigWrapper(configNode).setPreprocessorActive("lastzRepeatMask", False)
        ConfigWrapper(configNode).setPreprocessorActive("dna-brnn", True)
        for node in configNode.findall("preprocessor"):
            if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
                if clipAlpha:
                    node.attrib["action"] = "clip"
    if brnnCores is not None:
        for node in configNode.findall("preprocessor"):
            if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
                node.attrib["cpu"] = brnnCores

    if not restart:
        inputSequenceIDs = []
        for seq in inputSequences:
            logger.info("Importing {}".format(seq))
            inputSequenceIDs.append(toil.importFile(makeURL(seq)))
        if maskPAF:
            inputPAFID = toil.importFile(makeURL(maskPAF))
        else:
            inputPAFID = None
        unzip_job = Job.wrapJobFn(unzip_then_pp, configNode, inputSequences, inputSequenceIDs, inputEventNames, maskPAF, inputPAFID)
        outputSequenceIDs = toil.start(unzip_job)
    else:
        outputSequenceIDs = toil.restart()

    for seqID, path in zip(outputSequenceIDs, outputSequences):
        # First try to treat seqID as a (sequence, bed, mask-bed) triple; on
        # ANY failure fall back to exporting seqID itself as a single file.
        # NOTE(review): the bare except also hides real export errors, but the
        # fallback may rely on it if a single file ID is itself iterable -
        # confirm before narrowing.
        try:
            iter(seqID)
            # dna-brnn will output a couple of bed files. we scrape those out here
            toil.exportFile(seqID[0], makeURL(path))
            toil.exportFile(seqID[1], makeURL(path) + '.bed')
            toil.exportFile(seqID[2], makeURL(path) + '.mask.bed')
        except:
            toil.exportFile(seqID, makeURL(path))
def _runAndReturnWorkDir(self, cleanWorkDir, job, expectError=False):
    """
    Run a single wrapped job function and report what is left in the work dir.

    :param cleanWorkDir: value assigned to ``options.cleanWorkDir``.
    :param job: function wrapped with ``Job.wrapJobFn`` and used as the root job.
    :param expectError: selects ``self._launchError`` (truthy) or
        ``self._launchRegular`` (falsy) to run the workflow.
    :return: ``os.listdir(self.testDir)`` after the run.
    """
    run_options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    run_options.workDir = self.testDir
    run_options.clean = "always"
    run_options.cleanWorkDir = cleanWorkDir

    root_job = Job.wrapJobFn(job)

    # Pick the launcher that matches the expected outcome of the run.
    launch = self._launchError if expectError else self._launchRegular
    launch(root_job, run_options)

    return os.listdir(self.testDir)
def testReturnFileSizesWithBadWorker(self):
    """
    Run ``self._returnFileTestFn`` as a single job with failure injection enabled.

    The shared options are set to retryCount=20, badWorker=0.5 and
    badWorkerFailInterval=0.1 before the job is started. The job is given a
    2 GiB ``jobDisk`` value, a '2G' disk requirement, 30 iterations and a
    freshly created temp dir as ``nonLocalDir``.
    """
    opts = self.options
    opts.retryCount = 20
    opts.badWorker = 0.5
    opts.badWorkerFailInterval = 0.1

    non_local_dir = self._createTempDir(purpose='nonLocalDir')
    two_gib = 2 * 1024 * 1024 * 1024

    root_job = Job.wrapJobFn(
        self._returnFileTestFn,
        jobDisk=two_gib,
        initialCachedSize=0,
        nonLocalDir=non_local_dir,
        numIters=30,
        disk='2G',
    )
    Job.Runner.startToil(root_job, self.options)
def testImportLinking(self):
    """
    Exercise ``importFile`` with ``options.linkImports`` enabled.

    A small file is written to the current working directory, imported through
    ``workflow.importFile`` and handed, together with its absolute path, to a
    ``compareiNodes`` job. The file is removed afterwards even when the
    workflow raises, so a failing run no longer leaves ``dummyFile.txt`` behind.
    """
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.linkImports = True

    fileName = 'dummyFile.txt'
    filePath = os.path.abspath(fileName)
    try:
        with open(fileName, 'w') as fh:
            fh.write('Subtle literature reference.')
        with Toil(options) as workflow:
            fileID = workflow.importFile('file://' + filePath)
            workflow.start(Job.wrapJobFn(compareiNodes, fileID, filePath))
    finally:
        # Guarded so a failure while creating the file is not masked by a
        # secondary error from os.remove.
        if os.path.exists(fileName):
            os.remove(fileName)
def testDockerPipeChain(self, caching=True):
    r"""
    Run ``_testDockerPipeChainFn`` and check that its result strips to '2'.

    Per the original description, this covers the piping API of dockerCall():
    when a list of argument lists is given as parameters, the commands are
    piped together, e.g.

        parameters=[ ['printf', 'x\n y\n'], ['wc', '-l'] ]

    should execute:

        printf 'x\n y\n' | wc -l

    :param caching: when falsy, ``options.disableCaching`` is set to True.
    """
    job_store = os.path.join(self.tempDir, 'jobstore')
    options = Job.Runner.getDefaultOptions(job_store)
    for attr, value in (('logLevel', 'INFO'),
                        ('workDir', self.tempDir),
                        ('clean', 'always')):
        setattr(options, attr, value)
    if not caching:
        options.disableCaching = True

    result = Job.Runner.startToil(Job.wrapJobFn(_testDockerPipeChainFn), options)
    assert result.strip() == '2'
def testDockerPipeChainErrorDetection(self, disableCaching=True):
    """
    Run ``_testDockerPipeChainErrorFn`` and assert that it returns a true result.

    Per the original description: a shell pipeline cmd1 | cmd2 | ... | cmdN
    only reports an error when cmdN fails, which lets all manner of errors go
    unnoticed. This test makes sure the piping API of dockerCall() raises when
    a non-last command in the chain fails.

    :param disableCaching: assigned directly to ``options.caching``.
        NOTE(review): the name suggests the opposite meaning; confirm what
        callers intend before changing either the name or the assignment.
    """
    job_store = os.path.join(self.tempDir, 'jobstore')
    options = Job.Runner.getDefaultOptions(job_store)
    for attr, value in (('logLevel', self.dockerTestLogLevel),
                        ('workDir', self.tempDir),
                        ('clean', 'always'),
                        ('caching', disableCaching)):
        setattr(options, attr, value)

    outcome = Job.Runner.startToil(Job.wrapJobFn(_testDockerPipeChainErrorFn), options)
    assert outcome == True
def main():
    """
    Parse the command line and start the ``launchpad`` root job.

    Accepts ``--sample_groups`` and ``--creds`` (both required) plus the
    optional ``--output_folder`` and ``--input_folder``, alongside the standard
    Toil options. All four paths are made absolute before being handed to the
    root job, which requests a single core.

    :return: None
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--sample_groups', '-S', dest='sample_groups', type=str,
                        required=True, help='sample_groups.dill')
    parser.add_argument('--creds', '-C', dest='creds', type=str,
                        required=True, help='GDC token file.')
    parser.add_argument('--output_folder', '-O', dest='output_folder', type=str,
                        required=False, default='outputs', help='Output folder.')
    parser.add_argument('--input_folder', '-I', dest='input_folder', type=str,
                        required=False, default=os.getcwd(), help='Input folder.')
    Job.Runner.addToilOptions(parser)
    params = parser.parse_args()

    # Jobs must not depend on the launch directory, so resolve every path up front.
    for path_attr in ('sample_groups', 'creds', 'output_folder', 'input_folder'):
        setattr(params, path_attr, os.path.abspath(getattr(params, path_attr)))

    root_job = Job.wrapJobFn(launchpad, params.sample_groups, params.input_folder,
                             params.output_folder, params.creds, cores=1)
    Job.Runner.startToil(root_job, params)
    return None
def main():
    """
    This is a Toil pipeline to transfer TCGA data into an S3 Bucket

    Data is pulled down with Genetorrent and transferred to S3 via S3AM.
    """
    # The docstring above is also the argparse description (main.__doc__), so
    # its wording is user-facing output and is kept as is.

    def existing_file(fname):
        """
        Argparse type for an existing file
        """
        if os.path.isfile(fname):
            return fname
        raise ValueError("Invalid file: " + str(fname))

    # Define the parser and register the Toil options on it.
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--sudo', dest='sudo', default=None, action='store_true',
                        help='Docker usually needs sudo to execute locally, but not when running Mesos or when '
                             'the user is a member of a Docker group.')
    Job.Runner.addToilOptions(parser)
    parser.add_argument('datafiles', nargs='+', help='FASTA input', type=existing_file)
    parsed = parser.parse_args()

    assert parsed.jobStore is not None
    config = Config()
    config.setOptions(parsed)

    # Store inputs from argparse
    inputs = {'sudo': parsed.sudo}
    datafiles = [os.path.abspath(path) for path in parsed.datafiles]

    # Start Pipeline
    # NOTE(review): the run uses default options for "./toilWorkflow", not the
    # parsed command-line options or `config` -- confirm this is intended.
    run_options = Job.Runner.getDefaultOptions("./toilWorkflow")
    root_job = Job.wrapJobFn(start_batch, datafiles, inputs)
    Job.Runner.startToil(root_job, run_options)
def main():
    """
    Parse the command line, run the ``root`` job under Toil and print its result.

    The module docstring is used as the argparse description. ``--minSleep``
    (int, default 1) is accepted in addition to the standard Toil options, and
    the whole parsed namespace is passed to the root job.
    """
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument('--minSleep', type=int, default=1,
                     help="Minimum seconds to sleep")
    Job.Runner.addToilOptions(cli)

    argv = sys.argv[1:]
    options = cli.parse_args(argv)

    root_job = Job.wrapJobFn(root, options)
    with Toil(options) as toil:
        results = toil.start(root_job)
        print("Caching results:")
        print(results)
def testSiblingDAGConsistency(self):
    """
    Expect the ``diamond`` workflow to end with a FailedJobsException.

    Per the original description this is the slightly more complex case in
    which the stranded job's predecessors are siblings rather than
    parent/child. The test fails if the workflow completes without raising.
    """
    job_store = self._createTempDir() + '/jobStore'
    options = Job.Runner.getDefaultOptions(job_store)
    options.clean = 'always'
    options.logLevel = 'debug'

    root_job = Job.wrapJobFn(diamond)
    with Toil(options) as toil:
        failed_as_expected = False
        try:
            toil.start(root_job)
        except FailedJobsException:
            # This is the outcome the test is looking for.
            failed_as_expected = True
        if not failed_as_expected:
            self.fail()
def main(args):
    """
    Parse the program arguments, run (or restart) the workflow and export the result.

    :param args: program arguments, with ``args[0]`` being the executable name.
    :return: 0, intended to be used as the process exit code.
    """
    # Nicely-parsed options object
    options = parse_args(args)

    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logging.info("Running on Toil from {}".format(toil.__file__))

    # Add the drunner to the options and initialize stuff from the Toil VG config
    toilvgfacade.initialize(options)

    with Toil(options) as toil_instance:

        def export(id, url):
            # Thin adapter handed to export_to; parameter names kept as they were.
            return toil_instance.exportFile(id, url)

        if not toil_instance.options.restart:
            # Fresh run. Nothing is imported on the master; the root job is
            # given the URL and the nodes handle the download.
            root_job = Job.wrapJobFn(main_job, options, options.sam_url,
                                     cores=1, memory="1G", disk="1G")
            directory = toil_instance.start(root_job)
        else:
            # Re-run: the root job's return value comes back from restart()
            directory = toil_instance.restart()

        # Export the results
        directory.export_to(export, options.out_url)

    print("Toil workflow complete")
    return 0
def main(args):
    """
    Run the ``run_and_evaluate`` root job and print its return value.

    The RealTimeLogger master is started before the workflow and is now stopped
    in a ``finally`` clause, so it is also shut down when the workflow raises
    instead of being left running.

    :param args: program arguments, passed to ``parse_args``.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()
    try:
        # Make a root job
        root_job = Job.wrapJobFn(run_and_evaluate, options,
                                 cores=1, memory="2G", disk="2G")

        # Run it and get the return value
        answer = Job.Runner.startToil(root_job, options)
    finally:
        RealTimeLogger.stop_master()

    print("Root return value:")
    print(answer)
def testPromisedRequirementDynamic(self):
    """
    Check promised core allocation with a dynamically built workflow.

    For every entry of ``self.allocatedCores`` a ``maxConcurrency`` root job is
    run in a fresh temp dir; the maximum of the returned values must equal
    ``self.cpuCount / coresPerJob``.
    """
    for coresPerJob in self.allocatedCores:
        workDir = self._createTempDir('testFiles')
        counterPath = self.getCounterPath(workDir)

        rootJob = Job.wrapJobFn(maxConcurrency, self.cpuCount, counterPath, coresPerJob,
                                cores=1, memory='1M', disk='1M')
        observed = Job.Runner.startToil(rootJob, self.getOptions(workDir))

        self.assertEqual(max(observed), self.cpuCount / coresPerJob)
def wordCount(self, badWorker=0.0, badWorkerFailInterval=0.05, checkpoint = True):
    """
    Run ``_count`` as a single job on the 'singleMachine' batch system.

    :param badWorker: assigned to ``options.badWorker``.
    :param badWorkerFailInterval: assigned to ``options.badWorkerFailInterval``.
    :param checkpoint: forwarded to ``Job.wrapJobFn`` as ``checkpoint``.
    """
    # mkdtemp is used only to obtain a unique path; the directory itself is
    # removed again right away, before the path is handed to Toil.
    jobStorePath = tempfile.mkdtemp()
    os.rmdir(jobStorePath)

    # wrap _count as a job
    rootJob = Job.wrapJobFn(_count, 1, checkpoint=checkpoint)

    options = Job.Runner.getDefaultOptions(jobStorePath)
    for attr, value in (('batchSystem', 'singleMachine'),
                        ('badWorker', badWorker),
                        ('badWorkerFailInterval', badWorkerFailInterval),
                        ('clean', 'never')):
        setattr(options, attr, value)

    Job.Runner.startToil(rootJob, options)
def testService(self, checkpoint=False):
    """
    Tests the creation of a Job.Service with random failures of the worker.

    Runs two rounds. Each round runs ``serviceTest`` through ``self.runToil``
    with a random integer message and checks that the first line of the output
    file parses back to that integer. The output file is removed after each
    round, whether or not it passed.

    :param checkpoint: forwarded to ``Job.wrapJobFn`` as ``checkpoint``.
    """
    for _ in range(2):
        outFile = getTempFile(rootDir=self._createTempDir())  # Temporary file
        messageInt = random.randint(1, sys.maxsize)
        try:
            # Wire up the services/jobs
            t = Job.wrapJobFn(serviceTest, outFile, messageInt, checkpoint=checkpoint)

            # Run the workflow repeatedly until success
            self.runToil(t)

            # Check output. The file is opened in a ``with`` block so the handle
            # is closed before the file is removed below.
            with open(outFile, 'r') as fh:
                firstLine = fh.readline()
            self.assertEqual(int(firstLine), messageInt)
        finally:
            os.remove(outFile)
def main():
    """
    Command-line entry point for ProTECT.

    Either generates a config file (``--generate_config``) or runs the
    pipeline described by ``--config_file``. The two options are mutually
    exclusive and one of them is required.

    :return: None
    """
    parser = argparse.ArgumentParser(
        prog='ProTECT',
        description='Prediction of T-Cell Epitopes for Cancer Therapy',
        epilog='Contact Arjun Rao ([email protected]) if you encounter '
               'any problems while running ProTECT')
    inputs = parser.add_mutually_exclusive_group(required=True)
    inputs.add_argument('--config_file', dest='config_file', type=str, default=None,
                        help='Config file to be used in the '
                             'run.')
    inputs.add_argument('--generate_config', dest='generate_config', action='store_true',
                        default=False,
                        help='Generate a config file '
                             'in the current directory that is pre-filled with references and flags for '
                             'an hg19 run.')
    parser.add_argument('--max-cores-per-job', dest='max_cores', type=int, required=False,
                        default=None,
                        help='Maximum cores to use per '
                             'job. Aligners and Haplotypers ask for cores dependent on the machine that '
                             'the launchpad gets assigned to -- In a heterogeneous cluster, this can '
                             'lead to problems. This value should be set to the number of cpus on the '
                             'smallest node in a cluster.')

    # First pass: only find out whether a config file should be generated, in
    # which case no job store is needed. Unknown (Toil) arguments are tolerated
    # here; they are parsed properly in the second pass below.
    params, _ = parser.parse_known_args()
    if params.generate_config:
        generate_config_file()
        return None

    # Second pass: a real run, so re-parse with the Toil options registered.
    Job.Runner.addToilOptions(parser)
    params = parser.parse_args()
    params.config_file = os.path.abspath(params.config_file)

    if params.maxCores:
        toil_max_cores = int(params.maxCores)
        if not params.max_cores:
            # No per-job limit given: fall back to Toil's maxCores.
            params.max_cores = toil_max_cores
        elif params.max_cores > toil_max_cores:
            print("The value provided to max-cores-per-job (%s) was greater than that "
                  "provided to maxCores (%s). Setting max-cores-per-job = maxCores."
                  % (params.max_cores, params.maxCores), file=sys.stderr)
            params.max_cores = toil_max_cores

    root_job = Job.wrapJobFn(parse_config_file, params.config_file, params.max_cores)
    Job.Runner.startToil(root_job, params)
    return None
def testDockerPipeChain(self, disableCaching=True):
    r"""
    Run ``_testDockerPipeChainFn`` and check that its decoded result strips to '2'.

    Per the original description, this covers the piping API of dockerCall():
    when a list of argument lists is given as parameters, the commands are
    piped together into a chain, e.g.

        parameters=[ ['printf', 'x\n y\n'], ['wc', '-l'] ]

    should execute:

        printf 'x\n y\n' | wc -l

    :param disableCaching: assigned directly to ``options.caching``.
        NOTE(review): the name suggests the opposite meaning; confirm what
        callers intend before changing either the name or the assignment.
    """
    job_store = os.path.join(self.tempDir, 'jobstore')
    options = Job.Runner.getDefaultOptions(job_store)
    for attr, value in (('logLevel', self.dockerTestLogLevel),
                        ('workDir', self.tempDir),
                        ('clean', 'always'),
                        ('caching', disableCaching)):
        setattr(options, attr, value)

    raw_result = Job.Runner.startToil(Job.wrapJobFn(_testDockerPipeChainFn), options)
    logger.info('Container pipeline result: %s', repr(raw_result))

    # The job result is decoded as UTF-8 before it is compared.
    text_result = raw_result.decode('utf-8')
    assert text_result.strip() == '2'
def userScript():
    # NOTE(review): the ``if __name__ == '__main__'`` guard below suggests this
    # body is meant to be extracted and executed as a standalone script rather
    # than called directly -- confirm against the caller before restructuring.
    from toil.job import Job
    from toil.common import Toil
    # noinspection PyUnresolvedReferences
    from toil_lib.foo import libraryJob

    # Root job function: its only action is to schedule ``libraryJob`` as a child.
    # noinspection PyUnusedLocal
    def job(job, disk='10M', cores=1, memory='10M'):
        # Double the requirements to prevent chaining as chaining might hide problems
        # in hot deployment code.
        job.addChildJobFn(libraryJob, disk='20M', cores=cores, memory=memory)

    if __name__ == '__main__':
        options = Job.Runner.getDefaultArgumentParser().parse_args()
        with Toil(options) as toil:
            # Resume an existing workflow when asked to, otherwise start a new one.
            if toil.config.restart:
                toil.restart()
            else:
                toil.start(Job.wrapJobFn(job))
def chaining(args, toil_options):
    """
    Entry point: import the inputs, run (or restart) the workflow, export the chains.

    On a fresh run the HAL file, the query sizes file, the query 2bit file and
    every target 2bit file are imported and wrapped with ``FileID.forPath``,
    then the ``setup`` job is started. On restart the previous workflow is
    resumed. Either way the result is treated as a mapping of output path to
    file ID, and each entry is exported to ``file://<path>``.

    Uses ``dict.items()`` instead of the Python-2-only ``iteritems()`` so the
    function works on both Python 2 and Python 3.

    :param args: namespace with ``hal``, ``query_sizes``, ``query_two_bit`` (paths)
        and ``target_two_bits`` (mapping of genome to path).
    :param toil_options: options object passed to ``Toil``.
    """
    with Toil(toil_options) as t:

        def import_local(path):
            # Import a local file and wrap the result as a FileID for that path.
            return FileID.forPath(t.importFile('file://' + path), path)

        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.hal = import_local(args.hal)
            input_file_ids.query_sizes = import_local(args.query_sizes)
            input_file_ids.query_two_bit = import_local(args.query_two_bit)
            input_file_ids.target_two_bits = {genome: import_local(f)
                                              for genome, f in args.target_two_bits.items()}
            job = Job.wrapJobFn(setup, args, input_file_ids)
            chain_file_ids = t.start(job)
        else:
            chain_file_ids = t.restart()

        for chain_file, chain_file_id in chain_file_ids.items():
            # Make sure the destination directory exists before exporting into it.
            tools.fileOps.ensure_file_dir(chain_file)
            t.exportFile(chain_file_id, 'file://' + chain_file)
def main():
    """
    Generate a file of random lines, sort it with a Toil workflow and export the result.

    ``--num-lines`` and ``--line-length`` control the generated file, and
    ``--N`` is the threshold passed to the ``setup`` job. A non-positive ``N``
    raises RuntimeError. On a fresh run the sorted file is exported over the
    input file's URL.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument('--num-lines', type=int, default=1000,
                        help='Number of lines in file to sort.')
    parser.add_argument('--line-length', type=int, default=50,
                        help='Length of lines in file to sort.')
    parser.add_argument("--N", default=10000,
                        help="The threshold below which a serial sort function is used to sort file. "
                             "All lines must of length less than or equal to N or program will fail")
    options = parser.parse_args()

    # --N has no argparse type, so it may arrive as a string; convert once here.
    threshold = int(options.N)
    if threshold <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    file_name = 'file_to_sort.txt'
    make_file_to_sort(file_name=file_name,
                      lines=options.num_lines,
                      line_length=options.line_length)

    with Toil(options) as toil:
        if toil.options.restart:
            # NOTE(review): the restarted workflow's result is discarded and
            # nothing is exported on this path -- confirm this is intended.
            toil.restart()
        else:
            sort_file_url = 'file://' + os.path.abspath(file_name)
            sort_file_id = toil.importFile(sort_file_url)
            root_job = Job.wrapJobFn(setup, sort_file_id, threshold, False, memory='1000M')
            sorted_file_id = toil.start(root_job)
            toil.exportFile(sorted_file_id, sort_file_url)