def test_bam_quickcheck(self):
    from toil_lib.validators import bam_quickcheck
    good_bam_url = 's3://cgl-pipeline-inputs/exome/ci/chr6.normal.bam'
    bad_bam_url = 's3://cgl-pipeline-inputs/exome/ci/truncated.bam'
    good_bam_job = Job.wrapJobFn(download_url, url=good_bam_url, name='good.bam', work_dir=self.tmpdir)
    with Toil(self.options) as toil:
        toil.start(good_bam_job)
    bad_bam_job = Job.wrapJobFn(download_url, url=bad_bam_url, name='bad.bam', work_dir=self.tmpdir)
    with Toil(self.options) as toil:
        toil.start(bad_bam_job)
    assert bam_quickcheck(os.path.join(self.tmpdir, 'good.bam'))
    assert not bam_quickcheck(os.path.join(self.tmpdir, 'bad.bam'))

def runCactusBlast(sequenceFiles, alignmentsFile, toilDir,
                   chunkSize=None, overlapSize=None, logLevel=None,
                   compressFiles=None, lastzMemory=None, targetSequenceFiles=None):
    options = Job.Runner.getDefaultOptions(toilDir)
    options.logLevel = "CRITICAL"
    blastOptions = BlastOptions(chunkSize=chunkSize, overlapSize=overlapSize,
                                compressFiles=compressFiles, memory=lastzMemory)
    with Toil(options) as toil:
        seqIDs = [toil.importFile(makeURL(seqFile)) for seqFile in sequenceFiles]
        if targetSequenceFiles:
            targetSeqIDs = [toil.importFile(makeURL(seqFile)) for seqFile in targetSequenceFiles]
            rootJob = BlastSequencesAgainstEachOther(sequenceFileIDs1=seqIDs,
                                                     sequenceFileIDs2=targetSeqIDs,
                                                     blastOptions=blastOptions)
        else:
            rootJob = BlastSequencesAllAgainstAll(seqIDs, blastOptions)
        alignmentsID = toil.start(rootJob)
        toil.exportFile(alignmentsID, makeURL(alignmentsFile))

def _importExportFile(self, options, fail):
    with Toil(options) as toil:
        if not options.restart:
            srcFile = '%s/%s%s' % (self._tempDir, 'in', uuid.uuid4())
            with open(srcFile, 'w') as f:
                f.write('Hello')
            inputFileID = toil.importFile('file://' + srcFile)
            # Write a boolean that determines whether the job fails.
            with toil._jobStore.writeFileStream() as (f, failFileID):
                self.failFileID = failFileID
                f.write(str(fail))
            outputFileID = toil.start(HelloWorld(inputFileID, self.failFileID))
        else:
            # Set up job for failure
            with toil._jobStore.updateFileStream(self.failFileID) as f:
                f.write('False')
            outputFileID = toil.restart()
        toil.exportFile(outputFileID, 'file://' + self.dstFile)
        with open(self.dstFile, 'r') as f:
            assert f.read() == "HelloWorld!"

def start_toil(dataset_name, options, use_data=False):
    if use_data:
        data = Job.wrapJobFn(download_data.start_toil).encapsulate()
        mmdb2pdb = data.addFollowOnJobFn(convert_mmdb_to_pdb.start_toil).encapsulate()
    else:
        mmdb2pdb = Job.wrapJobFn(convert_mmdb_to_pdb.start_toil).encapsulate()
    interactome = mmdb2pdb.addChildJobFn(get_structural_interactome.start_toil, dataset_name).encapsulate()
    bsa = interactome.addFollowOnJobFn(calculate_bsa.start_toil, dataset_name).encapsulate()
    prep_protein = mmdb2pdb.addChildJobFn(prepare_protein.start_toil, dataset_name).encapsulate()
    features = mmdb2pdb.addFollowOnJobFn(calculate_features.start_toil, dataset_name, name="features").encapsulate()
    filter = mmdb2pdb.addFollowOnJobFn(filter_dataset.start_toil, dataset_name, name="filter").encapsulate()
    with Toil(options) as toil:
        toil.start(mmdb2pdb if not use_data else data)

def testImportExportFilePermissions(self):
    """
    Ensures that uploaded files preserve their file permissions when they are downloaded again.
    This function checks that an imported executable file maintains its executability after being exported.
    """
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    with Toil(options) as toil:
        for executable in True, False:
            srcFile = '%s/%s%s' % (self._tempDir, 'in', str(uuid.uuid4()))
            with open(srcFile, 'w') as f:
                f.write('Hello')
            if executable:
                # Add file owner execute permissions
                os.chmod(srcFile, os.stat(srcFile).st_mode | stat.S_IXUSR)
            # Current file owner execute permissions
            initialPermissions = os.stat(srcFile).st_mode & stat.S_IXUSR
            fileID = toil.importFile('file://' + srcFile)
            toil.exportFile(fileID, 'file://' + self.dstFile)
            currentPermissions = os.stat(self.dstFile).st_mode & stat.S_IXUSR
            assert initialPermissions == currentPermissions

def main(options=None):
    if not options:
        parser = ArgumentParser()
        Job.Runner.addToilOptions(parser)
        parser.add_argument('--pathGPX', help='The absolute path where all GPX files are.')
        parser.add_argument('--pathNPY',
                            help='A .npy file (stored np.array) with the path of each file to be analysed')
        # parser.add_argument("--O", help="Output destination path", default=defaultOutPath)
        options = parser.parse_args()

    # Some checks
    npyFile = options.pathNPY
    if not os.path.exists(npyFile):
        print("The npy file [fileNameGPX1, fileNameGPX2, ...] does not exist.")
        exit()
    try:
        codes = np.load(npyFile)
    except Exception:
        raise RuntimeError("Invalid format of npy file: %s" % options.pathNPY)
    if len(codes) == 0:
        raise RuntimeError("Invalid values of npy file: %s" % options.pathNPY)

    # Run workflow
    with Toil(options) as workflow:
        if not workflow.options.restart:
            workflow.start(Job.wrapJobFn(generateNetwork, options=options))
        else:
            workflow.restart()

def run_toil(options):
    """Toil implementation for cgpCaveman."""
    setup = StepRunner(process="setup", options=options)
    split = Split(options=options)
    remove = RemoveContigs(options=options)
    concat = StepRunner(process="split_concat", options=options)
    mstep = SplitRunner(process="mstep", options=options)
    merge = StepRunner(process="merge", options=options)
    estep = SplitRunner(process="estep", options=options)
    results = StepRunner(process="merge_results", options=options)
    add_ids = StepRunner(process="add_ids", options=options)
    flag = StepRunner(process="flag", options=options, runtime=None)

    # build dag
    setup.addFollowOn(split)
    split.addFollowOn(remove)
    remove.addFollowOn(concat)
    concat.addFollowOn(mstep)
    mstep.addFollowOn(merge)
    merge.addFollowOn(estep)
    estep.addFollowOn(results)
    results.addFollowOn(add_ids)
    add_ids.addFollowOn(flag)

    with Toil(options) as pipe:
        if not pipe.options.restart:
            pipe.start(setup)
        else:
            pipe.restart()

def main():
    # Establish session
    session = boto3.session.Session()
    s3 = session.resource('s3')

    # Grab objects from upload bucket to not download duplicates
    upload_bucket_name = 'jvivian-ccle-data'
    upload_bucket = s3.Bucket(upload_bucket_name)
    processed_keys = set(obj.key for obj in upload_bucket.objects.all())

    # Collect all keys to be processed
    download_bucket_name = 'cgl-ccle-data'
    download_bucket = s3.Bucket(download_bucket_name)
    keys = [x.key for x in download_bucket.objects.all() if x.key not in processed_keys]
    keys = [x for x in keys if not x.startswith('output') and x.endswith('.tar.gz')]

    # Start Toil run
    parser = Job.Runner.getDefaultArgumentParser()
    options = parser.parse_args()
    with Toil(options) as toil:
        if not toil.options.restart:
            toil.start(Job.wrapJobFn(map_job, workflow, keys, download_bucket_name, upload_bucket_name))
        else:
            toil.restart()

def runCactusBlastIngroupsAndOutgroups(ingroups, outgroups, alignmentsFile, toilDir,
                                       outgroupFragmentPaths=None, ingroupCoveragePaths=None,
                                       chunkSize=None, overlapSize=None, logLevel=None,
                                       compressFiles=None, lastzMemory=None):
    options = Job.Runner.getDefaultOptions(toilDir)
    options.disableCaching = True
    options.logLevel = "CRITICAL"
    blastOptions = BlastOptions(chunkSize=chunkSize, overlapSize=overlapSize,
                                compressFiles=compressFiles, memory=lastzMemory)
    with Toil(options) as toil:
        ingroupIDs = [toil.importFile(makeURL(ingroup)) for ingroup in ingroups]
        outgroupIDs = [toil.importFile(makeURL(outgroup)) for outgroup in outgroups]
        rootJob = BlastIngroupsAndOutgroups(blastOptions, ingroupIDs, outgroupIDs)
        blastResults = toil.start(rootJob)

        alignmentsID = blastResults[0]
        toil.exportFile(alignmentsID, makeURL(alignmentsFile))

        outgroupFragmentIDs = blastResults[1]
        ingroupCoverageIDs = blastResults[2]
        if outgroupFragmentPaths:
            assert len(outgroupFragmentIDs) == len(outgroupFragmentPaths)
            for outgroupFragmentID, outgroupFragmentPath in zip(outgroupFragmentIDs, outgroupFragmentPaths):
                toil.exportFile(outgroupFragmentID, makeURL(outgroupFragmentPath))
        if ingroupCoveragePaths:
            assert len(ingroupCoverageIDs) == len(ingroupCoveragePaths)
            for ingroupCoverageID, ingroupCoveragePath in zip(ingroupCoverageIDs, ingroupCoveragePaths):
                toil.exportFile(ingroupCoverageID, makeURL(ingroupCoveragePath))

def main():
    # If I wanted to make this into a true command line tool, I'd fill out the parser.
    # Instead, I'm just going to add the bare minimum for making a workflow.
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    # parser.add_argument('output_dir', help='The dir to save the output, target bedfiles.', type=str)
    options = parser.parse_args()

    assembly_dir = "./asms/"
    output_dir = "./liftovers/"
    assembly_files = {"HG03098_paf_chr21": assembly_dir + "HG03098_paf_chr21.fa",
                      "HG03492_paf_chr21": assembly_dir + "HG03492_paf_chr21.fa",
                      "hg38_chr21": assembly_dir + "hg38_chr21.fa"}
    hal_file = "ref_based_small_chr21.hal"

    with Toil(options) as workflow:
        if not workflow.options.restart:
            # importing files:
            for asm, asm_file in assembly_files.items():
                assembly_files[asm] = workflow.importFile("file://" + os.path.abspath(asm_file))
            hal_file = workflow.importFile("file://" + os.path.abspath(hal_file))

            # todo: update here; for running not in cactus_connectivity, need assembly_lengths, possibly other things.
            liftovers = workflow.start(Job.wrapJobFn(all_to_all_liftovers, assembly_files, hal_file, output_dir))
            # liftovers = workflow.start(Job.wrapJobFn(all_to_all_liftovers, assembly_files, hal_file, output_dir, cores=3))

            for target_asm, liftovers_dict in liftovers.items():
                for source_asm, liftover_file in liftovers_dict.items():
                    workflow.exportFile(liftover_file,
                                        'file://' + os.path.abspath(output_dir) + "/" + source_asm +
                                        "_source_" + target_asm + "_target_liftover.bed")
        else:
            output = workflow.restart()

def start_toil(options, use_data=False):
    from molmimic.generate_data import download_data
    from molmimic.generate_data import convert_mmdb_to_pdb
    from molmimic.generate_data import get_structural_interactome
    from molmimic.generate_data import calculate_bsa
    from molmimic.generate_data import prepare_protein
    from molmimic.generate_data import calculate_features
    from molmimic.generate_data import filter_dataset

    if use_data:
        data = Job.wrapJobFn(download_data.start_toil).encapsulate()
        mmdb2pdb = data.addFollowOnJobFn(convert_mmdb_to_pdb.start_toil).encapsulate()
    else:
        mmdb2pdb = Job.wrapJobFn(convert_mmdb_to_pdb.start_toil).encapsulate()

    interactome = mmdb2pdb.addChildJobFn(get_structural_interactome.start_toil).encapsulate()
    bsa = interactome.addFollowOnJobFn(calculate_bsa.start_toil).encapsulate()
    prep_protein = mmdb2pdb.addChildJobFn(prepare_protein.start_toil).encapsulate()
    features = mmdb2pdb.addFollowOnJobFn(calculate_features.start_toil).encapsulate()
    filter = mmdb2pdb.addFollowOnJobFn(filter_dataset.start_toil).encapsulate()

    with Toil(options) as toil:
        toil.start(mmdb2pdb if not use_data else data)

def runCactusPreprocessor(outputSequenceDir, configFile, inputSequences, toilDir):
    toilOptions = Job.Runner.getDefaultOptions(toilDir)
    toilOptions.logLevel = "INFO"
    toilOptions.disableCaching = True
    with Toil(toilOptions) as toil:
        stageWorkflow(outputSequenceDir, configFile, inputSequences, toil)

def main():
    opts = parse_args()
    with Toil(opts) as toil:
        if opts.restart:
            outfile_ids, fasta_ids, basenames = toil.restart()
        else:
            input_ids = []
            input_types = []
            input_basenames = []
            for input_sequence in opts.input_sequences:
                input_sequence_id = toil.importFile(makeURL(input_sequence))
                if input_sequence.endswith(".gz") or input_sequence.endswith(".gzip"):
                    type = "gzip"
                else:
                    type = "fasta"
                input_ids.append(input_sequence_id)
                input_types.append(type)
                basename = os.path.basename(input_sequence)
                if basename in input_basenames:
                    raise RuntimeError("Inputs must have unique filenames.")
                input_basenames.append(basename)
            outfile_ids, fasta_ids, basenames = toil.start(
                Job.wrapJobFn(launch_parallel, input_ids, input_types, input_basenames, opts))
        for outfile_id, fasta_id, basename in zip(outfile_ids, fasta_ids, basenames):
            toil.exportFile(fasta_id, makeURL(os.path.join(opts.output_path, basename + '.masked')))
            toil.exportFile(outfile_id, makeURL(os.path.join(opts.output_path, basename + '.out')))

def align_transcripts(args, toil_options):
    """
    Main entry function for transcript alignment toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.ref_genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.ref_genome_fasta)
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            input_file_ids.ref_db = FileID.forPath(t.importFile('file://' + args.ref_db_path), args.ref_db_path)
            input_file_ids.modes = {}
            file_ids = [input_file_ids.ref_genome_fasta, input_file_ids.genome_fasta,
                        input_file_ids.annotation_gp, input_file_ids.ref_db]
            for mode in args.transcript_modes:
                input_file_ids.modes[mode] = t.importFile('file://' + args.transcript_modes[mode]['gp'])
                file_ids.append(input_file_ids.modes[mode])
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk=disk_usage)
            results_file_ids = t.start(job)
        else:
            results_file_ids = t.restart()
        for file_path, file_id in results_file_ids.iteritems():
            tools.fileOps.ensure_file_dir(file_path)
            t.exportFile(file_id, 'file://' + file_path)

def _importExportFile(self, options, fail):
    with Toil(options) as toil:
        if not options.restart:
            srcFile = '%s/%s%s' % (self._tempDir, 'in', uuid.uuid4())
            with open(srcFile, 'w') as f:
                f.write('Hello')
            inputFileID = toil.importFile('file://' + srcFile)
            # Make sure that importFile returns the fileID wrapper
            self.assertIsInstance(inputFileID, FileID)
            self.assertEqual(os.stat(srcFile).st_size, inputFileID.size)
            # Write a boolean that determines whether the job fails.
            with toil._jobStore.writeFileStream() as (f, failFileID):
                self.failFileID = failFileID
                f.write(str(fail).encode('utf-8'))
            outputFileID = toil.start(RestartingJob(inputFileID, self.failFileID))
        else:
            # Set up job for failure
            with toil._jobStore.updateFileStream(self.failFileID) as f:
                f.write('False'.encode('utf-8'))
            outputFileID = toil.restart()
        toil.exportFile(outputFileID, 'file://' + self.dstFile)
        with open(self.dstFile, 'r') as f:
            assert f.read() == "HelloWorld!"

def _importExportFile(self, options, fail):
    with Toil(options) as toil:
        if not options.restart:
            srcFile = '%s/%s%s' % (self._tempDir, 'in', str(uuid.uuid4()))
            with open(srcFile, 'w') as f:
                f.write('Hello')
            inputFileID = toil.importFile('file://' + srcFile)
            # Make sure that importFile returns the fileID wrapper
            self.assertIsInstance(inputFileID, FileID)
            self.assertEqual(os.stat(srcFile).st_size, inputFileID.size)
            # Write a boolean that determines whether the job fails.
            failFilePath = '%s/%s%s' % (self._tempDir, 'failfile', str(uuid.uuid4()))
            with open(failFilePath, 'wb') as f:
                f.write(str(fail).encode('utf-8'))
            self.failFileID = toil.importFile('file://' + failFilePath)
            outputFileID = toil.start(RestartingJob(inputFileID, self.failFileID))
        else:
            # Set up job for failure
            # TODO: We're hackily updating this file without using the
            # correct FileStore interface. User code should not do this!
            with toil._jobStore.updateFileStream(self.failFileID) as f:
                f.write('False'.encode('utf-8'))
            outputFileID = toil.restart()
        toil.exportFile(outputFileID, 'file://' + self.dstFile)
        with open(self.dstFile, 'r') as f:
            assert f.read() == "HelloWorld!"

def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    opts = parse_args()
    opts.hubDir = os.path.abspath(opts.hubDir)
    opts.hals = [os.path.abspath(hal) for hal in opts.hals]
    if opts.batchSystem != 'singleMachine':
        raise RuntimeError("singleMachine is the only supported batchSystem")

    # Create labels for the HALs if none were provided
    if opts.labels is None:
        opts.labels = [os.path.basename(hal) for hal in opts.hals]
    if len(opts.labels) != len(opts.hals):
        raise ValueError("%d labels were provided, but %d hals were provided."
                         % (len(opts.labels), len(opts.hals)))

    # Ensure that the hals have some genomes in common, and take the
    # common genomes to display in the hub.
    genomess = [getGenomesInHal(hal) for hal in opts.hals]
    genomes = reduce(lambda a, i: a.intersection(i), genomess)
    if len(genomes) == 0:
        raise ValueError("No genomes in common between the HALs.")

    with Toil(opts) as toil:
        toil.start(Job.wrapJobFn(createHub, genomes, opts))

def main():
    # If I wanted to make this into a true command line tool, I'd fill out the parser.
    # Instead, I'm just going to add the bare minimum for making a workflow.
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    # parser.add_argument('output_dir', help='The dir to save the output, target bedfiles.', type=str)
    options = parser.parse_args()

    with Toil(options) as workflow:
        if not workflow.options.restart:
            # #importing files:
            # for asm, asm_file in assembly_files.items():
            #     assembly_files[asm] = workflow.importFile("file://" + os.path.abspath(asm_file))
            # hal_file = workflow.importFile("file://" + os.path.abspath(hal_file))
            liftovers = workflow.start(Job.wrapJobFn(calculate_bases_unmapped, assembly_files, hal_file, output_dir))
            for target_asm, liftovers_dict in liftovers.items():
                for source_asm, liftover_file in liftovers_dict.items():
                    workflow.exportFile(liftover_file,
                                        'file://' + os.path.abspath(output_dir) + "/" + source_asm +
                                        "_source_" + target_asm + "_target_liftover.bed")
        else:
            output = workflow.restart()

def main():
    parser = Job.Runner.getDefaultArgumentParser()
    parser.add_argument('packageFile')
    parser.add_argument('--outputDir', required=False, default=get_default_outputDir())
    parser.add_argument('--dockerDefer', required=False, default='RM')
    options = parser.parse_args()
    options.disableCaching = True

    with Toil(options) as toil:
        toil.config.dockerDefer = globals()[options.dockerDefer.upper()]
        toil._jobStore.writeConfig()
        if not toil.options.restart:
            storage = start(toil)
        else:
            storage = restart(toil)

        try:
            os.makedirs(options.outputDir)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        storage.exportLocalFiles(toil, options.outputDir)

def augustus_pb(args, toil_options):
    """
    Main entry function for AugustusPB toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    :return:
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.chrom_sizes), args.chrom_sizes)
            input_file_ids.pb_cfg = FileID.forPath(t.importFile('file://' + args.pb_cfg), args.pb_cfg)
            input_file_ids.hints_gff = FileID.forPath(t.importFile('file://' + args.hints_gff), args.hints_gff)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk='32G')
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.start(job)
        else:
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_pb_raw_gtf)
        t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_pb_raw_gtf)
        t.exportFile(gtf_file_id, 'file://' + args.augustus_pb_gtf)
        t.exportFile(joined_gp_file_id, 'file://' + args.augustus_pb_gp)

def runToilPipeline(self, alignmentsFile, alpha=0.001):
    # Tests the toil pipeline
    options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, "toil"))
    options.logLevel = self.logLevelString
    with Toil(options) as toil:
        # Import the input file into the job store
        inputAlignmentFileID = toil.importFile(makeURL(alignmentsFile))
        rootJob = Job.wrapJobFn(mappingQualityRescoring, inputAlignmentFileID,
                                minimumMapQValue=0, maxAlignmentsPerSite=1, alpha=alpha,
                                logLevel=self.logLevelString)
        primaryOutputAlignmentsFileID, secondaryOutputAlignmentsFileID = toil.start(rootJob)
        toil.exportFile(primaryOutputAlignmentsFileID, makeURL(self.simpleOutputCigarPath))
        toil.exportFile(secondaryOutputAlignmentsFileID, makeURL(self.simpleOutputCigarPath2))

    # Check output
    with open(self.simpleOutputCigarPath, 'r') as fh:
        primaryOutputCigars = [cigar[:-1] for cigar in fh.readlines()]  # Remove new lines
    with open(self.simpleOutputCigarPath2, 'r') as fh:
        secondaryOutputCigars = [cigar[:-1] for cigar in fh.readlines()]  # Remove new lines
    return primaryOutputCigars + secondaryOutputCigars

def userScript():
    from toil.job import Job
    from toil.common import Toil

    # A user-defined type, i.e. a type defined in the user script
    class X(object):
        pass

    # noinspection PyUnusedLocal
    def job(job, x, disk='10M', cores=1, memory='10M'):
        return x

    if __name__ == '__main__':
        options = Job.Runner.getDefaultArgumentParser().parse_args()
        x = X()
        with Toil(options) as toil:
            r = toil.start(Job.wrapJobFn(job, x).encapsulate())
            # Assert that the return value is of type X, but not X from the __main__
            # module but X from foo.bar, the canonical name for the user module. The
            # translation from __main__ to foo.bar is a side effect of hot-deployment.
            assert r.__class__ is not X
            import foo.bar
            assert r.__class__ is foo.bar.X
            # Assert that a copy was made. This is a side effect of pickling/unpickling.
            assert x is not r

def augustus(args, coding_gp, toil_options):
    """
    Main entry function for Augustus toil pipeline
    :param args: dictionary of arguments from CAT
    :param coding_gp: genePred with only coding transcripts
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.tm_cfg = FileID.forPath(t.importFile('file://' + args.tm_cfg), args.tm_cfg)
            input_file_ids.coding_gp = FileID.forPath(t.importFile('file://' + coding_gp), coding_gp)
            input_file_ids.ref_psl = FileID.forPath(t.importFile('file://' + args.ref_psl), args.ref_psl)
            input_file_ids.tm_psl = FileID.forPath(t.importFile('file://' + args.filtered_tm_psl), args.filtered_tm_psl)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            file_ids = [input_file_ids.genome_fasta, input_file_ids.coding_gp, input_file_ids.ref_psl,
                        input_file_ids.tm_psl, input_file_ids.annotation_gp]
            if args.augustus_tmr:
                input_file_ids.augustus_hints_db = FileID.forPath(t.importFile('file://' + args.augustus_hints_db),
                                                                  args.augustus_hints_db)
                input_file_ids.tmr_cfg = FileID.forPath(t.importFile('file://' + args.tmr_cfg), args.tmr_cfg)
                file_ids.append(args.augustus_hints_db)
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, disk_usage, disk=disk_usage)
            tm_file_id, tmr_file_id = t.start(job)
        else:
            tm_file_id, tmr_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_tm_gtf)
        t.exportFile(tm_file_id, 'file://' + args.augustus_tm_gtf)
        if tmr_file_id is not None:
            tools.fileOps.ensure_file_dir(args.augustus_tmr_gtf)
            t.exportFile(tmr_file_id, 'file://' + args.augustus_tmr_gtf)

def start(self):
    with Toil(toil_options) as workflow:
        input_models = [InputModel(workflow, model) for model in self._models]
        job = BatchModelfit(input_models)
        result_models = workflow.start(job)
        for model in result_models:
            export_files(workflow, model.modelfit_results.tool_files)

def setUp(self):
    super(JobWrapperTest, self).setUp()
    self.jobStorePath = self._getTestJobStorePath()
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    options = parser.parse_args(args=[self.jobStorePath])
    self.toil = Toil(options)
    self.assertEquals(self.toil, self.toil.__enter__())

def hints_db(hints_args, toil_options):
    """
    Entry point for hints database Toil pipeline.
    """
    def validate_import_bam(t, bam_path, fasta_sequences, genome):
        validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
        return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
                FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]

    fasta = pyfasta.Fasta(hints_args.fasta)
    fasta_sequences = {(x.split()[0], len(fasta[x])) for x in fasta.keys()}
    with Toil(toil_options) as t:
        if not t.options.restart:
            # load the RNA-seq data, if we have any
            bam_file_ids = {'BAM': {}, 'INTRONBAM': {}}
            for dtype in ['BAM', 'INTRONBAM']:
                if hints_args.genome not in hints_args.cfg[dtype]:
                    continue
                for bam_path in hints_args.cfg[dtype][hints_args.genome]:
                    bam_file_ids[dtype][os.path.basename(bam_path)] = validate_import_bam(t, bam_path,
                                                                                          fasta_sequences,
                                                                                          hints_args.genome)
            # load the IsoSeq data, if we have any
            iso_seq_file_ids = []
            if hints_args.genome in hints_args.cfg['ISO_SEQ_BAM']:
                for bam_path in hints_args.cfg['ISO_SEQ_BAM'][hints_args.genome]:
                    validate_bam_fasta_pairs(bam_path, fasta_sequences, hints_args.genome)
                    iso_seq_file_ids.append(validate_import_bam(t, bam_path, fasta_sequences, hints_args.genome))

            if hints_args.annotation_gp is None:
                annotation_file_id = None
            else:
                annotation_file_id = FileID.forPath(t.importFile('file://' + hints_args.annotation_gp),
                                                    hints_args.annotation_gp)
            if hints_args.protein_fasta is None:
                protein_fasta_file_id = genome_fasta_file_id = None
            else:
                protein_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.protein_fasta),
                                                       hints_args.protein_fasta)
                genome_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.fasta), hints_args.fasta)

            input_file_ids = {'bams': bam_file_ids,
                              'iso_seq_bams': iso_seq_file_ids,
                              'annotation': annotation_file_id,
                              'protein_fasta': protein_fasta_file_id,
                              'genome_fasta': genome_fasta_file_id}
            if len(input_file_ids['bams']) + len(input_file_ids['iso_seq_bams']) > 0:
                logger.info('All BAMs validated for {}. Beginning Toil hints pipeline'.format(hints_args.genome))
            disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids)
            job = Job.wrapJobFn(setup_hints, input_file_ids, disk=disk_usage)
            combined_hints = t.start(job)
        else:
            logger.info('Restarting Toil hints pipeline for {}.'.format(hints_args.genome))
            combined_hints = t.restart()
        tools.fileOps.ensure_file_dir(hints_args.hints_path)
        t.exportFile(combined_hints, 'file://' + hints_args.hints_path)

def test_download_url(self):
    from toil_lib.urls import download_url
    A = Job.wrapJobFn(download_url, work_dir=self.tmpdir, url='www.google.com', name='testy')
    with Toil(self.options) as toil:
        toil.start(A)
    assert os.path.exists(os.path.join(self.tmpdir, 'testy'))

def testExportAfterFailedExport(self):
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    try:
        with Toil(options) as toil:
            _ = toil.start(HelloWorld())
            # oh no, an error! :(
            raise RuntimeError("we died after workflow completion but before our export finished")
    except RuntimeError:
        pass
    options.restart = True
    with Toil(options) as toil:
        fileID = toil.restart()
        print(fileID)
        # Hopefully the error didn't cause us to lose all our work!
        toil.exportFile(fileID, 'file://' + self.exportPath)
    with open(self.exportPath) as f:
        # The file should have all our content
        self.assertEquals(f.read(), "Hello, World!")

def test_upload_and_download_with_encryption(self):
    from toil_lib.urls import s3am_upload
    from toil_lib.urls import download_url
    from boto.s3.connection import S3Connection, Bucket, Key
    # Create temporary encryption key
    key_path = os.path.join(self.tmpdir, 'foo.key')
    subprocess.check_call(['dd', 'if=/dev/urandom', 'bs=1', 'count=32', 'of={}'.format(key_path)])
    # Create test file
    upload_fpath = os.path.join(self.tmpdir, 'upload_file')
    with open(upload_fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    # Upload file
    random_key = os.path.join('test/', str(uuid4()), 'upload_file')
    s3_url = os.path.join('s3://cgl-driver-projects/', random_key)
    try:
        s3_dir = os.path.split(s3_url)[0]
        A = Job.wrapJobFn(s3am_upload, fpath=upload_fpath, s3_dir=s3_dir, s3_key_path=key_path)
        with Toil(self.options) as toil:
            toil.start(A)
        # Download the file
        B = Job.wrapJobFn(download_url, url=s3_url, name='download_file', work_dir=self.tmpdir,
                          s3_key_path=key_path)
        with Toil(self.options) as toil:
            toil.start(B)
        download_fpath = os.path.join(self.tmpdir, 'download_file')
        assert os.path.exists(download_fpath)
        assert filecmp.cmp(upload_fpath, download_fpath)
    finally:
        # Delete the Key. Key deletion never fails so we don't need to catch any exceptions
        with closing(S3Connection()) as conn:
            b = Bucket(conn, 'cgl-driver-projects')
            k = Key(b)
            k.key = random_key
            k.delete()

def main():
    args = cli()
    samples = parse_manifest(args.manifest)
    # Start Toil run
    with Toil(args) as toil:
        if not toil.options.restart:
            toil.start(Job.wrapJobFn(map_job, run_outlier_model, samples, args))
        else:
            toil.restart()

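# --- Illustrative sketch, not taken from any of the projects above ---
# The snippets above all follow the same pattern: import inputs into the job store,
# call toil.start() on a fresh run or toil.restart() when resuming, and export the
# results before the Toil context manager exits. The job function `concatenate` and
# the --inputs/--output arguments below are hypothetical placeholders; only the Toil
# calls themselves (importFile, start, restart, exportFile, and the job file store
# API) are real.
import os

from toil.common import Toil
from toil.job import Job


def concatenate(job, file_ids):
    """Read each imported file from the job store and return a file ID for the combined contents."""
    parts = []
    for file_id in file_ids:
        local_path = job.fileStore.readGlobalFile(file_id)
        with open(local_path) as f:
            parts.append(f.read())
    combined = job.fileStore.getLocalTempFile()
    with open(combined, 'w') as f:
        f.write(''.join(parts))
    return job.fileStore.writeGlobalFile(combined)


if __name__ == '__main__':
    parser = Job.Runner.getDefaultArgumentParser()
    parser.add_argument('--inputs', nargs='+', required=True, help='Local files to concatenate (hypothetical).')
    parser.add_argument('--output', required=True, help='Where to export the result (hypothetical).')
    options = parser.parse_args()
    with Toil(options) as toil:
        if not toil.options.restart:
            input_ids = [toil.importFile('file://' + os.path.abspath(p)) for p in options.inputs]
            output_id = toil.start(Job.wrapJobFn(concatenate, input_ids))
        else:
            output_id = toil.restart()
        toil.exportFile(output_id, 'file://' + os.path.abspath(options.output))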