def __init__(self, memory=None, cores=None, disk=None):
    """
    This method must be called by any overriding constructor.

    Memory is the maximum number of bytes of memory the job will require to
    run. Cores is the number of CPU cores required.
    """
    self.cores = cores
    self.memory = human2bytes(str(memory)) if memory is not None else memory
    self.disk = human2bytes(str(disk)) if disk is not None else disk
    # Private class variables

    # See Job.addChild
    self._children = []
    # See Job.addFollowOn
    self._followOns = []
    # See Job.addService
    self._services = []
    # A follow-on, service or child of a job A is a "successor" of A; if B
    # is a successor of A, then A is a predecessor of B.
    self._predecessors = set()
    # Note that self.__module__ is not necessarily this module, i.e. job.py. It is the module
    # defining the class self is an instance of, which may be a subclass of Job that may be
    # defined in a different module.
    self.userModule = ModuleDescriptor.forModule(self.__module__)
    # See Job.rv()
    self._rvs = {}
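# A minimal sketch of how the human-readable values above flow through this
# constructor; the ExampleJob subclass and the 1024-based byte counts in the
# comments are illustrative assumptions, not part of the original.
from toil.job import Job

class ExampleJob(Job):  # hypothetical subclass
    def __init__(self):
        # An overriding constructor must call Job.__init__; human-readable
        # strings are converted to byte counts by human2bytes.
        super(ExampleJob, self).__init__(memory='2G', cores=1, disk='3G')

j = ExampleJob()
print(j.memory)  # 2147483648, assuming customary 1024-based parsing
print(j.disk)    # 3221225472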
def find_total_disk_usage(input_file_ids, buffer='2G', round='2G'):
    """
    Takes an input_file_ids namespace, dict or list, finds all members that are
    FileID objects, and sums their sizes. Based on buffer and round, returns an
    integer number of bytes of disk usage to pass to a toil job.

    :param input_file_ids: A namespace object with an arbitrary nesting of possible file ID values
    :param buffer: Additional space buffer requested. Human readable, parsed by human2bytes
    :param round: Amount to round up to. Human readable, parsed by human2bytes
    :return: integer
    """
    def roundup(x, base):
        return int(math.ceil(x / float(base))) * base

    def descend_object(obj):
        if isinstance(obj, dict):
            for item in obj.values():
                for v in descend_object(item):
                    yield v
        elif isinstance(obj, list):
            for item in obj:
                for v in descend_object(item):
                    yield v
        elif isinstance(obj, argparse.Namespace):
            for item in obj.__dict__.values():
                for v in descend_object(item):
                    yield v
        elif isinstance(obj, FileID):
            yield obj

    tot = sum(x.size for x in descend_object(input_file_ids))
    return roundup(tot, human2bytes(round)) + human2bytes(buffer)
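# A minimal usage sketch, assuming Toil's FileID (a str subclass constructed
# as FileID(fileStoreID, size)); the import path, IDs and sizes are invented
# stand-ins for values normally produced by the file store.
import argparse
from toil.fileStore import FileID  # import path assumed for this Toil version

# Two invented 1 GiB files nested inside a namespace.
ids = argparse.Namespace(bam=FileID('file-id-1', 2 ** 30),
                         extras=[FileID('file-id-2', 2 ** 30)])

# 2 GiB total, rounded up to a multiple of '2G', plus a '2G' buffer.
disk = find_total_disk_usage(ids, buffer='2G', round='2G')
# disk can then be passed straight to job.addChildJobFn(..., disk=disk)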
def __init__(self, memory=None, cores=None, disk=None):
    """
    This method must be called by any overriding constructor.

    Memory is the maximum number of bytes of memory the job will require to
    run. Cores is the number of CPU cores required.
    """
    self.cores = cores
    self.memory = human2bytes(str(memory)) if memory is not None else memory
    self.disk = human2bytes(str(disk)) if disk is not None else disk
    # Private class variables

    # See Job.addChild
    self._children = []
    # See Job.addFollowOn
    self._followOns = []
    # See Job.addService
    self._services = []
    # A follow-on, service or child of a job A is a "direct successor" of A;
    # if B is a direct successor of A, then A is a "direct predecessor" of B.
    self._directPredecessors = set()
    # Note that self.__module__ is not necessarily this module, i.e. job.py. It is the module
    # defining the class self is an instance of, which may be a subclass of Job that may be
    # defined in a different module.
    self.userModule = ModuleDescriptor.forModule(self.__module__)
    # See Job.rv()
    self._rvs = {}
    self._promiseJobStore = None
def signalAlignJobFunction(job, config, alignment_shards):
    alignment_shards = chain(*alignment_shards)
    # each shard is a region of the genome/chromosome/contig and can be
    # methylation called independently
    all_methylation_probs = []  # contains the methylation probabilities for all of the shards together
    count = 0
    for aln_shard in alignment_shards:
        disk = 2 * config["reference_FileStoreID"].size
        memory = 6 * aln_shard.FileStoreID.size
        batch_disk = human2bytes("250M") + config["reference_FileStoreID"].size
        methylation_probs = job.addChildJobFn(shardSamJobFunction,
                                              config, aln_shard, None,
                                              calculateMethylationProbabilityJobFunction,
                                              callMethylationJobFunction,
                                              exonerateCigarStringFn=exonerateCigarWithStrandOrientation,
                                              batch_disk=batch_disk,
                                              disk=disk, memory=memory).rv()
        all_methylation_probs.append(methylation_probs)
        count += 1
    job.fileStore.logToMaster("[signalAlignJobFunction]Issued methylation calling "
                              "for %s alignment shards" % count)
    job.addFollowOnJobFn(consolidateVariantCallsJobFunction, config, all_methylation_probs)
    return
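# The sizing pattern above in isolation: requests scale off input sizes, with
# a fixed human-readable floor added via human2bytes. The stand-in sizes below
# are invented for illustration.
from bd2k.util.humanize import human2bytes

reference_size = human2bytes("3G")  # invented stand-in for config["reference_FileStoreID"].size
shard_size = human2bytes("100M")    # invented stand-in for aln_shard.FileStoreID.size

disk = 2 * reference_size                          # working room proportional to the reference
memory = 6 * shard_size                            # multiplier taken from the job above
batch_disk = human2bytes("250M") + reference_size  # fixed floor plus the reference itself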
def makeReadstoreJobFunction(job, config, samples):
    cores = config["download_cores"]
    tar_fids = [job.addChildJobFn(prepareFast5Tarfile,
                                  human2bytes(config["split_tars_bigger_than_this"]),
                                  config["put_this_many_reads_in_a_tar"],  # batchsize
                                  config["max_download_slots"],
                                  config["download_part_size"],
                                  sample,
                                  cores=cores,
                                  disk=(3 * sample.size)).rv()
                for sample in samples]
    job.addFollowOnJobFn(makeLedgerJobFunction, config, tar_fids)
def parse_line(line):
    # double-check input; shouldn't be needed, though
    require(not line.isspace() and not line.startswith("#"),
            "[parse_line]Invalid {}".format(line))
    sample_line = line.strip().split("\t")
    require(len(sample_line) == 4,
            "[parse_line]Invalid, len(line) != 4, offending {}".format(line))
    filetype, url, sample_label, size = sample_line
    # checks:
    # check filetype
    require(filetype in allowed_file_types,
            "[parse_line]Unrecognized file type {}".format(filetype))
    # check URL
    require(urlparse(url).scheme and urlparse(url),
            "Invalid URL passed for {}".format(url))
    return ReadstoreSample(file_type=filetype, URL=url, size=human2bytes(size),
                           sample_label=sample_label)
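# For illustration, a manifest line this parser would accept, assuming "tar"
# is among allowed_file_types; the URL and sample label are invented.
line = "tar\thttp://example.com/reads.tar\tsampleA\t2G"
sample = parse_line(line)
# sample.size is now an integer byte count (human2bytes("2G"));
# sample.URL and sample.sample_label carry through unchanged.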
def main():
    """
    GATK germline pipeline with variant filtering and annotation.
    """
    # Define Parser object and add to jobTree
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    # Generate subparsers
    subparsers = parser.add_subparsers(dest='command')
    subparsers.add_parser('generate-config',
                          help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest',
                          help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate',
                          help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the GATK germline pipeline')
    parser_run.add_argument('--config', required=True, type=str,
                            help='Path to the (filled in) config file, generated with '
                                 '"generate-config".')
    parser_run.add_argument('--manifest', type=str,
                            help='Path to the (filled in) manifest file, generated with '
                                 '"generate-manifest".\nDefault value: "%(default)s".')
    parser_run.add_argument('--sample', default=None, nargs=2, type=str,
                            help='Input sample identifier and BAM file URL or local path')
    parser_run.add_argument('--output-dir', default=None,
                            help='Path/URL to output directory')
    parser_run.add_argument('-s', '--suffix', default=None,
                            help='Additional suffix to add to the names of the output files')
    parser_run.add_argument('--preprocess-only', action='store_true',
                            help='Only runs preprocessing steps')
    Job.Runner.addToilOptions(parser_run)
    options = parser.parse_args()
    cwd = os.getcwd()
    if options.command == 'generate-config' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-germline.yaml'), generate_config)
    if options.command == 'generate-manifest' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-germline.tsv'), generate_manifest)
    elif options.command == 'run':
        # Program checks
        for program in ['curl', 'docker']:
            require(next(which(program), None),
                    '{} must be installed on every node.'.format(program))
        require(os.path.exists(options.config),
                '{} not found. Please run "generate-config"'.format(options.config))
        # Read sample manifest
        samples = []
        if options.manifest:
            samples.extend(parse_manifest(options.manifest))
        # Add BAM sample from command line
        if options.sample:
            uuid, url = options.sample
            # samples tuple: (uuid, url, paired_url, rg_line)
            # BAM samples should not have a paired URL or read group line
            samples.append(GermlineSample(uuid, url, None, None))
        require(len(samples) > 0,
                'No samples were detected in the manifest or on the command line')
        # Parse inputs
        inputs = {x.replace('-', '_'): y for x, y in
                  yaml.load(open(options.config).read()).iteritems()}
        required_fields = {'genome_fasta', 'output_dir', 'run_bwa', 'sorted',
                           'snp_filter_annotations', 'indel_filter_annotations',
                           'preprocess', 'preprocess_only', 'run_vqsr',
                           'joint_genotype', 'run_oncotator', 'cores',
                           'file_size', 'xmx', 'suffix'}
        input_fields = set(inputs.keys())
        require(input_fields > required_fields,
                'Missing config parameters:\n{}'.format(', '.join(required_fields - input_fields)))
        if inputs['output_dir'] is None:
            inputs['output_dir'] = options.output_dir
        require(inputs['output_dir'] is not None,
                'Missing output directory PATH/URL')
        if inputs['suffix'] is None:
            inputs['suffix'] = options.suffix if options.suffix else ''
        if inputs['preprocess_only'] is None:
            inputs['preprocess_only'] = options.preprocess_only
        if inputs['run_vqsr']:
            # Check that essential VQSR parameters are present
            vqsr_fields = {'g1k_snp', 'mills', 'dbsnp', 'hapmap', 'omni'}
            require(input_fields > vqsr_fields,
                    'Missing parameters for VQSR:\n{}'.format(', '.join(vqsr_fields - input_fields)))
        # Check that hard filtering parameters are present. If only running
        # preprocessing steps, then we do not need filtering information.
        elif not inputs['preprocess_only']:
            hard_filter_fields = {'snp_filter_name', 'snp_filter_expression',
                                  'indel_filter_name', 'indel_filter_expression'}
            require(input_fields > hard_filter_fields,
                    'Missing parameters for hard filtering:\n{}'.format(
                        ', '.join(hard_filter_fields - input_fields)))
            # Check for falsey hard filtering parameters
            for hard_filter_field in hard_filter_fields:
                require(inputs[hard_filter_field],
                        'Missing %s value for hard filtering, '
                        'got %s.' % (hard_filter_field, inputs[hard_filter_field]))
        # Set resource parameters
        inputs['xmx'] = human2bytes(inputs['xmx'])
        inputs['file_size'] = human2bytes(inputs['file_size'])
        inputs['cores'] = int(inputs['cores'])
        inputs['annotations'] = set(inputs['snp_filter_annotations'] +
                                    inputs['indel_filter_annotations'])
        # HaplotypeCaller test data for testing
        inputs['hc_output'] = inputs.get('hc_output', None)
        # It is a toil-scripts convention to store input parameters in a Namespace object
        config = argparse.Namespace(**inputs)
        root = Job.wrapJobFn(run_gatk_germline_pipeline, samples, config)
        Job.Runner.startToil(root, options)
def setOptions(self, options):
    """
    Creates a config object from the options object.
    """
    from bd2k.util.humanize import human2bytes  # This import is used to convert
    # from human readable quantities to integers

    def setOption(varName, parsingFn=None, checkFn=None):
        # If the options object has the option "varName" specified
        # then set the "varName" attrib to this value in the config object
        x = getattr(options, varName, None)
        if x is not None:
            if parsingFn is not None:
                x = parsingFn(x)
            if checkFn is not None:
                try:
                    checkFn(x)
                except AssertionError:
                    raise RuntimeError("The %s option has an invalid value: %s"
                                       % (varName, x))
            setattr(self, varName, x)

    # Function to parse an integer from a string expressed in different formats
    h2b = lambda x: human2bytes(str(x))

    def iC(minValue, maxValue=sys.maxint):
        # Returns a function that checks if a given int is in the given half-open interval
        assert isinstance(minValue, int) and isinstance(maxValue, int)
        return lambda x: minValue <= x < maxValue

    def fC(minValue, maxValue=None):
        # Returns a function that checks if a given float is in the given half-open interval
        assert isinstance(minValue, float)
        if maxValue is None:
            return lambda x: minValue <= x
        else:
            assert isinstance(maxValue, float)
            return lambda x: minValue <= x < maxValue

    # Core options
    setOption("jobStore",
              parsingFn=lambda x: os.path.abspath(x) if options.jobStore.startswith('.') else x)
    # TODO: LOG LEVEL STRING
    setOption("workDir")
    setOption("stats")
    setOption("cleanWorkDir")
    setOption("clean")
    if self.stats:
        if self.clean != "never" and self.clean is not None:
            raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                               "despite the stats flag requiring "
                               "the jobStore to be intact at the end of the run. "
                               "Set clean to 'never'" % self.clean)
        self.clean = "never"
    elif self.clean is None:
        self.clean = "onSuccess"

    # Restarting the workflow options
    setOption("restart")

    # Batch system options
    setOption("batchSystem")
    setOption("scale", float, fC(0.0))
    setOption("mesosMasterAddress")
    setOption("parasolCommand")
    setOption("parasolMaxBatches", int, iC(1))
    setOption("environment", parseSetEnv)

    # Autoscaling options
    setOption("provisioner")
    setOption("preemptableNodeOptions")
    setOption("minPreemptableNodes", int)
    setOption("maxPreemptableNodes", int)
    setOption("nodeOptions")
    setOption("minNodes", int)
    setOption("maxNodes", int)
    setOption("alphaPacking", float)
    setOption("betaInertia", float)
    setOption("scaleInterval", float)

    # Resource requirements
    setOption("defaultMemory", h2b, iC(1))
    setOption("defaultCores", float, fC(1.0))
    setOption("defaultDisk", h2b, iC(1))
    setOption("readGlobalFileMutableByDefault")
    setOption("maxCores", int, iC(1))
    setOption("maxMemory", h2b, iC(1))
    setOption("maxDisk", h2b, iC(1))
    setOption("defaultPreemptable")

    # Retrying/rescuing jobs
    setOption("retryCount", int, iC(0))
    setOption("maxJobDuration", int, iC(1))
    setOption("rescueJobsFrequency", int, iC(1))

    # Misc
    setOption("maxLogFileSize", h2b, iC(1))

    def checkSse(sseKey):
        with open(sseKey) as f:
            assert len(f.readline().rstrip()) == 32

    setOption("sseKey", checkFn=checkSse)
    setOption("cseKey", checkFn=checkSse)
    setOption("servicePollingInterval", float, fC(0.0))

    # Debug options
    setOption("badWorker", float, fC(0.0, 1.0))
    setOption("badWorkerFailInterval", float, fC(0.0))
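# A small hedged demo of the helpers above; the byte counts in the comments
# assume bd2k's customary 1024-based units.
import sys
from bd2k.util.humanize import human2bytes

h2b = lambda x: human2bytes(str(x))
print(h2b('1000M'))  # 1048576000
print(h2b('2G'))     # 2147483648

# iC(1) builds a predicate for the half-open interval [1, sys.maxint);
# setOption wraps a failed assertion in a RuntimeError naming the option.
in_range = lambda x: 1 <= x < sys.maxint
assert in_range(h2b('2G'))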
def setOptions(self, options):
    """
    Creates a config object from the options object.
    """
    from bd2k.util.humanize import human2bytes  # This import is used to convert
    # from human readable quantities to integers

    def setOption(varName, parsingFn=None, checkFn=None):
        # If the options object has the option "varName" specified
        # then set the "varName" attrib to this value in the config object
        x = getattr(options, varName, None)
        if x is not None:
            if parsingFn is not None:
                x = parsingFn(x)
            if checkFn is not None:
                try:
                    checkFn(x)
                except AssertionError:
                    raise RuntimeError("The %s option has an invalid value: %s"
                                       % (varName, x))
            setattr(self, varName, x)

    # Function to parse an integer from a string expressed in different formats
    h2b = lambda x: human2bytes(str(x))

    def iC(minValue, maxValue=sys.maxint):
        # Returns a function that checks if a given int is in the given half-open interval
        assert isinstance(minValue, int) and isinstance(maxValue, int)
        return lambda x: minValue <= x < maxValue

    def fC(minValue, maxValue=None):
        # Returns a function that checks if a given float is in the given half-open interval
        assert isinstance(minValue, float)
        if maxValue is None:
            return lambda x: minValue <= x
        else:
            assert isinstance(maxValue, float)
            return lambda x: minValue <= x < maxValue

    def parseJobStore(s):
        name, rest = Toil.parseLocator(s)
        if name == 'file':
            # We need to resolve relative paths early, on the leader, because the worker process
            # may have a different working directory than the leader, e.g. under Mesos.
            return Toil.buildLocator(name, os.path.abspath(rest))
        else:
            return s

    # Core options
    setOption("jobStore", parsingFn=parseJobStore)
    # TODO: LOG LEVEL STRING
    setOption("workDir")
    setOption("stats")
    setOption("cleanWorkDir")
    setOption("clean")
    if self.stats:
        if self.clean != "never" and self.clean is not None:
            raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                               "despite the stats flag requiring "
                               "the jobStore to be intact at the end of the run. "
                               "Set clean to 'never'" % self.clean)
        self.clean = "never"
    elif self.clean is None:
        self.clean = "onSuccess"

    # Restarting the workflow options
    setOption("restart")

    # Batch system options
    setOption("batchSystem")
    setOption("scale", float, fC(0.0))
    setOption("mesosMasterAddress")
    setOption("parasolCommand")
    setOption("parasolMaxBatches", int, iC(1))
    setOption("environment", parseSetEnv)

    # Autoscaling options
    setOption("provisioner")
    setOption("nodeType")
    setOption("nodeOptions")
    setOption("minNodes", int)
    setOption("maxNodes", int)
    setOption("preemptableNodeType")
    setOption("preemptableNodeOptions")
    setOption("minPreemptableNodes", int)
    setOption("maxPreemptableNodes", int)
    setOption("alphaPacking", float)
    setOption("betaInertia", float)
    setOption("scaleInterval", float)
    setOption("preemptableCompensation", float)
    require(0.0 <= self.preemptableCompensation <= 1.0,
            '--preemptableCompensation (%f) must be >= 0.0 and <= 1.0',
            self.preemptableCompensation)

    # Resource requirements
    setOption("defaultMemory", h2b, iC(1))
    setOption("defaultCores", float, fC(1.0))
    setOption("defaultDisk", h2b, iC(1))
    setOption("readGlobalFileMutableByDefault")
    setOption("maxCores", int, iC(1))
    setOption("maxMemory", h2b, iC(1))
    setOption("maxDisk", h2b, iC(1))
    setOption("defaultPreemptable")

    # Retrying/rescuing jobs
    setOption("retryCount", int, iC(0))
    setOption("maxJobDuration", int, iC(1))
    setOption("rescueJobsFrequency", int, iC(1))

    # Misc
    setOption("disableCaching")
    setOption("maxLogFileSize", h2b, iC(1))

    def checkSse(sseKey):
        with open(sseKey) as f:
            assert len(f.readline().rstrip()) == 32

    setOption("sseKey", checkFn=checkSse)
    setOption("cseKey", checkFn=checkSse)
    setOption("servicePollingInterval", float, fC(0.0))

    # Debug options
    setOption("badWorker", float, fC(0.0, 1.0))
    setOption("badWorkerFailInterval", float, fC(0.0))
# See the License for the specific language governing permissions and
# limitations under the License.

"""A demonstration of toil. Sorts the lines of a file into ascending order by
doing a parallel merge sort.
"""
from __future__ import absolute_import
from argparse import ArgumentParser
import os
import random
import logging
import shutil

from bd2k.util.humanize import human2bytes

from toil.job import Job
from toil.test.sort.lib import merge, sort, copySubRangeOfFile, getMidPoint

sortMemory = human2bytes('1000M')


def setup(job, inputFile, N, downCheckpoints):
    """Sets up the sort.
    """
    # Ensure default resource requirements are being set correctly
    assert job.cores is not None
    assert job.disk is not None
    assert job.preemptable is not None
    # Ensure user-specified resource requirements are being set correctly
    assert job.memory is not None
    # Write the input file to the file store
    inputFileStoreID = job.fileStore.writeGlobalFile(inputFile, True)
    job.fileStore.logToMaster(" Starting the merge sort ")
    job.addFollowOnJobFn(
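# A hedged sketch of driving this example with a user-specified memory
# requirement, which is what the asserts in setup verify; the file name and N
# are invented.
options = Job.Runner.getDefaultOptions('./sortJobStore')
root = Job.wrapJobFn(setup, 'fileToSort.txt', N=10000, downCheckpoints=False,
                     memory=sortMemory)  # user-specified, so setup's assert holds
Job.Runner.startToil(root, options)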
def setOptions(self, options):
    """
    Creates a config object from the options object.
    """
    from bd2k.util.humanize import human2bytes  # This import is used to convert
    # from human readable quantities to integers

    def setOption(varName, parsingFn=None, checkFn=None):
        # If the options object has the option "varName" specified
        # then set the "varName" attrib to this value in the config object
        x = getattr(options, varName)
        if x is not None:
            if parsingFn is not None:
                x = parsingFn(x)
            if checkFn is not None:
                try:
                    checkFn(x)
                except AssertionError:
                    raise RuntimeError("The %s option has an invalid value: %s"
                                       % (varName, x))
            setattr(self, varName, x)

    # Function to parse an integer from a string expressed in different formats
    h2b = lambda x: human2bytes(str(x))

    def iC(minValue, maxValue=sys.maxint):
        # Returns a function to check that a parameter is in a valid range
        def f(x):
            assert x >= minValue and x < maxValue
        return f

    # Core options
    setOption("jobStore",
              parsingFn=lambda x: os.path.abspath(x) if options.jobStore.startswith('.') else x)
    # TODO: LOG LEVEL STRING
    setOption("workDir")
    setOption("stats")
    setOption("clean")
    if self.stats:
        if self.clean != "never":
            raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                               "despite the stats flag requiring "
                               "the jobStore to be intact at the end of the run. "
                               "Set clean to 'never'" % self.clean)

    # Restarting the workflow options
    setOption("restart")

    # Batch system options
    setOption("batchSystem")
    setOption("scale", float)
    setOption("masterIP")
    setOption("parasolCommand")

    # Resource requirements
    setOption("defaultMemory", h2b, iC(1))
    setOption("defaultCores", h2b, iC(1))
    setOption("defaultDisk", h2b, iC(1))
    setOption("maxCores", h2b, iC(1))
    setOption("maxMemory", h2b, iC(1))
    setOption("maxDisk", h2b, iC(1))

    # Retrying/rescuing jobs
    setOption("retryCount", int, iC(0))
    setOption("maxJobDuration", int, iC(1))
    setOption("rescueJobsFrequency", int, iC(1))

    # Misc
    setOption("maxLogFileSize", h2b, iC(1))

    def checkSse(sseKey):
        with open(sseKey) as f:
            assert len(f.readline().rstrip()) == 32

    setOption("sseKey", checkFn=checkSse)
# See the License for the specific language governing permissions and
# limitations under the License.

"""A demonstration of toil. Sorts the lines of a file into ascending order by
doing a parallel merge sort.
"""
from __future__ import absolute_import
from argparse import ArgumentParser
import os
import random

from bd2k.util.humanize import human2bytes

from toil.job import Job
from toil.test.sort.lib import merge, sort, copySubRangeOfFile, getMidPoint

success_ratio = 0.5
sortMemory = human2bytes('1000M')


def setup(job, inputFile, N):
    """Sets up the sort.
    """
    job.addFollowOnJobFn(cleanup,
                         job.addChildJobFn(down, inputFile, 0,
                                           os.path.getsize(inputFile), N).rv(),
                         inputFile, memory=sortMemory)


def down(job, inputFile, fileStart, fileEnd, N):
    """Input is a file and a range into that file to sort and an output location in which
    to write the sorted file. If the range is larger than a threshold N the range is
    divided recursively and a follow-on job is then created which merges back the
    results, else the file is sorted and placed in the output.
    """
    if random.random() > success_ratio:
def setOptions(self, options):
    """
    Creates a config object from the options object.
    """
    from bd2k.util.humanize import human2bytes  # This import is used to convert
    # from human readable quantities to integers

    def setOption(varName, parsingFn=None, checkFn=None):
        # If the options object has the option "varName" specified
        # then set the "varName" attrib to this value in the config object
        x = getattr(options, varName, None)
        if x is not None:
            if parsingFn is not None:
                x = parsingFn(x)
            if checkFn is not None:
                try:
                    checkFn(x)
                except AssertionError:
                    raise RuntimeError("The %s option has an invalid value: %s"
                                       % (varName, x))
            setattr(self, varName, x)

    # Function to parse an integer from a string expressed in different formats
    h2b = lambda x: human2bytes(str(x))

    def iC(minValue, maxValue=sys.maxint):
        # Returns a function that checks if a given int is in the given half-open interval
        assert isinstance(minValue, int) and isinstance(maxValue, int)
        return lambda x: minValue <= x < maxValue

    def fC(minValue, maxValue=None):
        # Returns a function that checks if a given float is in the given half-open interval
        assert isinstance(minValue, float)
        if maxValue is None:
            return lambda x: minValue <= x
        else:
            assert isinstance(maxValue, float)
            return lambda x: minValue <= x < maxValue

    # Core options
    setOption("jobStore",
              parsingFn=lambda x: os.path.abspath(x) if options.jobStore.startswith('.') else x)
    # TODO: LOG LEVEL STRING
    setOption("workDir")
    setOption("stats")
    setOption("clean")
    if self.stats:
        if self.clean != "never" and self.clean is not None:
            raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                               "despite the stats flag requiring "
                               "the jobStore to be intact at the end of the run. "
                               "Set clean to 'never'" % self.clean)
        self.clean = "never"
    elif self.clean is None:
        self.clean = "onSuccess"

    # Restarting the workflow options
    setOption("restart")

    # Batch system options
    setOption("batchSystem")
    setOption("scale", float, fC(0.0))
    setOption("mesosMasterAddress")
    setOption("parasolCommand")
    setOption("parasolMaxBatches", int, iC(1))
    setOption("environment", parseSetEnv)

    # Resource requirements
    setOption("defaultMemory", h2b, iC(1))
    setOption("defaultCores", float, fC(1.0))
    setOption("defaultDisk", h2b, iC(1))
    setOption("defaultCache", h2b, iC(0))
    setOption("maxCores", int, iC(1))
    setOption("maxMemory", h2b, iC(1))
    setOption("maxDisk", h2b, iC(1))

    # Retrying/rescuing jobs
    setOption("retryCount", int, iC(0))
    setOption("maxJobDuration", int, iC(1))
    setOption("rescueJobsFrequency", int, iC(1))

    # Misc
    setOption("maxLogFileSize", h2b, iC(1))

    def checkSse(sseKey):
        with open(sseKey) as f:
            assert len(f.readline().rstrip()) == 32

    setOption("sseKey", checkFn=checkSse)
    setOption("cseKey", checkFn=checkSse)

    # Debug options
    setOption("badWorker", float, fC(0.0, 1.0))
    setOption("badWorkerFailInterval", float, fC(0.0))
def setOptions(self, options):
    """
    Creates a config object from the options object.
    """
    from bd2k.util.humanize import human2bytes  # This import is used to convert
    # from human readable quantities to integers

    def setOption(varName, parsingFn=None, checkFn=None):
        # If the options object has the option "varName" specified
        # then set the "varName" attrib to this value in the config object
        x = getattr(options, varName, None)
        if x is not None:
            if parsingFn is not None:
                x = parsingFn(x)
            if checkFn is not None:
                try:
                    checkFn(x)
                except AssertionError:
                    raise RuntimeError("The %s option has an invalid value: %s"
                                       % (varName, x))
            setattr(self, varName, x)

    # Function to parse an integer from a string expressed in different formats
    h2b = lambda x: human2bytes(str(x))

    def iC(minValue, maxValue=sys.maxint):
        # Returns a function to check that a parameter is in a valid range
        def f(x):
            assert x >= minValue and x < maxValue
        return f

    # Core options
    setOption("jobStore",
              parsingFn=lambda x: os.path.abspath(x) if options.jobStore.startswith('.') else x)
    # TODO: LOG LEVEL STRING
    setOption("workDir")
    setOption("stats")
    setOption("clean")
    if self.stats:
        if self.clean != "never" and self.clean is not None:
            raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                               "despite the stats flag requiring "
                               "the jobStore to be intact at the end of the run. "
                               "Set clean to 'never'" % self.clean)
        self.clean = "never"
    elif self.clean is None:
        self.clean = "onSuccess"

    # Restarting the workflow options
    setOption("restart")

    # Batch system options
    setOption("batchSystem")
    setOption("scale", float)
    setOption("masterIP")
    setOption("parasolCommand")

    # Resource requirements
    setOption("defaultMemory", h2b, iC(1))
    setOption("defaultCores", h2b, iC(1))
    setOption("defaultDisk", h2b, iC(1))
    setOption("maxCores", h2b, iC(1))
    setOption("maxMemory", h2b, iC(1))
    setOption("maxDisk", h2b, iC(1))

    # Retrying/rescuing jobs
    setOption("retryCount", int, iC(0))
    setOption("maxJobDuration", int, iC(1))
    setOption("rescueJobsFrequency", int, iC(1))

    # Misc
    setOption("maxLogFileSize", h2b, iC(1))

    def checkSse(sseKey):
        with open(sseKey) as f:
            assert len(f.readline().rstrip()) == 32

    setOption("sseKey", checkFn=checkSse)
    setOption("cseKey", checkFn=checkSse)
def setOptions(self, options):
    """
    Creates a config object from the options object.
    """
    from bd2k.util.humanize import human2bytes  # This import is used to convert
    # from human readable quantities to integers

    def setOption(varName, parsingFn=None, checkFn=None):
        # If the options object has the option "varName" specified
        # then set the "varName" attrib to this value in the config object
        x = getattr(options, varName, None)
        if x is not None:
            if parsingFn is not None:
                x = parsingFn(x)
            if checkFn is not None:
                try:
                    checkFn(x)
                except AssertionError:
                    raise RuntimeError("The %s option has an invalid value: %s"
                                       % (varName, x))
            setattr(self, varName, x)

    # Function to parse an integer from a string expressed in different formats
    h2b = lambda x: human2bytes(str(x))

    def iC(minValue, maxValue=sys.maxint):
        # Returns a function that checks if a given int is in the given half-open interval
        assert isinstance(minValue, int) and isinstance(maxValue, int)
        return lambda x: minValue <= x < maxValue

    def fC(minValue, maxValue=None):
        # Returns a function that checks if a given float is in the given half-open interval
        assert isinstance(minValue, float)
        if maxValue is None:
            return lambda x: minValue <= x
        else:
            assert isinstance(maxValue, float)
            return lambda x: minValue <= x < maxValue

    def parseJobStore(s):
        name, rest = Toil.parseLocator(s)
        if name == 'file':
            # We need to resolve relative paths early, on the leader, because the worker process
            # may have a different working directory than the leader, e.g. under Mesos.
            return Toil.buildLocator(name, os.path.abspath(rest))
        else:
            return s

    # Core options
    setOption("jobStore", parsingFn=parseJobStore)
    # TODO: LOG LEVEL STRING
    setOption("workDir")
    if self.workDir is not None:
        self.workDir = os.path.abspath(self.workDir)
        if not os.path.exists(self.workDir):
            raise RuntimeError("The path provided to --workDir (%s) does not exist."
                               % self.workDir)
    setOption("stats")
    setOption("cleanWorkDir")
    setOption("clean")
    if self.stats:
        if self.clean != "never" and self.clean is not None:
            raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                               "despite the stats flag requiring "
                               "the jobStore to be intact at the end of the run. "
                               "Set clean to 'never'" % self.clean)
        self.clean = "never"
    elif self.clean is None:
        self.clean = "onSuccess"

    # Restarting the workflow options
    setOption("restart")

    # Batch system options
    setOption("batchSystem")
    setOption("scale", float, fC(0.0))
    setOption("mesosMasterAddress")
    setOption("parasolCommand")
    setOption("parasolMaxBatches", int, iC(1))
    setOption("environment", parseSetEnv)

    # Autoscaling options
    setOption("provisioner")
    setOption("nodeType")
    setOption("nodeOptions")
    setOption("minNodes", int)
    setOption("maxNodes", int)
    setOption("preemptableNodeType")
    setOption("preemptableNodeOptions")
    setOption("minPreemptableNodes", int)
    setOption("maxPreemptableNodes", int)
    setOption("alphaPacking", float)
    setOption("betaInertia", float)
    setOption("scaleInterval", float)
    setOption("preemptableCompensation", float)
    require(0.0 <= self.preemptableCompensation <= 1.0,
            '--preemptableCompensation (%f) must be >= 0.0 and <= 1.0',
            self.preemptableCompensation)

    # Parameters to limit service jobs / detect deadlocks
    setOption("maxServiceJobs", int)
    setOption("maxPreemptableServiceJobs", int)
    setOption("deadlockWait", int)

    # Resource requirements
    setOption("defaultMemory", h2b, iC(1))
    setOption("defaultCores", float, fC(1.0))
    setOption("defaultDisk", h2b, iC(1))
    setOption("readGlobalFileMutableByDefault")
    setOption("maxCores", int, iC(1))
    setOption("maxMemory", h2b, iC(1))
    setOption("maxDisk", h2b, iC(1))
    setOption("defaultPreemptable")

    # Retrying/rescuing jobs
    setOption("retryCount", int, iC(0))
    setOption("maxJobDuration", int, iC(1))
    setOption("rescueJobsFrequency", int, iC(1))

    # Misc
    setOption("disableCaching")
    setOption("maxLogFileSize", h2b, iC(1))

    def checkSse(sseKey):
        with open(sseKey) as f:
            assert len(f.readline().rstrip()) == 32

    setOption("sseKey", checkFn=checkSse)
    setOption("cseKey", checkFn=checkSse)
    setOption("servicePollingInterval", float, fC(0.0))

    # Debug options
    setOption("badWorker", float, fC(0.0, 1.0))
    setOption("badWorkerFailInterval", float, fC(0.0))