Example #1
0
 def __init__(self, memory=None, cores=None, disk=None):
     """
     Store the resource requirements of the job and set up its private state.

     This method must be called by any overriding constructor.

     :param memory: memory requirement; a value other than None is converted to a
            string and parsed with human2bytes, None is stored unchanged.
     :param cores: the number of CPU cores required; stored as given.
     :param disk: disk requirement; handled exactly like memory.
     """
     self.cores = cores
     self.memory = None if memory is None else human2bytes(str(memory))
     self.disk = None if disk is None else human2bytes(str(disk))

     # Private instance state

     # Filled by Job.addChild
     self._children = []
     # Filled by Job.addFollowOn
     self._followOns = []
     # Filled by Job.addService
     self._services = []
     # A follow-on, service or child B of a job A is a "successor" of A; if B is a
     # successor of A, then A is a predecessor of B.
     self._predecessors = set()
     # self.__module__ names the module that defines the class of this instance. That is
     # not necessarily this module (job.py): the instance may belong to a subclass of Job
     # that lives in a different module.
     self.userModule = ModuleDescriptor.forModule(self.__module__)
     # See Job.rv()
     self._rvs = {}
def find_total_disk_usage(input_file_ids, buffer='2G', round='2G'):
    """
    Estimates the disk requirement, in bytes, of a toil job that uses the given input files.

    Walks input_file_ids, sums the ``size`` attribute of every FileID object found, rounds that total
    up to a multiple of ``round`` and finally adds ``buffer``.

    :param input_file_ids: A FileID, or a dict, list or argparse.Namespace with an arbitrary nesting of
                           possible file ID values. Members of any other type are ignored.
    :param buffer: Additional space buffer requested. Human readable, parsed by human2bytes
    :param round: Amount to round up to. Human readable, parsed by human2bytes
    :return: integer
    """
    def roundup(x, base):
        # Ceiling division done with integer arithmetic. Dividing through float() loses precision once
        # x reaches 2**53 and could then round *down*, under-requesting disk space.
        return int(-(-x // base)) * base

    def descend_object(obj):
        # Yields every FileID reachable through dicts, lists and Namespaces.
        if isinstance(obj, dict):
            for item in obj.values():
                for v in descend_object(item):
                    yield v
        elif isinstance(obj, list):
            for item in obj:
                for v in descend_object(item):
                    yield v
        elif isinstance(obj, argparse.Namespace):
            for item in obj.__dict__.values():
                for v in descend_object(item):
                    yield v
        elif isinstance(obj, FileID):
            yield obj

    tot = sum(x.size for x in descend_object(input_file_ids))
    return roundup(tot, human2bytes(round)) + human2bytes(buffer)
def find_total_disk_usage(input_file_ids, buffer='2G', round='2G'):
    """
    Estimates the disk requirement, in bytes, of a toil job that uses the given input files.

    Walks input_file_ids, sums the ``size`` attribute of every FileID object found, rounds that total
    up to a multiple of ``round`` and finally adds ``buffer``.

    :param input_file_ids: A FileID, or a dict, list or argparse.Namespace with an arbitrary nesting of
                           possible file ID values. Members of any other type are ignored.
    :param buffer: Additional space buffer requested. Human readable, parsed by human2bytes
    :param round: Amount to round up to. Human readable, parsed by human2bytes
    :return: integer
    """
    def roundup(x, base):
        # Ceiling division done with integer arithmetic. Dividing through float() loses precision once
        # x reaches 2**53 and could then round *down*, under-requesting disk space.
        return int(-(-x // base)) * base

    def descend_object(obj):
        # Yields every FileID reachable through dicts, lists and Namespaces.
        if isinstance(obj, dict):
            for item in obj.values():
                for v in descend_object(item):
                    yield v
        elif isinstance(obj, list):
            for item in obj:
                for v in descend_object(item):
                    yield v
        elif isinstance(obj, argparse.Namespace):
            for item in obj.__dict__.values():
                for v in descend_object(item):
                    yield v
        elif isinstance(obj, FileID):
            yield obj

    tot = sum(x.size for x in descend_object(input_file_ids))
    return roundup(tot, human2bytes(round)) + human2bytes(buffer)
Example #4
0
    def __init__(self, memory=None, cores=None, disk=None):
        """
        Store the resource requirements of the job and set up its private state.

        This method must be called by any overriding constructor.

        :param memory: memory requirement; a value other than None is converted to a
               string and parsed with human2bytes, None is stored unchanged.
        :param cores: the number of CPU cores required; stored as given.
        :param disk: disk requirement; handled exactly like memory.
        """
        self.cores = cores
        self.memory = None if memory is None else human2bytes(str(memory))
        self.disk = None if disk is None else human2bytes(str(disk))

        # Private instance state

        # Filled by Job.addChild
        self._children = []
        # Filled by Job.addFollowOn
        self._followOns = []
        # Filled by Job.addService
        self._services = []
        # A follow-on, service or child B of a job A is a "direct successor" of A; if B is
        # a direct successor of A, then A is a "direct predecessor" of B.
        self._directPredecessors = set()
        # self.__module__ names the module that defines the class of this instance. That is
        # not necessarily this module (job.py): the instance may belong to a subclass of Job
        # that lives in a different module.
        self.userModule = ModuleDescriptor.forModule(self.__module__)
        # See Job.rv()
        self._rvs = {}
        # No promise job store is associated with the job at construction time
        self._promiseJobStore = None
Example #5
0
def signalAlignJobFunction(job, config, alignment_shards):
    """
    Adds one shardSamJobFunction child job per alignment shard and a follow-on job that
    receives the return values of all of those children.

    :param job: the job the children and the follow-on are attached to
    :param config: mapping; config["reference_FileStoreID"].size is used to size the disk requests
    :param alignment_shards: iterable of iterables of shards, flattened with chain(); every
           shard must expose FileStoreID.size, which is used to size the memory request
    :return: None
    """
    # each shard is a region of the genome/chromosome/contig and can be methylation called
    # independently
    shard_results = []  # the .rv() of every child, i.e. the methylation probabilities of all shards
    for shard in chain(*alignment_shards):
        reference_size = config["reference_FileStoreID"].size
        shard_memory = 6 * shard.FileStoreID.size
        shard_batch_disk = human2bytes("250M") + reference_size
        child = job.addChildJobFn(shardSamJobFunction,
                                  config,
                                  shard,
                                  None,
                                  calculateMethylationProbabilityJobFunction,
                                  callMethylationJobFunction,
                                  exonerateCigarStringFn=exonerateCigarWithStrandOrientation,
                                  batch_disk=shard_batch_disk,
                                  disk=2 * reference_size,
                                  memory=shard_memory)
        shard_results.append(child.rv())

    job.fileStore.logToMaster(
        "[signalAlignJobFunction]Issued methylation calling for %s alignment shards"
        % len(shard_results))
    job.addFollowOnJobFn(consolidateVariantCallsJobFunction, config, shard_results)
Example #6
0
def makeReadstoreJobFunction(job, config, samples):
    """
    Adds one prepareFast5Tarfile child job per sample and a makeLedgerJobFunction follow-on
    that receives the return values of all of those children.

    :param job: the job the children and the follow-on are attached to
    :param config: mapping providing the download and tar-splitting settings read below
    :param samples: iterable of samples; each must expose a ``size`` attribute, three times
           which is requested as disk for its child job
    """
    cores = config["download_cores"]
    tar_fids = []
    for sample in samples:
        child = job.addChildJobFn(
            prepareFast5Tarfile,
            human2bytes(config["split_tars_bigger_than_this"]),
            config["put_this_many_reads_in_a_tar"],  # batchsize
            config["max_download_slots"],
            config["download_part_size"],
            sample,
            cores=cores,
            disk=(3 * sample.size))
        tar_fids.append(child.rv())
    job.addFollowOnJobFn(makeLedgerJobFunction, config, tar_fids)
Example #7
0
    def parse_line(line):
        """
        Turns one tab-separated line into a ReadstoreSample.

        The line must hold exactly four fields, in this order: file type, URL, sample label
        and size. Every check goes through require().

        :param line: the raw line; must be neither whitespace-only nor start with "#"
        :return: ReadstoreSample whose size has been parsed with human2bytes
        """
        # double check input, shouldn't need to though
        blank_or_comment = line.isspace() or line.startswith("#")
        require(not blank_or_comment, "[parse_line]Invalid {}".format(line))

        fields = line.strip().split("\t")
        require(len(fields) == 4, "[parse_line]Invalid, len(line) != 4, offending {}".format(line))
        filetype, url, sample_label, size = fields

        # the file type has to be one of the known ones
        require(filetype in allowed_file_types, "[parse_line]Unrecognized file type {}".format(filetype))

        # the URL has to carry a scheme
        parsed_url = urlparse(url)
        require(parsed_url.scheme and parsed_url,
                "Invalid URL passed for {}".format(url))

        return ReadstoreSample(file_type=filetype,
                               URL=url,
                               size=human2bytes(size),
                               sample_label=sample_label)
Example #8
0
def main():
    """
    GATK germline pipeline with variant filtering and annotation.

    Parses the command line and dispatches on the chosen sub-command:

    - generate-config / generate-manifest / generate: write an editable config and/or manifest
      into the current working directory.
    - run: validate the environment, the config file and the samples, then start the Toil
      workflow rooted at run_gatk_germline_pipeline.

    All validation goes through require().
    """
    # Define Parser object and add to jobTree
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter)

    # Generate subparsers
    subparsers = parser.add_subparsers(dest='command')
    subparsers.add_parser('generate-config',
                          help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest',
                          help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate',
                          help='Generates a config and manifest in the current working directory.')

    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the GATK germline pipeline')
    parser_run.add_argument('--config',
                            required=True,
                            type=str,
                            help='Path to the (filled in) config file, generated with '
                                 '"generate-config".')
    parser_run.add_argument('--manifest',
                            type=str,
                            help='Path to the (filled in) manifest file, generated with '
                                 '"generate-manifest".\nDefault value: "%(default)s".')
    parser_run.add_argument('--sample',
                            default=None,
                            nargs=2,
                            type=str,
                            help='Input sample identifier and BAM file URL or local path')
    parser_run.add_argument('--output-dir',
                            default=None,
                            help='Path/URL to output directory')
    parser_run.add_argument('-s', '--suffix',
                            default=None,
                            help='Additional suffix to add to the names of the output files')
    parser_run.add_argument('--preprocess-only',
                            action='store_true',
                            help='Only runs preprocessing steps')

    Job.Runner.addToilOptions(parser_run)
    options = parser.parse_args()

    cwd = os.getcwd()
    # 'generate' triggers both of the following branches
    if options.command in ('generate-config', 'generate'):
        generate_file(os.path.join(cwd, 'config-toil-germline.yaml'), generate_config)
    if options.command in ('generate-manifest', 'generate'):
        generate_file(os.path.join(cwd, 'manifest-toil-germline.tsv'), generate_manifest)
    elif options.command == 'run':
        # Program checks
        for program in ['curl', 'docker']:
            # next() needs a default here: without one, a program that cannot be found raises
            # a bare StopIteration and the message below is never shown.
            require(next(which(program), None),
                    '{} must be installed on every node.'.format(program))

        require(os.path.exists(options.config), '{} not found. Please run "generate-config"'.format(options.config))

        # Read sample manifest
        samples = []
        if options.manifest:
            samples.extend(parse_manifest(options.manifest))

        # Add BAM sample from command line
        if options.sample:
            uuid, url = options.sample
            # samples tuple: (uuid, url, paired_url, rg_line)
            # BAM samples should not have as paired URL or read group line
            samples.append(GermlineSample(uuid, url, None, None))

        require(len(samples) > 0,
                'No samples were detected in the manifest or on the command line')

        # Parse inputs. The file is read inside a with-block so the handle is closed again.
        # NOTE(review): yaml.load can construct arbitrary Python objects; consider
        # yaml.safe_load if the config file may come from an untrusted source.
        with open(options.config) as config_file:
            raw_config = yaml.load(config_file.read())
        inputs = {key.replace('-', '_'): value for key, value in raw_config.items()}

        required_fields = {'genome_fasta',
                           'output_dir',
                           'run_bwa',
                           'sorted',
                           'snp_filter_annotations',
                           'indel_filter_annotations',
                           'preprocess',
                           'preprocess_only',
                           'run_vqsr',
                           'joint_genotype',
                           'run_oncotator',
                           'cores',
                           'file_size',
                           'xmx',
                           'suffix'}

        input_fields = set(inputs.keys())
        # ">=" (superset), not ">" (strict superset): a config holding exactly the required
        # keys is complete and must be accepted.
        require(input_fields >= required_fields,
                'Missing config parameters:\n{}'.format(', '.join(sorted(required_fields - input_fields))))

        # Command line values only fill in what the config file leaves as None
        if inputs['output_dir'] is None:
            inputs['output_dir'] = options.output_dir

        require(inputs['output_dir'] is not None,
                'Missing output directory PATH/URL')

        if inputs['suffix'] is None:
            inputs['suffix'] = options.suffix if options.suffix else ''

        if inputs['preprocess_only'] is None:
            inputs['preprocess_only'] = options.preprocess_only

        if inputs['run_vqsr']:
            # Check that essential VQSR parameters are present
            vqsr_fields = {'g1k_snp', 'mills', 'dbsnp', 'hapmap', 'omni'}
            require(input_fields >= vqsr_fields,
                    'Missing parameters for VQSR:\n{}'.format(', '.join(sorted(vqsr_fields - input_fields))))

        # Check that hard filtering parameters are present. If only running preprocessing steps, then we do
        # not need filtering information.
        elif not inputs['preprocess_only']:
            hard_filter_fields = {'snp_filter_name', 'snp_filter_expression',
                                  'indel_filter_name', 'indel_filter_expression'}
            require(input_fields >= hard_filter_fields,
                    'Missing parameters for hard filtering:\n{}'.format(
                        ', '.join(sorted(hard_filter_fields - input_fields))))

            # Check for falsey hard filtering parameters
            for hard_filter_field in hard_filter_fields:
                require(inputs[hard_filter_field], 'Missing %s value for hard filtering, '
                                                   'got %s.' % (hard_filter_field, inputs[hard_filter_field]))

        # Set resource parameters
        inputs['xmx'] = human2bytes(inputs['xmx'])
        inputs['file_size'] = human2bytes(inputs['file_size'])
        inputs['cores'] = int(inputs['cores'])

        inputs['annotations'] = set(inputs['snp_filter_annotations'] + inputs['indel_filter_annotations'])

        # HaplotypeCaller test data for testing; make sure the key exists even if not configured
        inputs['hc_output'] = inputs.get('hc_output', None)

        # It is a toil-scripts convention to store input parameters in a Namespace object
        config = argparse.Namespace(**inputs)

        root = Job.wrapJobFn(run_gatk_germline_pipeline, samples, config)
        Job.Runner.startToil(root, options)
Example #9
0
    def setOptions(self, options):
        """
        Populates this config object from the options object.

        Every option that exists on ``options`` with a value other than None is parsed,
        validated and stored on self under the same name. Options that are missing or None
        leave the current attribute of self untouched.

        :param options: object exposing the options as attributes
        :raises RuntimeError: if an option fails its validity check, or if stats is set
                together with a clean value other than "never"
        """
        # This import is used to convert from human readable quantities to integers
        from bd2k.util.humanize import human2bytes

        def setOption(varName, parsingFn=None, checkFn=None):
            # If options object has the option "varName" specified
            # then set the "varName" attrib to this value in the config object
            x = getattr(options, varName, None)
            if x is None:
                return
            if parsingFn is not None:
                x = parsingFn(x)
            # checkFn is a predicate. Its result has to be inspected: merely calling it (and
            # waiting for an AssertionError) would let every out-of-range value through.
            if checkFn is not None and not checkFn(x):
                raise RuntimeError("The %s option has an invalid value: %s"
                                   % (varName, x))
            setattr(self, varName, x)

        # Function to parse integer from string expressed in different formats
        h2b = lambda x : human2bytes(str(x))

        def iC(minValue, maxValue=sys.maxint):
            # Returns predicate that tells if a given int is in the given half-open interval
            assert isinstance(minValue, int) and isinstance(maxValue, int)
            return lambda x: minValue <= x < maxValue

        def fC(minValue, maxValue=None):
            # Returns predicate that tells if a given float is in the given half-open interval
            assert isinstance(minValue, float)
            if maxValue is None:
                return lambda x: minValue <= x
            else:
                assert isinstance(maxValue, float)
                return lambda x: minValue <= x < maxValue

        #Core options
        setOption("jobStore",
                  parsingFn=lambda x: os.path.abspath(x) if options.jobStore.startswith('.') else x)
        #TODO: LOG LEVEL STRING
        setOption("workDir")
        setOption("stats")
        setOption("cleanWorkDir")
        setOption("clean")
        if self.stats:
            # Collecting stats needs the job store after the run, so it must never be cleaned
            if self.clean != "never" and self.clean is not None:
                raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                                   "despite the stats flag requiring "
                                   "the jobStore to be intact at the end of the run. "
                                   "Set clean to \'never\'" % self.clean)
            self.clean = "never"
        elif self.clean is None:
            self.clean = "onSuccess"

        #Restarting the workflow options
        setOption("restart")

        #Batch system options
        setOption("batchSystem")
        setOption("scale", float, fC(0.0))
        setOption("mesosMasterAddress")
        setOption("parasolCommand")
        setOption("parasolMaxBatches", int, iC(1))

        setOption("environment", parseSetEnv)

        #Autoscaling options
        setOption("provisioner")
        setOption("preemptableNodeOptions")
        setOption("minPreemptableNodes", int)
        setOption("maxPreemptableNodes", int)
        setOption("nodeOptions")
        setOption("minNodes", int)
        setOption("maxNodes", int)
        setOption("alphaPacking", float)
        setOption("betaInertia", float)
        setOption("scaleInterval", float)

        #Resource requirements
        setOption("defaultMemory", h2b, iC(1))
        setOption("defaultCores", float, fC(1.0))
        setOption("defaultDisk", h2b, iC(1))
        setOption("readGlobalFileMutableByDefault")
        setOption("maxCores", int, iC(1))
        setOption("maxMemory", h2b, iC(1))
        setOption("maxDisk", h2b, iC(1))
        setOption("defaultPreemptable")

        #Retrying/rescuing jobs
        setOption("retryCount", int, iC(0))
        setOption("maxJobDuration", int, iC(1))
        setOption("rescueJobsFrequency", int, iC(1))

        #Misc
        setOption("maxLogFileSize", h2b, iC(1))
        def checkSse(sseKey):
            # A key file is valid if its first line, without trailing whitespace, is 32 characters long
            with open(sseKey) as f:
                return len(f.readline().rstrip()) == 32
        setOption("sseKey", checkFn=checkSse)
        setOption("cseKey", checkFn=checkSse)
        setOption("servicePollingInterval", float, fC(0.0))

        #Debug options
        setOption("badWorker", float, fC(0.0, 1.0))
        setOption("badWorkerFailInterval", float, fC(0.0))
Example #10
0
    def setOptions(self, options):
        """
        Populates this config object from the options object.

        Every option that exists on ``options`` with a value other than None is parsed,
        validated and stored on self under the same name. Options that are missing or None
        leave the current attribute of self untouched.

        :param options: object exposing the options as attributes
        :raises RuntimeError: if an option fails its validity check, or if stats is set
                together with a clean value other than "never"
        """
        # This import is used to convert from human readable quantities to integers
        from bd2k.util.humanize import human2bytes

        def setOption(varName, parsingFn=None, checkFn=None):
            # If options object has the option "varName" specified
            # then set the "varName" attrib to this value in the config object
            x = getattr(options, varName, None)
            if x is None:
                return
            if parsingFn is not None:
                x = parsingFn(x)
            # checkFn is a predicate. Its result has to be inspected: merely calling it (and
            # waiting for an AssertionError) would let every out-of-range value through.
            if checkFn is not None and not checkFn(x):
                raise RuntimeError(
                    "The %s option has an invalid value: %s" %
                    (varName, x))
            setattr(self, varName, x)

        # Function to parse integer from string expressed in different formats
        h2b = lambda x: human2bytes(str(x))

        def iC(minValue, maxValue=sys.maxint):
            # Returns predicate that tells if a given int is in the given half-open interval
            assert isinstance(minValue, int) and isinstance(maxValue, int)
            return lambda x: minValue <= x < maxValue

        def fC(minValue, maxValue=None):
            # Returns predicate that tells if a given float is in the given half-open interval
            assert isinstance(minValue, float)
            if maxValue is None:
                return lambda x: minValue <= x
            else:
                assert isinstance(maxValue, float)
                return lambda x: minValue <= x < maxValue

        def parseJobStore(s):
            name, rest = Toil.parseLocator(s)
            if name == 'file':
                # We need to resolve relative paths early, on the leader, because the worker process
                # may have a different working directory than the leader, e.g. under Mesos.
                return Toil.buildLocator(name, os.path.abspath(rest))
            else:
                return s

        #Core options
        setOption("jobStore", parsingFn=parseJobStore)
        #TODO: LOG LEVEL STRING
        setOption("workDir")
        setOption("stats")
        setOption("cleanWorkDir")
        setOption("clean")
        if self.stats:
            # Collecting stats needs the job store after the run, so it must never be cleaned
            if self.clean != "never" and self.clean is not None:
                raise RuntimeError(
                    "Contradicting options passed: Clean flag is set to %s "
                    "despite the stats flag requiring "
                    "the jobStore to be intact at the end of the run. "
                    "Set clean to \'never\'" % self.clean)
            self.clean = "never"
        elif self.clean is None:
            self.clean = "onSuccess"

        #Restarting the workflow options
        setOption("restart")

        #Batch system options
        setOption("batchSystem")
        setOption("scale", float, fC(0.0))
        setOption("mesosMasterAddress")
        setOption("parasolCommand")
        setOption("parasolMaxBatches", int, iC(1))

        setOption("environment", parseSetEnv)

        #Autoscaling options
        setOption("provisioner")
        setOption("nodeType")
        setOption("nodeOptions")
        setOption("minNodes", int)
        setOption("maxNodes", int)
        setOption("preemptableNodeType")
        setOption("preemptableNodeOptions")
        setOption("minPreemptableNodes", int)
        setOption("maxPreemptableNodes", int)
        setOption("alphaPacking", float)
        setOption("betaInertia", float)
        setOption("scaleInterval", float)

        setOption("preemptableCompensation", float)
        # Checked on self rather than through a checkFn, so the value already stored on self
        # is validated even when the option was not passed
        require(0.0 <= self.preemptableCompensation <= 1.0,
                '--preemptableCompensation (%f) must be >= 0.0 and <= 1.0',
                self.preemptableCompensation)

        # Resource requirements
        setOption("defaultMemory", h2b, iC(1))
        setOption("defaultCores", float, fC(1.0))
        setOption("defaultDisk", h2b, iC(1))
        setOption("readGlobalFileMutableByDefault")
        setOption("maxCores", int, iC(1))
        setOption("maxMemory", h2b, iC(1))
        setOption("maxDisk", h2b, iC(1))
        setOption("defaultPreemptable")

        #Retrying/rescuing jobs
        setOption("retryCount", int, iC(0))
        setOption("maxJobDuration", int, iC(1))
        setOption("rescueJobsFrequency", int, iC(1))

        #Misc
        setOption("disableCaching")
        setOption("maxLogFileSize", h2b, iC(1))

        def checkSse(sseKey):
            # A key file is valid if its first line, without trailing whitespace, is 32 characters long
            with open(sseKey) as f:
                return len(f.readline().rstrip()) == 32

        setOption("sseKey", checkFn=checkSse)
        setOption("cseKey", checkFn=checkSse)
        setOption("servicePollingInterval", float, fC(0.0))

        #Debug options
        setOption("badWorker", float, fC(0.0, 1.0))
        setOption("badWorkerFailInterval", float, fC(0.0))
Example #11
0
# See the License for the specific language governing permissions and
# limitations under the License.
"""A demonstration of toil. Sorts the lines of a file into ascending order by doing a parallel merge sort.
"""
from __future__ import absolute_import
from argparse import ArgumentParser
import os
import random
import logging
import shutil
from bd2k.util.humanize import human2bytes

from toil.job import Job
from toil.test.sort.lib import merge, sort, copySubRangeOfFile, getMidPoint

# Memory size used by the sort, parsed from its human readable form with human2bytes.
# NOTE(review): presumably passed as the memory requirement of the jobs below - confirm.
sortMemory = human2bytes('1000M')


def setup(job, inputFile, N, downCheckpoints):
    """Sets up the sort.
    """
    # insure default resource requirements are being set correctly
    assert job.cores is not None
    assert job.disk is not None
    assert job.preemptable is not None
    # insure user specified resource requirements are being set correctly
    assert job.memory is not None
    #Write the input file to the file store
    inputFileStoreID = job.fileStore.writeGlobalFile(inputFile, True)
    job.fileStore.logToMaster(" Starting the merge sort ")
    job.addFollowOnJobFn(
Example #12
0
    def setOptions(self, options):
        """
        Populates this config object from the options object.

        Every option that exists on ``options`` with a value other than None is parsed,
        validated and stored on self under the same name. Options that are missing or None
        leave the current attribute of self untouched.

        :param options: object exposing the options as attributes
        :raises RuntimeError: if an option fails its validity check, or if stats is set
                while clean is not "never"
        """
        # This import is used to convert from human readable quantities to integers
        from bd2k.util.humanize import human2bytes

        def setOption(varName, parsingFn=None, checkFn=None):
            # If options object has the option "varName" specified
            # then set the "varName" attrib to this value in the config object
            x = getattr(options, varName, None)
            if x is None:
                return
            if parsingFn is not None:
                x = parsingFn(x)
            # checkFn is a predicate; validation must not rely on assert statements, which
            # are stripped when Python runs with -O.
            if checkFn is not None and not checkFn(x):
                raise RuntimeError(
                    "The %s option has an invalid value: %s" %
                    (varName, x))
            setattr(self, varName, x)

        # Function to parse integer from string expressed in different formats
        h2b = lambda x: human2bytes(str(x))

        def iC(minValue, maxValue=sys.maxint):
            # Returns predicate that tells if a parameter is in the half-open interval
            # starting at minValue and ending before maxValue
            return lambda x: minValue <= x < maxValue

        #Core options
        setOption("jobStore",
                  parsingFn=lambda x: os.path.abspath(x)
                  if options.jobStore.startswith('.') else x)
        #TODO: LOG LEVEL STRING
        setOption("workDir")
        setOption("stats")
        setOption("clean")
        if self.stats:
            # Collecting stats needs the job store to be intact after the run
            if self.clean != "never":
                raise RuntimeError(
                    "Contradicting options passed: Clean flag is set to %s "
                    "despite the stats flag requiring "
                    "the jobStore to be intact at the end of the run. "
                    "Set clean to \'never\'" % self.clean)

        #Restarting the workflow options
        setOption("restart")

        #Batch system options
        setOption("batchSystem")
        setOption("scale", float)
        setOption("masterIP")
        setOption("parasolCommand")

        #Resource requirements
        setOption("defaultMemory", h2b, iC(1))
        setOption("defaultCores", h2b, iC(1))
        setOption("defaultDisk", h2b, iC(1))
        setOption("maxCores", h2b, iC(1))
        setOption("maxMemory", h2b, iC(1))
        setOption("maxDisk", h2b, iC(1))

        #Retrying/rescuing jobs
        setOption("retryCount", int, iC(0))
        setOption("maxJobDuration", int, iC(1))
        setOption("rescueJobsFrequency", int, iC(1))

        #Misc
        setOption("maxLogFileSize", h2b, iC(1))

        def checkSse(sseKey):
            # A key file is valid if its first line, without trailing whitespace, is 32 characters long
            with open(sseKey) as f:
                return len(f.readline().rstrip()) == 32

        setOption("sseKey", checkFn=checkSse)
Example #13
0
# See the License for the specific language governing permissions and
# limitations under the License.

"""A demonstration of toil. Sorts the lines of a file into ascending order by doing a parallel merge sort.
"""
from __future__ import absolute_import
from argparse import ArgumentParser
import os
import random
from bd2k.util.humanize import human2bytes

from toil.job import Job
from toil.test.sort.lib import merge, sort, copySubRangeOfFile, getMidPoint

# Threshold that down() compares against random.random()
success_ratio = 0.5
# Memory requirement, in bytes, passed to the follow-on job added by setup()
sortMemory = human2bytes('1000M')

def setup(job, inputFile, N):
    """Sets up the sort.

    Adds a child job running down() over the whole of inputFile (from offset 0 to the size
    of the file) and a follow-on job running cleanup(), which receives the return value of
    that child together with inputFile and requests sortMemory as its memory.
    """
    sortedRv = job.addChildJobFn(down, inputFile, 0, os.path.getsize(inputFile), N).rv()
    job.addFollowOnJobFn(cleanup, sortedRv, inputFile, memory=sortMemory)

def down(job, inputFile, fileStart, fileEnd, N):
    """Input is a file and a range into that file to sort and an output location in which
    to write the sorted file.
    If the range is larger than a threshold N the range is divided recursively and
    a follow on job is then created which merges back the results else
    the file is sorted and placed in the output.
    """
    if random.random() > success_ratio:
Example #14
0
    def setOptions(self, options):
        """
        Creates a config object from the options object.

        Each option that is present on ``options`` with a value other than None is parsed,
        validated and stored on this object under the same attribute name. Options that are
        absent or None leave the corresponding attribute untouched.

        :param options: object whose attributes hold the option values, e.g. an argparse
               namespace
        :raises RuntimeError: if an option value fails its validity check, or if the stats
                flag is combined with a clean mode other than 'never'
        """
        # This import is used to convert from human readable quantities to integers
        from bd2k.util.humanize import human2bytes

        def setOption(varName, parsingFn=None, checkFn=None):
            # If the options object has the option "varName" specified then set the
            # "varName" attribute to this value in the config object
            x = getattr(options, varName, None)
            if x is None:
                return
            if parsingFn is not None:
                x = parsingFn(x)
            # Check functions report validity through their return value. The result must
            # be inspected here because the checkers built by iC/fC never raise on a bad
            # value, they merely return False.
            if checkFn is not None and not checkFn(x):
                raise RuntimeError(
                    "The %s option has an invalid value: %s" % (varName, x))
            setattr(self, varName, x)

        # Function to parse integer from string expressed in different formats
        h2b = lambda x: human2bytes(str(x))

        def iC(minValue, maxValue=sys.maxint):
            # Returns function that checks if a given int is in the given half-open interval
            assert isinstance(minValue, int) and isinstance(maxValue, int)
            return lambda x: minValue <= x < maxValue

        def fC(minValue, maxValue=None):
            # Returns function that checks if a given float is in the given half-open interval
            assert isinstance(minValue, float)
            if maxValue is None:
                return lambda x: minValue <= x
            else:
                assert isinstance(maxValue, float)
                return lambda x: minValue <= x < maxValue

        #Core options
        # Job store locations starting with '.' are turned into absolute paths
        setOption("jobStore",
                  parsingFn=lambda x: os.path.abspath(x) if x.startswith('.') else x)
        #TODO: LOG LEVEL STRING
        setOption("workDir")
        setOption("stats")
        setOption("clean")
        if self.stats:
            # Per the error message below, stats require the job store to be intact at the
            # end of the run, so any explicit clean mode other than 'never' is contradictory.
            if self.clean != "never" and self.clean is not None:
                raise RuntimeError(
                    "Contradicting options passed: Clean flag is set to %s "
                    "despite the stats flag requiring "
                    "the jobStore to be intact at the end of the run. "
                    "Set clean to \'never\'" % self.clean)
            self.clean = "never"
        elif self.clean is None:
            self.clean = "onSuccess"

        #Restarting the workflow options
        setOption("restart")

        #Batch system options
        setOption("batchSystem")
        setOption("scale", float, fC(0.0))
        setOption("mesosMasterAddress")
        setOption("parasolCommand")
        setOption("parasolMaxBatches", int, iC(1))

        setOption("environment", parseSetEnv)

        #Resource requirements
        setOption("defaultMemory", h2b, iC(1))
        setOption("defaultCores", float, fC(1.0))
        setOption("defaultDisk", h2b, iC(1))
        setOption("defaultCache", h2b, iC(0))
        setOption("maxCores", int, iC(1))
        setOption("maxMemory", h2b, iC(1))
        setOption("maxDisk", h2b, iC(1))

        #Retrying/rescuing jobs
        setOption("retryCount", int, iC(0))
        setOption("maxJobDuration", int, iC(1))
        setOption("rescueJobsFrequency", int, iC(1))

        #Misc
        setOption("maxLogFileSize", h2b, iC(1))

        def checkSse(sseKey):
            # A key file is accepted if its first line, stripped of trailing whitespace,
            # is exactly 32 characters long
            with open(sseKey) as f:
                return len(f.readline().rstrip()) == 32

        setOption("sseKey", checkFn=checkSse)
        setOption("cseKey", checkFn=checkSse)

        #Debug options
        setOption("badWorker", float, fC(0.0, 1.0))
        setOption("badWorkerFailInterval", float, fC(0.0))
Example #15
0
 def setOptions(self, options):
     """
     Creates a config object from the options object.

     Each option that is present on ``options`` with a value other than None is parsed,
     validated and stored on this object under the same attribute name. Options that are
     absent or None leave the corresponding attribute untouched.

     :param options: object whose attributes hold the option values, e.g. an argparse
            namespace
     :raises RuntimeError: if an option value fails its validity check, or if the stats
             flag is combined with a clean mode other than 'never'
     """
     #This import is used to convert from human readable quantities to integers
     from bd2k.util.humanize import human2bytes
     def setOption(varName, parsingFn=None, checkFn=None):
         #If options object has the option "varName" specified
         #then set the "varName" attrib to this value in the config object
         x = getattr(options, varName, None)
         if x is None:
             return
         if parsingFn is not None:
             x = parsingFn(x)
         #Validity is reported by return value rather than by assert, so the
         #checks still run when Python is started with -O
         if checkFn is not None and not checkFn(x):
             raise RuntimeError("The %s option has an invalid value: %s"
                                % (varName, x))
         setattr(self, varName, x)

     h2b = lambda x : human2bytes(str(x)) #Function to parse integer from string expressed in different formats

     def iC(minValue, maxValue=sys.maxint):
         #Returns function that checks a parameter is in the half-open range
         #from minValue (inclusive) to maxValue (exclusive)
         return lambda x: minValue <= x < maxValue

     #Core options
     #Job store locations starting with '.' are turned into absolute paths
     setOption("jobStore", parsingFn=lambda x : os.path.abspath(x)
               if x.startswith('.') else x)
     #TODO: LOG LEVEL STRING
     setOption("workDir")
     setOption("stats")
     setOption("clean")
     if self.stats:
         #Per the error message below, stats require the job store to be intact at the
         #end of the run, so an explicit clean mode other than 'never' is contradictory
         if self.clean != "never" and self.clean is not None:
             raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                                "despite the stats flag requiring "
                                "the jobStore to be intact at the end of the run. "
                                "Set clean to \'never\'" % self.clean)
         self.clean = "never"
     elif self.clean is None:
         self.clean = "onSuccess"

     #Restarting the workflow options
     setOption("restart")

     #Batch system options
     setOption("batchSystem")
     setOption("scale", float)
     setOption("masterIP")
     setOption("parasolCommand")

     #Resource requirements
     setOption("defaultMemory", h2b, iC(1))
     setOption("defaultCores", h2b, iC(1))
     setOption("defaultDisk", h2b, iC(1))
     setOption("maxCores", h2b, iC(1))
     setOption("maxMemory", h2b, iC(1))
     setOption("maxDisk", h2b, iC(1))

     #Retrying/rescuing jobs
     setOption("retryCount", int, iC(0))
     setOption("maxJobDuration", int, iC(1))
     setOption("rescueJobsFrequency", int, iC(1))

     #Misc
     setOption("maxLogFileSize", h2b, iC(1))
     def checkSse(sseKey):
         #A key file is accepted if its first line, stripped of trailing whitespace,
         #is exactly 32 characters long
         with open(sseKey) as f:
             return len(f.readline().rstrip()) == 32
     setOption("sseKey", checkFn=checkSse)
     setOption("cseKey", checkFn=checkSse)
Example #16
0
    def setOptions(self, options):
        """
        Creates a config object from the options object.

        Each option that is present on ``options`` with a value other than None is parsed,
        validated and stored on this object under the same attribute name. Options that are
        absent or None leave the corresponding attribute untouched.

        :param options: object whose attributes hold the option values, e.g. an argparse
               namespace
        :raises RuntimeError: if an option value fails its validity check, if the work
                directory does not exist, or if the stats flag is combined with a clean
                mode other than 'never'
        """
        #This import is used to convert from human readable quantities to integers
        from bd2k.util.humanize import human2bytes
        def setOption(varName, parsingFn=None, checkFn=None):
            #If options object has the option "varName" specified
            #then set the "varName" attrib to this value in the config object
            x = getattr(options, varName, None)
            if x is None:
                return
            if parsingFn is not None:
                x = parsingFn(x)
            # Check functions report validity through their return value. The result must
            # be inspected here because the checkers built by iC/fC never raise on a bad
            # value, they merely return False.
            if checkFn is not None and not checkFn(x):
                raise RuntimeError("The %s option has an invalid value: %s"
                                   % (varName, x))
            setattr(self, varName, x)

        # Function to parse integer from string expressed in different formats
        h2b = lambda x : human2bytes(str(x))

        def iC(minValue, maxValue=sys.maxint):
            # Returns function that checks if a given int is in the given half-open interval
            assert isinstance(minValue, int) and isinstance(maxValue, int)
            return lambda x: minValue <= x < maxValue

        def fC(minValue, maxValue=None):
            # Returns function that checks if a given float is in the given half-open interval
            assert isinstance(minValue, float)
            if maxValue is None:
                return lambda x: minValue <= x
            else:
                assert isinstance(maxValue, float)
                return lambda x: minValue <= x < maxValue

        def parseJobStore(s):
            name, rest = Toil.parseLocator(s)
            if name == 'file':
                # We need to resolve relative paths early, on the leader, because the worker process
                # may have a different working directory than the leader, e.g. under Mesos.
                return Toil.buildLocator(name, os.path.abspath(rest))
            else:
                return s

        #Core options
        setOption("jobStore", parsingFn=parseJobStore)
        #TODO: LOG LEVEL STRING
        setOption("workDir")
        if self.workDir is not None:
            self.workDir = os.path.abspath(self.workDir)
            if not os.path.exists(self.workDir):
                raise RuntimeError("The path provided to --workDir (%s) does not exist."
                                   % self.workDir)
        setOption("stats")
        setOption("cleanWorkDir")
        setOption("clean")
        if self.stats:
            # Per the error message below, stats require the job store to be intact at the
            # end of the run, so any explicit clean mode other than 'never' is contradictory.
            if self.clean != "never" and self.clean is not None:
                raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                                   "despite the stats flag requiring "
                                   "the jobStore to be intact at the end of the run. "
                                   "Set clean to \'never\'" % self.clean)
            self.clean = "never"
        elif self.clean is None:
            self.clean = "onSuccess"

        #Restarting the workflow options
        setOption("restart")

        #Batch system options
        setOption("batchSystem")
        setOption("scale", float, fC(0.0))
        setOption("mesosMasterAddress")
        setOption("parasolCommand")
        setOption("parasolMaxBatches", int, iC(1))

        setOption("environment", parseSetEnv)

        #Autoscaling options
        setOption("provisioner")
        setOption("nodeType")
        setOption("nodeOptions")
        setOption("minNodes", int)
        setOption("maxNodes", int)
        setOption("preemptableNodeType")
        setOption("preemptableNodeOptions")
        setOption("minPreemptableNodes", int)
        setOption("maxPreemptableNodes", int)
        setOption("alphaPacking", float)
        setOption("betaInertia", float)
        setOption("scaleInterval", float)

        setOption("preemptableCompensation", float)
        # Checked on self rather than through a checkFn, so the value already stored on
        # this object is validated even when the option was not passed.
        require(0.0 <= self.preemptableCompensation <= 1.0,
                '--preemptableCompensation (%f) must be >= 0.0 and <= 1.0',
                self.preemptableCompensation)

        # Parameters to limit service jobs / detect deadlocks
        setOption("maxServiceJobs", int)
        setOption("maxPreemptableServiceJobs", int)
        setOption("deadlockWait", int)

        # Resource requirements
        setOption("defaultMemory", h2b, iC(1))
        setOption("defaultCores", float, fC(1.0))
        setOption("defaultDisk", h2b, iC(1))
        setOption("readGlobalFileMutableByDefault")
        setOption("maxCores", int, iC(1))
        setOption("maxMemory", h2b, iC(1))
        setOption("maxDisk", h2b, iC(1))
        setOption("defaultPreemptable")

        #Retrying/rescuing jobs
        setOption("retryCount", int, iC(0))
        setOption("maxJobDuration", int, iC(1))
        setOption("rescueJobsFrequency", int, iC(1))

        #Misc
        setOption("disableCaching")
        setOption("maxLogFileSize", h2b, iC(1))
        def checkSse(sseKey):
            # A key file is accepted if its first line, stripped of trailing whitespace,
            # is exactly 32 characters long
            with open(sseKey) as f:
                return len(f.readline().rstrip()) == 32
        setOption("sseKey", checkFn=checkSse)
        setOption("cseKey", checkFn=checkSse)
        setOption("servicePollingInterval", float, fC(0.0))

        #Debug options
        setOption("badWorker", float, fC(0.0, 1.0))
        setOption("badWorkerFailInterval", float, fC(0.0))
Example #17
0
def main():
    """
    GATK germline pipeline with variant filtering and annotation.

    Command line entry point. The 'generate-config', 'generate-manifest' and 'generate'
    sub-commands write template files into the current working directory. The 'run'
    sub-command checks that the required programs are available, collects samples from
    the manifest and/or the --sample argument, loads and validates the YAML config and
    starts the Toil workflow.
    """
    # Define Parser object and add to jobTree
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawTextHelpFormatter)

    # Generate subparsers
    subparsers = parser.add_subparsers(dest='command')
    subparsers.add_parser(
        'generate-config',
        help='Generates an editable config in the current working directory.')
    subparsers.add_parser(
        'generate-manifest',
        help='Generates an editable manifest in the current working directory.'
    )
    subparsers.add_parser(
        'generate',
        help='Generates a config and manifest in the current working directory.'
    )

    # Run subparser
    parser_run = subparsers.add_parser('run',
                                       help='Runs the GATK germline pipeline')
    parser_run.add_argument(
        '--config',
        required=True,
        type=str,
        help='Path to the (filled in) config file, generated with '
        '"generate-config".')
    parser_run.add_argument(
        '--manifest',
        type=str,
        help='Path to the (filled in) manifest file, generated with '
        '"generate-manifest".\nDefault value: "%(default)s".')
    parser_run.add_argument(
        '--sample',
        default=None,
        nargs=2,
        type=str,
        help='Input sample identifier and BAM file URL or local path')
    parser_run.add_argument('--output-dir',
                            default=None,
                            help='Path/URL to output directory')
    parser_run.add_argument(
        '-s',
        '--suffix',
        default=None,
        help='Additional suffix to add to the names of the output files')
    parser_run.add_argument('--preprocess-only',
                            action='store_true',
                            help='Only runs preprocessing steps')

    Job.Runner.addToilOptions(parser_run)
    options = parser.parse_args()

    cwd = os.getcwd()
    # 'generate' triggers both of the generate-* actions below
    if options.command in ('generate-config', 'generate'):
        generate_file(os.path.join(cwd, 'config-toil-germline.yaml'),
                      generate_config)
    if options.command in ('generate-manifest', 'generate'):
        generate_file(os.path.join(cwd, 'manifest-toil-germline.tsv'),
                      generate_manifest)
    elif options.command == 'run':
        # Program checks
        for program in ['curl', 'docker']:
            # Supply a default to next() so that a program that cannot be found yields the
            # message below instead of an uncaught StopIteration
            require(
                next(which(program), None),
                '{} must be installed on every node.'.format(program))

        require(
            os.path.exists(options.config),
            '{} not found. Please run "generate-config"'.format(
                options.config))

        # Read sample manifest
        samples = []
        if options.manifest:
            samples.extend(parse_manifest(options.manifest))

        # Add BAM sample from command line
        if options.sample:
            uuid, url = options.sample
            # samples tuple: (uuid, url, paired_url, rg_line)
            # BAM samples should not have as paired URL or read group line
            samples.append(GermlineSample(uuid, url, None, None))

        require(
            len(samples) > 0,
            'No samples were detected in the manifest or on the command line')

        # Parse inputs, replacing '-' with '_' in the config keys
        # NOTE(review): yaml.load can construct arbitrary objects; consider yaml.safe_load
        # if the config file may come from an untrusted source.
        with open(options.config) as config_file:
            raw_config = yaml.load(config_file.read())
        inputs = {
            key.replace('-', '_'): value
            for key, value in raw_config.items()
        }

        required_fields = {
            'genome_fasta', 'output_dir', 'run_bwa', 'sorted',
            'snp_filter_annotations', 'indel_filter_annotations', 'preprocess',
            'preprocess_only', 'run_vqsr', 'joint_genotype', 'run_oncotator',
            'cores', 'file_size', 'xmx', 'suffix'
        }

        input_fields = set(inputs.keys())
        # Non-strict superset test: a config holding exactly the required fields is valid
        require(
            input_fields >= required_fields,
            'Missing config parameters:\n{}'.format(', '.join(required_fields -
                                                              input_fields)))

        # Values left empty in the config fall back to the command line arguments
        if inputs['output_dir'] is None:
            inputs['output_dir'] = options.output_dir

        require(inputs['output_dir'] is not None,
                'Missing output directory PATH/URL')

        if inputs['suffix'] is None:
            inputs['suffix'] = options.suffix if options.suffix else ''

        if inputs['preprocess_only'] is None:
            inputs['preprocess_only'] = options.preprocess_only

        if inputs['run_vqsr']:
            # Check that essential VQSR parameters are present
            vqsr_fields = {'g1k_snp', 'mills', 'dbsnp', 'hapmap', 'omni'}
            require(
                input_fields >= vqsr_fields,
                'Missing parameters for VQSR:\n{}'.format(
                    ', '.join(vqsr_fields - input_fields)))

        # Check that hard filtering parameters are present. If only running preprocessing steps, then we do
        # not need filtering information.
        elif not inputs['preprocess_only']:
            hard_filter_fields = {
                'snp_filter_name', 'snp_filter_expression',
                'indel_filter_name', 'indel_filter_expression'
            }
            require(
                input_fields >= hard_filter_fields,
                'Missing parameters for hard filtering:\n{}'.format(
                    ', '.join(hard_filter_fields - input_fields)))

            # Check for falsey hard filtering parameters
            for hard_filter_field in hard_filter_fields:
                require(
                    inputs[hard_filter_field],
                    'Missing %s value for hard filtering, '
                    'got %s.' % (hard_filter_field, inputs[hard_filter_field]))

        # Set resource parameters
        inputs['xmx'] = human2bytes(inputs['xmx'])
        inputs['file_size'] = human2bytes(inputs['file_size'])
        inputs['cores'] = int(inputs['cores'])

        inputs['annotations'] = set(inputs['snp_filter_annotations'] +
                                    inputs['indel_filter_annotations'])

        # HaplotypeCaller test data for testing
        inputs['hc_output'] = inputs.get('hc_output', None)

        # It is a toil-scripts convention to store input parameters in a Namespace object
        config = argparse.Namespace(**inputs)

        root = Job.wrapJobFn(run_gatk_germline_pipeline, samples, config)
        Job.Runner.startToil(root, options)