Example #1
def formatPairs(sample_pairs, work_mount):
    r1, r2 = [], []

    print('sample pairs:{}'.format(sample_pairs))
    log.info('sample pairs:{}'.format(sample_pairs))
    fastqs = sample_pairs.split(',')
    # Pattern convention: Look for "R1" / "R2" in the filename, or "_1" / "_2" before the extension
    pattern = re.compile(r'(?:^|[._-])(R[12]|[12]\.f)')
    for fastq in sorted(fastqs):
        match = pattern.search(os.path.basename(fastq))
        fastq = fileURL(fastq)
        if not match:
            log.info(
                'FASTQ file name fails to meet required convention for paired reads '
                '(see documentation). ' + fastq)
            exit(1)
        elif '1' in match.group():
            r1.append(fastq)
        elif '2' in match.group():
            r2.append(fastq)
        else:
            assert False, match.group()
    require(
        len(r1) == len(r2),
        'Check fastq names, uneven number of pairs found.\nr1: {}\nr2: {}'.
        format(r1, r2))
    interleaved_samples = zip(r1, r2)
    # flatten the list of tuples and join them into a comma delimited string
    # https://stackoverflow.com/questions/40993966/python-convert-tuple-to-comma-separated-string
    comma_delimited_samples = ','.join(
        map(str, chain.from_iterable(interleaved_samples)))
    log.info('comma delimited samples:{}'.format(comma_delimited_samples))
    return comma_delimited_samples
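The pairing convention enforced above can be illustrated with a minimal sketch; the file names below are hypothetical and only show which forms the pattern accepts or rejects.

import re

# Minimal sketch of the pairing convention enforced by formatPairs();
# the file names are hypothetical examples.
pattern = re.compile(r'(?:^|[._-])(R[12]|[12]\.f)')
for name in ('sampleA_R1.fastq.gz', 'sampleA_R2.fastq.gz',
             'sampleB.1.fastq', 'sampleB.2.fastq',
             'sampleC.fastq'):  # no pair marker, so it is rejected
    match = pattern.search(name)
    print('{} -> {}'.format(name, match.group(1) if match else 'no match'))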
Example #2
    def __init__(self, provisioner, jobBatcher, config):
        """
        Class manages automatically scaling the number of worker nodes.

        :param AbstractProvisioner provisioner: Provisioner instance to scale.

        :param JobBatcher jobBatcher: The class issuing jobs to the batch system. This is
               monitored to make scaling decisions.

        :param Config config: Config object from which to draw parameters.
        """
        self.provisioner = provisioner
        self.jobBatcher = jobBatcher
        self.config = config
        # Indicates that the scaling threads should shut down
        self.stop = False

        assert config.maxPreemptableNodes >= 0 and config.maxNodes >= 0
        require(config.maxPreemptableNodes + config.maxNodes > 0,
                'Either --maxNodes or --maxPreemptableNodes must be non-zero.')

        if config.maxPreemptableNodes > 0:
            self.preemptableScaler = ScalerThread(self, preemptable=True)
            self.preemptableScaler.start()
        else:
            self.preemptableScaler = None

        if config.maxNodes > 0:
            self.scaler = ScalerThread(self, preemptable=False)
            self.scaler.start()
        else:
            self.scaler = None
Example #3
    def __init__(self, provisioner, leader, config):
        """
        Class manages automatically scaling the number of worker nodes.
        :param AbstractProvisioner provisioner: Provisioner instance to scale.
        :param toil.leader.Leader leader: The leader, which is monitored to make scaling decisions.
        :param Config config: Config object from which to draw parameters.
        """
        self.provisioner = provisioner
        self.leader = leader
        self.config = config
        # Indicates that the scaling threads should shut down
        self.stop = False

        #Dictionary of job names to their average runtime, used to estimate wall time
        #of queued jobs for bin-packing
        self.jobNameToAvgRuntime = {}
        self.jobNameToNumCompleted = {}
        self.totalAvgRuntime = 0.0
        self.totalJobsCompleted = 0

        require(
            sum(config.maxNodes) > 0,
            'Not configured to create nodes of any type.')

        self.scaler = ScalerThread(scaler=self)
Example #4
    def __init__(self, provisioner, jobBatcher, config):
        """
        Class manages automatically scaling the number of worker nodes.

        :param AbstractProvisioner provisioner: Provisioner instance to scale.

        :param JobBatcher jobBatcher: The class issuing jobs to the batch system. This is
               monitored to make scaling decisions.

        :param Config config: Config object from which to draw parameters.
        """
        self.provisioner = provisioner
        self.jobBatcher = jobBatcher
        self.config = config
        # Indicates that the scaling threads should shut down
        self.stop = False

        assert config.maxPreemptableNodes >= 0 and config.maxNodes >= 0
        require(config.maxPreemptableNodes + config.maxNodes > 0,
                'Either --maxNodes or --maxPreemptableNodes must be non-zero.')

        if config.maxPreemptableNodes > 0:
            self.preemptableScaler = ScalerThread(self, preemptable=True)
            self.preemptableScaler.start()
        else:
            self.preemptableScaler = None

        if config.maxNodes > 0:
            self.scaler = ScalerThread(self, preemptable=False)
            self.scaler.start()
        else:
            self.scaler = None
Example #5
 def _requireEphemeralDrives(self, workerType):
     require(
         workerType.disks > 0,
         "This provisioner only supports instance types with one or more ephemeral "
         "volumes. The requested type '%s' does not have any.",
         workerType.name)
     leaderType = self._resolveInstanceType(self._instance.instance_type)
     require(
         workerType.disks == leaderType.disks,
         'The instance type selected for worker nodes (%s) offers %i ephemeral volumes but '
         'this type of leader (%s) has %i. The number of drives must match between leader '
         'and worker nodes. Please specify a different worker node type or use a different '
         'leader.', workerType.name, workerType.disks, leaderType.name,
         leaderType.disks)
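Every example on this page leans on a small require() helper. A minimal sketch of such a helper, assuming a UserError-style exception, is shown below; the actual toil/toil_lib implementation may differ in detail.

class UserError(Exception):
    # Raised for errors caused by bad user input rather than by bugs.
    pass


def require(expression, message, *message_args):
    # Raise a UserError with the (optionally %-formatted) message when the
    # expression is falsy; otherwise do nothing.
    if not expression:
        raise UserError(message % message_args if message_args else message)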
Example #6
    def forModule(cls, name):
        """
        Return an instance of this class representing the module of the given name. If the given
        module name is "__main__", it will be translated to the actual file name of the top-level
        script without the .py or .pyc extension. This method assumes that the module with the
        specified name has already been loaded.
        """
        module = sys.modules[name]
        filePath = os.path.abspath(module.__file__)
        filePath = filePath.split(os.path.sep)
        filePath[-1], extension = os.path.splitext(filePath[-1])
        require(extension in ('.py', '.pyc'),
                'The name of a user script/module must end in .py or .pyc.')
        if name == '__main__':
            log.debug("Discovering real name of module")
            # User script/module was invoked as the main program
            if module.__package__:
                # Invoked as a module via python -m foo.bar
                log.debug("Script was invoked as a module")
                name = [filePath.pop()]
                for package in reversed(module.__package__.split('.')):
                    dirPathTail = filePath.pop()
                    assert dirPathTail == package
                    name.append(dirPathTail)
                name = '.'.join(reversed(name))
                dirPath = os.path.sep.join(filePath)
            else:
                # Invoked as a script via python foo/bar.py
                name = filePath.pop()
                dirPath = os.path.sep.join(filePath)
                cls._check_conflict(dirPath, name)
        else:
            # User module was imported. Determine the directory containing the top-level package
            if filePath[-1] == '__init__':
                # module is a subpackage
                filePath.pop()

            for package in reversed(name.split('.')):
                dirPathTail = filePath.pop()
                assert dirPathTail == package
            dirPath = os.path.sep.join(filePath)
        log.debug("Module dir is %s", dirPath)
        require(
            os.path.isdir(dirPath),
            'Bad directory path %s for module %s. Note that hot-deployment does not support '
            '.egg-link files yet, or scripts located in the root directory.',
            dirPath, name)
        fromVirtualEnv = inVirtualEnv() and dirPath.startswith(sys.prefix)
        return cls(dirPath=dirPath, name=name, fromVirtualEnv=fromVirtualEnv)
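The path bookkeeping in forModule() can be traced with a small, hypothetical example (POSIX paths assumed): for an imported module 'foo.bar' loaded from /home/user/src/foo/bar.py, popping one path component per package level leaves the directory that contains the top-level package.

import os

# Hypothetical walk-through of the directory derivation in forModule().
filePath = '/home/user/src/foo/bar.py'.split(os.path.sep)
filePath[-1], extension = os.path.splitext(filePath[-1])  # strip '.py'
for package in reversed('foo.bar'.split('.')):
    assert filePath.pop() == package  # pops 'bar', then 'foo'
print(os.path.sep.join(filePath))  # -> /home/user/src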
Example #7
    def forModule(cls, name):
        """
        Return an instance of this class representing the module of the given name. If the given
        module name is "__main__", it will be translated to the actual file name of the top-level
        script without the .py or .pyc extension. This method assumes that the module with the
        specified name has already been loaded.
        """
        module = sys.modules[name]
        filePath = os.path.abspath(module.__file__)
        filePath = filePath.split(os.path.sep)
        filePath[-1], extension = os.path.splitext(filePath[-1])
        require(extension in ('.py', '.pyc'),
                'The name of a user script/module must end in .py or .pyc.')
        if name == '__main__':
            log.debug("Discovering real name of module")
            # User script/module was invoked as the main program
            if module.__package__:
                # Invoked as a module via python -m foo.bar
                log.debug("Script was invoked as a module")
                name = [filePath.pop()]
                for package in reversed(module.__package__.split('.')):
                    dirPathTail = filePath.pop()
                    assert dirPathTail == package
                    name.append(dirPathTail)
                name = '.'.join(reversed(name))
                dirPath = os.path.sep.join(filePath)
            else:
                # Invoked as a script via python foo/bar.py
                name = filePath.pop()
                dirPath = os.path.sep.join(filePath)
                cls._check_conflict(dirPath, name)
        else:
            # User module was imported. Determine the directory containing the top-level package
            if filePath[-1] == '__init__':
                # module is a subpackage
                filePath.pop()

            for package in reversed(name.split('.')):
                dirPathTail = filePath.pop()
                assert dirPathTail == package
            dirPath = os.path.sep.join(filePath)
        log.debug("Module dir is %s", dirPath)
        require(os.path.isdir(dirPath),
                'Bad directory path %s for module %s. Note that hot-deployment does not support '
                '.egg-link files yet, or scripts located in the root directory.', dirPath, name)
        fromVirtualEnv = inVirtualEnv() and dirPath.startswith(sys.prefix)
        return cls(dirPath=dirPath, name=name, fromVirtualEnv=fromVirtualEnv)
Example #8
    def __init__(self, provisioner, leader, config):
        """
        Class manages automatically scaling the number of worker nodes.
        :param AbstractProvisioner provisioner: Provisioner instance to scale.
        :param toil.leader.Leader leader: The leader, which is monitored to make scaling decisions.
        :param Config config: Config object from which to draw parameters.
        """
        self.provisioner = provisioner
        self.leader = leader
        self.config = config
        # Indicates that the scaling threads should shut down
        self.stop = False

        assert config.maxPreemptableNodes >= 0 and config.maxNodes >= 0
        require(config.maxPreemptableNodes + config.maxNodes > 0,
                'Either --maxNodes or --maxPreemptableNodes must be non-zero.')
        
        self.preemptableScaler = ScalerThread(self, preemptable=True) if self.config.maxPreemptableNodes > 0 else None

        self.scaler = ScalerThread(self, preemptable=False) if self.config.maxNodes > 0 else None
Example #9
 def forModule(cls, name):
     """
     Return an instance of this class representing the module of the given name. If the given
     module name is "__main__", it will be translated to the actual file name of the top-level
     script without the .py or .pyc extension. This method assumes that the module with the
     specified name has already been loaded.
     """
     module = sys.modules[name]
     filePath = os.path.abspath(module.__file__)
     filePath = filePath.split(os.path.sep)
     filePath[-1], extension = os.path.splitext(filePath[-1])
     require(extension in ('.py', '.pyc'),
             'The name of a user script/module must end in .py or .pyc.')
     if name == '__main__':
         # User script/module was invoked as the main program
         if module.__package__:
             # Invoked as a module via python -m foo.bar
             name = [filePath.pop()]
             for package in reversed(module.__package__.split('.')):
                 dirPathTail = filePath.pop()
                 assert dirPathTail == package
                 name.append(dirPathTail)
             name = '.'.join(reversed(name))
             dirPath = os.path.sep.join(filePath)
         else:
             # Invoked as a script via python foo/bar.py
             name = filePath.pop()
             dirPath = os.path.sep.join(filePath)
             cls._check_conflict(dirPath, name)
     else:
         # User module was imported. Determine the directory containing the top-level package
         for package in reversed(name.split('.')):
             dirPathTail = filePath.pop()
             assert dirPathTail == package
         dirPath = os.path.sep.join(filePath)
     assert os.path.isdir(dirPath)
     return cls(dirPath=dirPath, name=name)
Example #10
 def __init__(self, config, batchSystem):
     """
     :type config: Config
     :type batchSystem: AbstractBatchSystem
     """
     super(CGCloudProvisioner, self).__init__(config, batchSystem)
     self.batchSystem = batchSystem
     self.imageId = self._instance.image_id
     require(config.nodeType,
             'Must pass --nodeType when using the cgcloud provisioner')
     instanceType = self._resolveInstanceType(config.nodeType)
     self._requireEphemeralDrives(instanceType)
     if config.preemptableNodeType:
         try:
             preemptableInstanceType, spotBid = config.preemptableNodeType.split(
                 ':')
         except ValueError:
             raise ValueError(
                 "Preemptible node type '%s' is not valid for this provisioner. "
                 "Use format INSTANCE_TYPE:SPOT_BID, e.g. m3.large:0.10 instead"
                 % config.preemptableNodeType)
         preemptableInstanceType = self._resolveInstanceType(
             preemptableInstanceType)
         self._requireEphemeralDrives(preemptableInstanceType)
         try:
             self.spotBid = float(spotBid)
         except ValueError:
             raise ValueError(
                 "The spot bid '%s' is not valid. Use a floating point dollar "
                 "amount such as '0.42' instead." % spotBid)
     else:
         preemptableInstanceType, self.spotBid = None, None
     self.instanceType = {
         False: instanceType,
         True: preemptableInstanceType
     }
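The INSTANCE_TYPE:SPOT_BID convention parsed above can be illustrated with the hypothetical value from the error message; both parts are placeholders.

# Hypothetical illustration of the preemptableNodeType format.
preemptableNodeType = 'm3.large:0.10'
instanceType, spotBid = preemptableNodeType.split(':')
print('{} {}'.format(instanceType, float(spotBid)))  # -> m3.large 0.1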
Example #11
 def resume(self):
     if not os.path.exists(self.jobStoreDir):
         raise NoSuchJobStoreException(self.jobStoreDir)
     require(os.path.isdir(self.jobStoreDir), "'%s' is not a directory", self.jobStoreDir)
     logger.debug("Resuming...")
Example #12
 def resume(self):
     if not os.path.exists(self.jobStoreDir):
         raise NoSuchJobStoreException(self.jobStoreDir)
     require(os.path.isdir(self.jobStoreDir), "'%s' is not a directory", self.jobStoreDir)
     super(FileJobStore, self).resume()
Example #13
    def setOptions(self, options):
        """
        Creates a config object from the options object.
        """
        from bd2k.util.humanize import human2bytes  # This import is used to convert
        # from human-readable quantities to integers

        def setOption(varName, parsingFn=None, checkFn=None):
            #If options object has the option "varName" specified
            #then set the "varName" attrib to this value in the config object
            x = getattr(options, varName, None)
            if x is not None:
                if parsingFn is not None:
                    x = parsingFn(x)
                if checkFn is not None:
                    try:
                        checkFn(x)
                    except AssertionError:
                        raise RuntimeError(
                            "The %s option has an invalid value: %s" %
                            (varName, x))
                setattr(self, varName, x)

        # Function to parse integer from string expressed in different formats
        h2b = lambda x: human2bytes(str(x))

        def iC(minValue, maxValue=sys.maxint):
            # Returns function that checks if a given int is in the given half-open interval
            assert isinstance(minValue, int) and isinstance(maxValue, int)
            return lambda x: minValue <= x < maxValue

        def fC(minValue, maxValue=None):
            # Returns function that checks if a given float is in the given half-open interval
            assert isinstance(minValue, float)
            if maxValue is None:
                return lambda x: minValue <= x
            else:
                assert isinstance(maxValue, float)
                return lambda x: minValue <= x < maxValue

        def parseJobStore(s):
            name, rest = Toil.parseLocator(s)
            if name == 'file':
                # We need to resolve relative paths early, on the leader, because the worker process
                # may have a different working directory than the leader, e.g. under Mesos.
                return Toil.buildLocator(name, os.path.abspath(rest))
            else:
                return s

        #Core options
        setOption("jobStore", parsingFn=parseJobStore)
        #TODO: LOG LEVEL STRING
        setOption("workDir")
        setOption("stats")
        setOption("cleanWorkDir")
        setOption("clean")
        if self.stats:
            if self.clean != "never" and self.clean is not None:
                raise RuntimeError(
                    "Contradicting options passed: Clean flag is set to %s "
                    "despite the stats flag requiring "
                    "the jobStore to be intact at the end of the run. "
                    "Set clean to \'never\'" % self.clean)
            self.clean = "never"
        elif self.clean is None:
            self.clean = "onSuccess"

        #Restarting the workflow options
        setOption("restart")

        #Batch system options
        setOption("batchSystem")
        setOption("scale", float, fC(0.0))
        setOption("mesosMasterAddress")
        setOption("parasolCommand")
        setOption("parasolMaxBatches", int, iC(1))

        setOption("environment", parseSetEnv)

        #Autoscaling options
        setOption("provisioner")
        setOption("nodeType")
        setOption("nodeOptions")
        setOption("minNodes", int)
        setOption("maxNodes", int)
        setOption("preemptableNodeType")
        setOption("preemptableNodeOptions")
        setOption("minPreemptableNodes", int)
        setOption("maxPreemptableNodes", int)
        setOption("alphaPacking", float)
        setOption("betaInertia", float)
        setOption("scaleInterval", float)

        setOption("preemptableCompensation", float)
        require(0.0 <= self.preemptableCompensation <= 1.0,
                '--preemptableCompensation (%f) must be >= 0.0 and <= 1.0',
                self.preemptableCompensation)

        # Resource requirements
        setOption("defaultMemory", h2b, iC(1))
        setOption("defaultCores", float, fC(1.0))
        setOption("defaultDisk", h2b, iC(1))
        setOption("readGlobalFileMutableByDefault")
        setOption("maxCores", int, iC(1))
        setOption("maxMemory", h2b, iC(1))
        setOption("maxDisk", h2b, iC(1))
        setOption("defaultPreemptable")

        #Retrying/rescuing jobs
        setOption("retryCount", int, iC(0))
        setOption("maxJobDuration", int, iC(1))
        setOption("rescueJobsFrequency", int, iC(1))

        #Misc
        setOption("disableCaching")
        setOption("maxLogFileSize", h2b, iC(1))

        def checkSse(sseKey):
            with open(sseKey) as f:
                assert (len(f.readline().rstrip()) == 32)

        setOption("sseKey", checkFn=checkSse)
        setOption("cseKey", checkFn=checkSse)
        setOption("servicePollingInterval", float, fC(0.0))

        #Debug options
        setOption("badWorker", float, fC(0.0, 1.0))
        setOption("badWorkerFailInterval", float, fC(0.0))
Example #14
def docker_call(job,
                tool,
                parameters=None,
                work_dir='.',
                rm=True,
                detached=False,
                env=None,
                outfile=None,
                inputs=None,
                outputs=None,
                docker_parameters=None,
                check_output=False,
                mock=None,
                defer=None,
                container_name=None,
                mounts=None):
    """
    Calls Docker, passing along parameters and tool.

    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
    :param list[str] parameters: Command line arguments to be passed to the tool
    :param str work_dir: Directory to mount into the container via `-v`. Destination convention is /data
    :param bool rm: Should the container be run with the --rm flag (Should it be removed upon
           container exit)? rm and detached are mutually exclusive in Docker.  This is the flag
           passed to docker and is independent of the defer flag.  If this is set to True and
           `defer` is None, `defer` takes the value `docker_call.RM`.
    :param bool detached: Should the container be run with the --detached flag (Should it be run in
           detached mode)? See `rm` above.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param file outfile: Pipe output of Docker call to file handle
    :param list[str] inputs: A list of the input files.
    :param dict[str,str] outputs: A dictionary containing the output files as keys with either None
           or a URL as values. The values are only used if mock=True.
    :param dict[str,str] docker_parameters: Parameters to pass to docker
    :param bool check_output: When True, this function returns docker's output
    :param bool mock: Whether to run in mock mode. If this variable is unset, its value will be determined by
           the environment variable.
    :param int defer: What action should be taken on the container upon job completion?
           docker_call.FORGO will leave the container untouched.
           docker_call.STOP will attempt to stop the container with `docker stop` (useful for
           debugging).
           docker_call.RM will stop the container and then forcefully remove it from the system
           using `docker rm -f`.
           The default value is None, which behaves like docker_call.FORGO unless rm is True.
    :param str container_name: An optional name for your container.
    :param dict mounts: A dictionary of data volumes to mount into the Docker container containing host paths
           as keys and the corresponding container paths as values
    """
    from toil_lib.urls import download_url

    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    # Docker does not allow the --rm flag to be used when the container is run in detached mode.
    require(not (rm and detached), "Conflicting options 'rm' and 'detached'.")
    # Ensure the user has passed a valid value for defer
    require(
        defer in (None, docker_call.FORGO, docker_call.STOP, docker_call.RM),
        'Please provide a valid value for defer.')

    for filename in inputs:
        assert (os.path.isfile(os.path.join(work_dir, filename)))

    if mock:
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if url is None:
                # create mock file
                if not os.path.exists(file_path):
                    f = open(file_path, 'w')
                    f.write("contents")  # FIXME
                    f.close()

            else:
                file_path = os.path.join(work_dir, filename)
                if not os.path.exists(file_path):
                    outfile = download_url(job,
                                           url,
                                           work_dir=work_dir,
                                           name=filename,
                                           mock=False)
                assert os.path.exists(file_path)
        return

    if not container_name:
        container_name = _get_container_name(job)
    base_docker_call = [
        'docker', 'run', '--log-driver=none', '-v',
        '{}:/data'.format(os.path.abspath(work_dir))
    ]
    if mounts:
        require(isinstance(mounts, dict),
                "'mounts' parameter must be a dictionary object")
        for k, v in mounts.iteritems():
            base_docker_call.extend(['-v', k + ':' + v])

    # Defer the permission fixing function.  We call this explicitly later on in this function, but
    # we defer it as well to handle unexpected job failure.
    job.defer(_fix_permissions, base_docker_call, tool, work_dir)

    base_docker_call.extend(['--name', container_name])
    if rm:
        base_docker_call.append('--rm')
        if defer is None:
            defer = docker_call.RM
    elif detached:
        base_docker_call += ['-d']
    # Defer the container on-exit action
    job.defer(_docker_kill, container_name, action=defer)

    if env:
        for e, v in env.iteritems():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    if docker_parameters:
        base_docker_call += docker_parameters

    _log.debug("Calling docker with %s." %
               " ".join(base_docker_call + [tool] + parameters))

    call = base_docker_call + [tool] + parameters

    if outfile:
        subprocess.check_call(call, stdout=outfile)
    else:
        if check_output:
            return subprocess.check_output(call)
        else:
            subprocess.check_call(call)
    # Fix root ownership of output files
    _fix_permissions(base_docker_call, tool, work_dir)

    for filename in outputs.keys():
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert (os.path.isfile(filename))
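A hedged usage sketch of docker_call() follows; the image is one named in the docstring above, but the file names and the surrounding job function are placeholders rather than part of the pipeline.

def index_bam(job):
    # Assumes 'sample.bam' has already been placed in work_dir by the caller.
    work_dir = job.fileStore.getLocalTempDir()
    docker_call(job=job,
                tool='quay.io/ucsc_cgl/samtools',
                parameters=['index', '/data/sample.bam'],
                work_dir=work_dir,
                inputs=['sample.bam'],
                outputs={'sample.bam.bai': None})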
Example #15
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Dockerized Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General Usage:
    docker run -v $(pwd):$(pwd) -v /var/run/docker.sock:/var/run/docker.sock \
    quay.io/ucsc_cgl/rnaseq-cgl-pipeline --samples sample1.tar

    Please see the complete documentation located at:
    https://github.com/BD2KGenomics/cgl-docker-lib/tree/master/rnaseq-cgl-pipeline
    or inside the container at: /opt/rnaseq-pipeline/README.md


    Structure of RNA-Seq Pipeline (per sample)

                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 6 -- 8
                 \          |
                  7 ---------

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = Kallisto
    7 = FastQC
    8 = Consolidate output and upload to S3
    =======================================
    Dependencies
    Docker
    """
    # Define argument parser for command-line options
    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--sample-tar',
                        default=[],
                        action="append",
                        help='Absolute path to sample tarball.')
    parser.add_argument('--sample-single',
                        default=[],
                        action="append",
                        help='Absolute path to sample single-ended FASTQ.')
    parser.add_argument(
        '--sample-paired',
        default=[],
        action="append",
        help=
        'Absolute path to sample paired FASTQs, in the form `read1,read2,read1,read2`.'
    )
    parser.add_argument('--star',
                        type=str,
                        required=True,
                        help='Absolute path to STAR index tarball.')
    parser.add_argument('--rsem',
                        type=str,
                        required=True,
                        help='Absolute path to rsem reference tarball.')
    parser.add_argument('--kallisto',
                        type=str,
                        required=True,
                        help='Absolute path to kallisto index (.idx) file.')
    parser.add_argument(
        '--disable-cutadapt',
        action='store_true',
        default=False,
        help=
        'Cutadapt fails if samples are improperly paired. Use this flag to disable cutadapt.'
    )
    parser.add_argument(
        '--save-bam',
        action='store_true',
        default='false',
        help='If this flag is used, genome-aligned bam is written to output.')
    parser.add_argument(
        '--save-wiggle',
        action='store_true',
        default='false',
        help='If this flag is used, wiggle files (.bg) are written to output.')
    parser.add_argument(
        '--no-clean',
        action='store_true',
        help='If this flag is used, temporary work directory is not cleaned.')
    parser.add_argument(
        '--resume',
        type=str,
        default=None,
        help=
        'Pass the working directory that contains a job store to be resumed.')
    parser.add_argument(
        '--cores',
        type=int,
        default=None,
        help=
        'Will set a cap on number of cores to use, default is all available cores.'
    )
    parser.add_argument('--bamqc',
                        action='store_true',
                        default=None,
                        help='Enable BAM QC step. Disabled by default')
    parser.add_argument(
        '--work_mount',
        required=True,
        help='Mount where intermediate files should be written. This directory '
        'should be mirror mounted into the container.')
    parser.add_argument('--output-basename',
                        default="",
                        help='Base name to use for naming the output files ')
    # although we don't actually set the log level in this module, the option is propagated to toil. For this reason
    # we want the logging options to show up when we run --help
    addLoggingOptions(parser)
    toilLoggingOption = None
    for arg in sys.argv:
        if 'log' in arg:
            toilLoggingOption = arg
            sys.argv.remove(toilLoggingOption)
            break
    args = parser.parse_args()
    args.toilLoggingOption = toilLoggingOption
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Get name of most recent running container. If socket is mounted, should be this one.
    try:
        name = subprocess.check_output(
            ['docker', 'ps', '--format', '{{.Names}}']).split('\n')[0]
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            'No container detected, ensure Docker is being run with: '
            '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument. \n\n{}'
            .format(e.message))
    # Get name of mounted volume
    blob = json.loads(subprocess.check_output(['docker', 'inspect', name]))
    mounts = blob[0]['Mounts']
    # Ensure docker.sock is mounted correctly
    sock_mount = [
        x['Source'] == x['Destination'] for x in mounts
        if 'docker.sock' in x['Source']
    ]
    require(
        len(sock_mount) == 1, 'Missing socket mount. Requires the following: '
        'docker run -v /var/run/docker.sock:/var/run/docker.sock')
    work_mount = args.work_mount
    #create work_mount directories if they don't exist yet.
    cmd = ["mkdir", "-p", work_mount]
    log.info('Creating directory: %s', work_mount)
    subprocess.call(cmd)
    curr_mount = os.path.join(os.getcwd(), work_mount)
    cmd = ["mkdir", "-p", curr_mount]
    log.info('Creating directory: %s', curr_mount)
    subprocess.call(cmd)

    for samples in [args.sample_tar, args.sample_paired, args.sample_single]:
        if not samples:
            continue
        # If sample is given as relative path, assume it's in the work directory
        if not all(x.startswith('/') for x in samples):
            samples = [
                os.path.join(work_mount, x) for x in samples
                if not x.startswith('/')
            ]
            log.info(
                '\nSample given as relative path, assuming sample is in work directory: {}'
                .format(work_mount))
        # Enforce file input standards
        require(
            all(x.startswith('/') for x in samples),
            "Sample inputs must point to a file's full path, "
            "e.g. '/full/path/to/sample1.tar'. You provided %s", str(samples))
        if samples == args.sample_tar:
            log.info('TARs to run: {}'.format('\t'.join(args.sample_tar)))
        if samples == args.sample_paired:
            log.info('Paired FASTQS to run: {}'.format('\t'.join(
                args.sample_paired)))
        if samples == args.sample_single:
            log.info('Single FASTQS to run: {}'.format('\t'.join(
                args.sample_single)))
    require(
        all(x.startswith('/') for x in [args.star, args.kallisto, args.rsem]),
        "Sample inputs must point to a file's full path, "
        "e.g. '/full/path/to/kallisto_hg38.idx'.")
    # Output log information
    log.info('The work mount is: {}'.format(work_mount))
    log.info('Pipeline input locations: \n{}\n{}\n{}'.format(
        args.star, args.rsem, args.kallisto))
    call_pipeline(work_mount, args)
Example #16
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Dockerized Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General Usage:
    docker run -v $(pwd):$(pwd) -v /var/run/docker.sock:/var/run/docker.sock \
    quay.io/ucsc_cgl/rnaseq-cgl-pipeline --samples sample1.tar

    Please see the complete documentation located at:
    https://github.com/BD2KGenomics/cgl-docker-lib/tree/master/rnaseq-cgl-pipeline
    or inside the container at: /opt/rnaseq-pipeline/README.md


    Structure of RNA-Seq Pipeline (per sample)

                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 6 -- 8
                 \          |
                  7 ---------

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = Kallisto
    7 = FastQC
    8 = Consolidate output and upload to S3
    =======================================
    Dependencies
    Docker
    """
    # Define argument parser for command-line options
    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--sample-tar',
                        default=[],
                        action="append",
                        help='Absolute path to sample tarball.')
    parser.add_argument('--sample-single',
                        default=[],
                        action="append",
                        help='Absolute path to sample single-ended FASTQ.')
    parser.add_argument(
        '--sample-paired',
        nargs='*',
        default=[],
        help=
        'Absolute path to sample paired FASTQs, in the form `read1,read2,read1,read2`.'
    )
    parser.add_argument('--output-basenames',
                        nargs='*',
                        default=[],
                        help='Base names to use for naming the output files ')

    parser.add_argument('--star',
                        type=str,
                        default="",
                        help='Absolute path to STAR index tarball.')
    parser.add_argument('--rsem',
                        type=str,
                        default="",
                        help='Absolute path to rsem reference tarball.')
    parser.add_argument('--kallisto',
                        type=str,
                        default="",
                        help='Absolute path to kallisto index (.idx) file.')
    parser.add_argument('--hera',
                        type=str,
                        default="",
                        help='Absolute path to hera index (.idx) file.')
    parser.add_argument(
        '--disable-cutadapt',
        action='store_true',
        default=False,
        help=
        'Cutadapt fails if samples are improperly paired. Use this flag to disable cutadapt.'
    )
    parser.add_argument(
        '--save-bam',
        action='store_true',
        default='false',
        help='If this flag is used, genome-aligned bam is written to output.')
    parser.add_argument(
        '--save-wiggle',
        action='store_true',
        default='false',
        help='If this flag is used, wiggle files (.bg) are written to output.')
    parser.add_argument(
        '--no-clean',
        action='store_true',
        help='If this flag is used, temporary work directory is not cleaned.')
    parser.add_argument(
        '--resume',
        type=str,
        default=None,
        help=
        'Pass the working directory that contains a job store to be resumed.')
    parser.add_argument(
        '--cores',
        type=int,
        default=None,
        help=
        'Will set a cap on number of cores to use, default is all available cores.'
    )
    parser.add_argument('--bamqc',
                        action='store_true',
                        default=None,
                        help='Enable BAM QC step. Disabled by default')
    parser.add_argument(
        '--work_mount',
        required=True,
        help='Mount where intermediate files should be written. This directory '
        'should be mirror mounted into the container.')
    parser.add_argument(
        '--max-sample-size',
        default="20G",
        help='Maximum size of sample file using Toil resource requirements '
        "syntax, e.g '20G'. Standard suffixes like K, Ki, M, Mi, G or Gi are supported."
    )

    auto_scale_options = parser.add_argument_group('Auto-scaling options')
    auto_scale_options.add_argument(
        '--auto-scale',
        action='store_true',
        default=False,
        help='Enable Toil autoscaling. Disabled by default')
    auto_scale_options.add_argument(
        '--cluster-name',
        default="",
        help='Name of the Toil cluster. Usually the security group name')
    auto_scale_options.add_argument(
        '--job-store',
        default="aws:us-west-2:autoscaling-toil-rnaseq-jobstore-2",
        help='Directory in cloud where working files will be put; '
        'e.g. aws:us-west-2:autoscaling-toil-rnaseq-jobstore')
    auto_scale_options.add_argument(
        '--output-location',
        default="s3://toil-rnaseq-cloud-staging-area",
        help='Directory in cloud where output files will be put; '
        'e.g. s3://toil-rnaseq-cloud-staging-area')
    auto_scale_options.add_argument('--provisioner',
                                    default="aws",
                                    help='Cloud provisioner to use. E.g aws')
    auto_scale_options.add_argument(
        '--node-type',
        default="c3.8xlarge",
        help='Cloud worker VM type; e.g. c3.8xlarge')
    auto_scale_options.add_argument(
        '--max-nodes',
        type=int,
        default=2,
        help='Maximum worker nodes to launch. E.g. 2')
    auto_scale_options.add_argument('--credentials-id',
                                    default="",
                                    help='Credentials id')
    auto_scale_options.add_argument('--credentials-secret-key',
                                    default="",
                                    help='Credentials secret key')

    # although we don't actually set the log level in this module, the option is propagated to toil. For this reason
    # we want the logging options to show up when we run --help
    addLoggingOptions(parser)
    toilLoggingOption = '--logDebug'
    for arg in sys.argv:
        if 'log' in arg:
            toilLoggingOption = arg
            sys.argv.remove(toilLoggingOption)
            break
    args = parser.parse_args()
    args.toilLoggingOption = toilLoggingOption
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    if args.auto_scale:
        if not args.cluster_name:
            log.info(
                'Auto-scaling requires a cluster name to be input with the --cluster-name option'
            )
            parser.error(
                'Auto-scaling requires a cluster name to be input with the --cluster-name option'
            )
        if not args.credentials_id or not args.credentials_secret_key:
            log.info(
                'Auto-scaling requires provisioner credentials id and secret key'
            )
            parser.error(
                'Auto-scaling requires provisioner credentials id and secret key'
            )

    # Get name of most recent running container. If socket is mounted, should be this one.
    try:
        name = subprocess.check_output(
            ['docker', 'ps', '--format', '{{.Names}}']).split('\n')[0]
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            'No container detected, ensure Docker is being run with: '
            '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument. \n\n{}'
            .format(e.message))
    # Get name of mounted volume
    blob = json.loads(subprocess.check_output(['docker', 'inspect', name]))
    mounts = blob[0]['Mounts']
    # Ensure docker.sock is mounted correctly
    sock_mount = [
        x['Source'] == x['Destination'] for x in mounts
        if 'docker.sock' in x['Source']
    ]
    require(
        len(sock_mount) == 1, 'Missing socket mount. Requires the following: '
        'docker run -v /var/run/docker.sock:/var/run/docker.sock')
    work_mount = args.work_mount
    for samples in [args.sample_tar, args.sample_paired, args.sample_single]:
        if not samples:
            continue

        # Enforce file input standards
        if args.auto_scale:
            require(
                len(args.output_basenames) == len(samples), "There must be a "
                "unique output filename for each sample. You provided {}".
                format(args.output_basenames))

            require(all( ((x.lower().startswith('http://') or x.lower().startswith('s3://') \
                or x.lower().startswith('ftp://')) or not x) for x in samples),
            "Sample inputs must point to a file's full path, "
            "e.g. 's3://full/path/to/sample_R1.fastq.gz', and should start with "
            " file://, http://, s3://, or ftp://.  You provided %s", str(samples))
        else:
            # If sample is given as relative path, assume it's in the work directory
            if not all(x.startswith('/') for x in samples):
                samples = [
                    os.path.join(work_mount, x) for x in samples
                    if not x.startswith('/')
                ]
                log.info(
                    '\nSample given as relative path, assuming sample is in work directory: {}'
                    .format(work_mount))

            require(
                all(x.startswith('/') for x in samples),
                "Sample inputs must point to a file's full path, "
                "e.g. '/full/path/to/sample1.tar'. You provided %s",
                str(samples))
        if samples == args.sample_tar:
            log.info('TARs to run: {}'.format('\t'.join(args.sample_tar)))
        if samples == args.sample_paired:
            log.info('Paired FASTQS to run: {}'.format('\t'.join(
                args.sample_paired)))
        if samples == args.sample_single:
            log.info('Single FASTQS to run: {}'.format('\t'.join(
                args.sample_single)))

    #file paths should start with /, file://, http://, s3://, or ftp://
    if args.auto_scale:
        require(all( ((x.lower().startswith('http://') or x.lower().startswith('s3://') \
                or x.lower().startswith('ftp://')) or not x) for x in [args.star, \
                             args.kallisto, args.rsem, args.hera]),
            "Sample inputs must point to a file's full path, "
            "e.g. 's3://full/path/to/kallisto_hg38.idx', and should start with file://, http://, s3://, or ftp://.")
    else:
        #Input for star and rsem will be empty if user wants to run kallisto only so test for not x
        require(
            all((x.startswith('/') or not x)
                for x in [args.star, args.kallisto, args.rsem, args.hera]),
            "Sample inputs must point to a file's full path, "
            "e.g. '/full/path/to/kallisto_hg38.idx'")

    # Output log information
    log.info('The work mount is: {}'.format(work_mount))
    log.info('Pipeline input locations: \n{}\n{}\n{}\n{}'.format(
        args.star, args.rsem, args.kallisto, args.hera))
    call_pipeline(work_mount, args)
Example #17
    def setOptions(self, options):
        """
        Creates a config object from the options object.
        """
        from bd2k.util.humanize import human2bytes  # This import is used to convert
        # from human-readable quantities to integers
        def setOption(varName, parsingFn=None, checkFn=None):
            #If options object has the option "varName" specified
            #then set the "varName" attrib to this value in the config object
            x = getattr(options, varName, None)
            if x is not None:
                if parsingFn is not None:
                    x = parsingFn(x)
                if checkFn is not None:
                    try:
                        checkFn(x)
                    except AssertionError:
                        raise RuntimeError("The %s option has an invalid value: %s"
                                           % (varName, x))
                setattr(self, varName, x)

        # Function to parse integer from string expressed in different formats
        h2b = lambda x : human2bytes(str(x))

        def iC(minValue, maxValue=sys.maxint):
            # Returns function that checks if a given int is in the given half-open interval
            assert isinstance(minValue, int) and isinstance(maxValue, int)
            return lambda x: minValue <= x < maxValue

        def fC(minValue, maxValue=None):
            # Returns function that checks if a given float is in the given half-open interval
            assert isinstance(minValue, float)
            if maxValue is None:
                return lambda x: minValue <= x
            else:
                assert isinstance(maxValue, float)
                return lambda x: minValue <= x < maxValue

        def parseJobStore(s):
            name, rest = Toil.parseLocator(s)
            if name == 'file':
                # We need to resolve relative paths early, on the leader, because the worker process
                # may have a different working directory than the leader, e.g. under Mesos.
                return Toil.buildLocator(name, os.path.abspath(rest))
            else:
                return s

        #Core options
        setOption("jobStore", parsingFn=parseJobStore)
        #TODO: LOG LEVEL STRING
        setOption("workDir")
        if self.workDir is not None:
            self.workDir = os.path.abspath(self.workDir)
            if not os.path.exists(self.workDir):
                raise RuntimeError("The path provided to --workDir (%s) does not exist."
                                   % self.workDir)
        setOption("stats")
        setOption("cleanWorkDir")
        setOption("clean")
        if self.stats:
            if self.clean != "never" and self.clean is not None:
                raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                                   "despite the stats flag requiring "
                                   "the jobStore to be intact at the end of the run. "
                                   "Set clean to \'never\'" % self.clean)
            self.clean = "never"
        elif self.clean is None:
            self.clean = "onSuccess"

        #Restarting the workflow options
        setOption("restart")

        #Batch system options
        setOption("batchSystem")
        setOption("scale", float, fC(0.0))
        setOption("mesosMasterAddress")
        setOption("parasolCommand")
        setOption("parasolMaxBatches", int, iC(1))

        setOption("environment", parseSetEnv)

        #Autoscaling options
        setOption("provisioner")
        setOption("nodeType")
        setOption("nodeOptions")
        setOption("minNodes", int)
        setOption("maxNodes", int)
        setOption("preemptableNodeType")
        setOption("preemptableNodeOptions")
        setOption("minPreemptableNodes", int)
        setOption("maxPreemptableNodes", int)
        setOption("alphaPacking", float)
        setOption("betaInertia", float)
        setOption("scaleInterval", float)

        setOption("preemptableCompensation", float)
        require(0.0 <= self.preemptableCompensation <= 1.0,
                '--preemptableCompensation (%f) must be >= 0.0 and <= 1.0',
                self.preemptableCompensation)
        
        # Parameters to limit service jobs / detect deadlocks
        setOption("maxServiceJobs", int)
        setOption("maxPreemptableServiceJobs", int)
        setOption("deadlockWait", int)

        # Resource requirements
        setOption("defaultMemory", h2b, iC(1))
        setOption("defaultCores", float, fC(1.0))
        setOption("defaultDisk", h2b, iC(1))
        setOption("readGlobalFileMutableByDefault")
        setOption("maxCores", int, iC(1))
        setOption("maxMemory", h2b, iC(1))
        setOption("maxDisk", h2b, iC(1))
        setOption("defaultPreemptable")

        #Retrying/rescuing jobs
        setOption("retryCount", int, iC(0))
        setOption("maxJobDuration", int, iC(1))
        setOption("rescueJobsFrequency", int, iC(1))

        #Misc
        setOption("disableCaching")
        setOption("maxLogFileSize", h2b, iC(1))
        def checkSse(sseKey):
            with open(sseKey) as f:
                assert(len(f.readline().rstrip()) == 32)
        setOption("sseKey", checkFn=checkSse)
        setOption("cseKey", checkFn=checkSse)
        setOption("servicePollingInterval", float, fC(0.0))

        #Debug options
        setOption("badWorker", float, fC(0.0, 1.0))
        setOption("badWorkerFailInterval", float, fC(0.0))
Example #18
def _docker(job,
            tool,
            parameters=None,
            workDir=None,
            dockerParameters=None,
            outfile=None,
            checkOutput=False,
            defer=None):
    """
    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools).
    :param list[str] parameters: Command line arguments to be passed to the tool.
           If list of lists: list[list[str]], then treat as successive commands chained with pipe.
    :param str workDir: Directory to mount into the container via `-v`. Destination convention is /data
    :param list[str] dockerParameters: Parameters to pass to Docker. Default parameters are `--rm`,
            `--log-driver none`, and the mountpoint `-v work_dir:/data` where /data is the destination convention.
             These defaults are removed if dockerParameters is passed, so be sure to pass them if they are desired.
    :param file outfile: Pipe output of Docker call to file handle
    :param bool checkOutput: When True, this function returns docker's output.
    :param int defer: What action should be taken on the container upon job completion?
           FORGO (0) will leave the container untouched.
           STOP (1) will attempt to stop the container with `docker stop` (useful for debugging).
           RM (2) will stop the container and then forcefully remove it from the system
           using `docker rm -f`. This is the default behavior if defer is set to None.
    """
    if parameters is None:
        parameters = []
    if workDir is None:
        workDir = os.getcwd()

    # Setup the outgoing subprocess call for docker
    baseDockerCall = ['docker', 'run']
    if dockerParameters:
        baseDockerCall += dockerParameters
    else:
        baseDockerCall += [
            '--rm', '--log-driver', 'none', '-v',
            os.path.abspath(workDir) + ':/data'
        ]

    # Ensure the user has passed a valid value for defer
    require(defer in (None, FORGO, STOP, RM),
            'Please provide a valid value for defer.')

    # Get container name which is needed for _dockerKill
    try:
        if any('--name' in x for x in baseDockerCall):
            if any('--name=' in x for x in baseDockerCall):
                containerName = [
                    x.split('=')[1] for x in baseDockerCall if '--name' in x
                ][0]
            else:
                containerName = baseDockerCall[baseDockerCall.index('--name') +
                                               1]
        else:
            containerName = _getContainerName(job)
    except ValueError:
        containerName = _getContainerName(job)
        baseDockerCall.extend(['--name', containerName])
    except IndexError:
        raise RuntimeError(
            "Couldn't parse Docker's `--name=` option, check parameters: " +
            str(dockerParameters))

    # Defer the container on-exit action
    if '--rm' in baseDockerCall and defer is None:
        defer = RM
    if '--rm' in baseDockerCall and defer is not RM:
        _logger.warn(
            '--rm being passed to docker call but defer not set to dockerCall.RM, defer set to: '
            + str(defer))
    job.defer(_dockerKill, containerName, action=defer)
    # Defer the permission fixing function which will run after this job concludes.
    # We call this explicitly later on in this function, but we defer it as well to handle unexpected job failure.
    job.defer(_fixPermissions, tool, workDir)

    # Make subprocess call

    # If parameters is list of lists, treat each list as separate command and chain with pipes
    if len(parameters) > 0 and type(parameters[0]) is list:
        # When piping, all arguments now get merged into a single string to bash -c.
        # We try to support spaces in paths by wrapping them all in quotes first.
        chain_params = [
            ' '.join(p) for p in [map(pipes.quote, q) for q in parameters]
        ]
        call = baseDockerCall + [
            '--entrypoint', '/bin/bash', tool, '-c', ' | '.join(chain_params)
        ]
    else:
        call = baseDockerCall + [tool] + parameters
    _logger.info("Calling docker with " + repr(call))

    params = {}
    if outfile:
        params['stdout'] = outfile
    if checkOutput:
        callMethod = subprocess.check_output
    else:
        callMethod = subprocess.check_call

    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            out = callMethod(call, **params)

    _fixPermissions(tool=tool, workDir=workDir)
    return out
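The list-of-lists piping convention described in the docstring can be sketched as follows; the image and file names are placeholders, and the snippet assumes the image provides both samtools and wc on its PATH.

def count_mapped_reads(job, work_dir):
    # Two commands chained with a pipe inside the container:
    # samtools view -F 4 /data/sample.bam | wc -l
    return _docker(job,
                   tool='quay.io/ucsc_cgl/samtools',
                   parameters=[['samtools', 'view', '-F', '4', '/data/sample.bam'],
                               ['wc', '-l']],
                   workDir=work_dir,
                   checkOutput=True)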
Example #19
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Dockerized Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General Usage:
    docker run -v $(pwd):$(pwd) -v /var/run/docker.sock:/var/run/docker.sock \
    quay.io/ucsc_cgl/rnaseq-cgl-pipeline --samples sample1.tar

    Please see the complete documentation located at:
    https://github.com/BD2KGenomics/cgl-docker-lib/tree/master/rnaseq-cgl-pipeline
    or inside the container at: /opt/rnaseq-pipeline/README.md


    Structure of RNA-Seq Pipeline (per sample)

                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 6 -- 8
                 \          |
                  7 ---------

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = Kallisto
    7 = FastQC
    8 = Consolidate output and upload to S3
    =======================================
    Dependencies
    Docker
    """
    # Define argument parser
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--samples', nargs='+', required=True,
                        help='Absolute path(s) to sample tarballs.')
    parser.add_argument('--star', type=str, required=True,
                        help='Absolute path to STAR index tarball.')
    parser.add_argument('--rsem', type=str, required=True,
                        help='Absolute path to rsem reference tarball.')
    parser.add_argument('--kallisto', type=str, required=True,
                        help='Absolute path to kallisto index (.idx) file.')
    parser.add_argument('--disable-cutadapt', action='store_true', default=False,
                        help='Cutadapt fails if samples are improperly paired. Use this flag to disable cutadapt.')
    parser.add_argument('--save-bam', action='store_true', default=False,
                        help='If this flag is used, genome-aligned bam is written to output.')
    parser.add_argument('--save-wiggle', action='store_true', default=False,
                        help='If this flag is used, wiggle files (.bg) are written to output.')
    parser.add_argument('--no-clean', action='store_true',
                        help='If this flag is used, temporary work directory is not cleaned.')
    parser.add_argument('--resume', type=str, default=None,
                        help='Pass the working directory that contains a job store to be resumed.')
    parser.add_argument('--cores', type=int, default=None,
                        help='Will set a cap on number of cores to use, default is all available cores.')
    args = parser.parse_args()
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Get name of most recent running container. If socket is mounted, should be this one.
    try:
        name = subprocess.check_output(['docker', 'ps', '--format', '{{.Names}}']).split('\n')[0]
    except subprocess.CalledProcessError as e:
        raise RuntimeError('No container detected, ensure Docker is being run with: '
                           '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument. \n\n{}'.format(e.message))
    # Get name of mounted volume
    blob = json.loads(subprocess.check_output(['docker', 'inspect', name]))
    mounts = blob[0]['Mounts']
    # Ensure docker.sock is mounted correctly
    sock_mount = [x['Source'] == x['Destination'] for x in mounts if 'docker.sock' in x['Source']]
    require(len(sock_mount) == 1, 'Missing socket mount. Requires the following: '
                                  'docker run -v /var/run/docker.sock:/var/run/docker.sock')
    # Ensure formatting of command for 2 mount points
    if len(mounts) == 2:
        require(all(x['Source'] == x['Destination'] for x in mounts),
                'Docker Src/Dst mount points, invoked with the -v argument, '
                'must be the same if only using one mount point aside from the docker socket.')
        work_mount = [x['Source'] for x in mounts if 'docker.sock' not in x['Source']]
    else:
        # Ensure only one mirror mount exists aside from docker.sock
        mirror_mounts = [x['Source'] for x in mounts if x['Source'] == x['Destination']]
        work_mount = [x for x in mirror_mounts if 'docker.sock' not in x]
        require(len(work_mount) == 1, 'Wrong number of mirror mounts provided, see documentation.')
    # If sample is given as relative path, assume it's in the work directory
    if not all(x.startswith('/') for x in args.samples):
        args.samples = [x if x.startswith('/') else os.path.join(work_mount[0], x) for x in args.samples]
        log.info('\nSample given as relative path, assuming sample is in work directory: {}'.format(work_mount[0]))
    # Enforce file input standards
    require(all(x.startswith('/') for x in args.samples),
            "Sample inputs must point to a file's full path, "
            "e.g. '/full/path/to/sample1.tar'. You provided %s", str(args.samples))
    require(all(x.startswith('/') for x in [args.star, args.kallisto, args.rsem]),
            "Sample inputs must point to a file's full path, "
            "e.g. '/full/path/to/kallisto_hg38.idx'.")
    # Output log information
    log.info('The work mount is: {}'.format(work_mount[0]))
    log.info('Samples to run: {}'.format('\t'.join(args.samples)))
    log.info('Pipeline input locations: \n{}\n{}\n{}'.format(args.star, args.rsem, args.kallisto))
    call_pipeline(work_mount[0], args)
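As a hedged illustration of the mount validation above, here is how the Source/Destination check behaves against a hand-written `docker inspect` Mounts list; the paths are made up:

mounts = [
    {'Source': '/var/run/docker.sock', 'Destination': '/var/run/docker.sock'},
    {'Source': '/home/user/run', 'Destination': '/home/user/run'},
]
# Mirror mounts are those where the host path equals the container path
mirror_mounts = [x['Source'] for x in mounts if x['Source'] == x['Destination']]
work_mount = [x for x in mirror_mounts if 'docker.sock' not in x]
assert work_mount == ['/home/user/run']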
Example #20
0
 # If no arguments provided, print full help menu
 if len(sys.argv) == 1:
     parser.print_help()
     sys.exit(1)
 # Get name of most recent running container. If socket is mounted, should be this one.
 try:
     name = subprocess.check_output(['docker', 'ps', '--format', '{{.Names}}']).split('\n')[0]
 except subprocess.CalledProcessError as e:
     raise RuntimeError('No container detected, ensure Docker is being run with: '
                        '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument. \n\n{}'.format(e.message))
 # Get name of mounted volume
 blob = json.loads(subprocess.check_output(['docker', 'inspect', name]))
 mounts = blob[0]['Mounts']
 # Ensure docker.sock is mounted correctly
 sock_mount = [x['Source'] == x['Destination'] for x in mounts if 'docker.sock' in x['Source']]
 require(len(sock_mount) == 1, 'Missing socket mount. Requires the following: '
                               'docker run -v /var/run/docker.sock:/var/run/docker.sock')
 work_mount = args.work_mount
 for samples in [args.sample_tar, args.sample_paired, args.sample_single]:
     if not samples:
         continue
     # If sample is given as relative path, assume it's in the work directory
     if not all(x.startswith('/') for x in samples):
         samples = [x if x.startswith('/') else os.path.join(work_mount, x) for x in samples]
         log.info('\nSample given as relative path, assuming sample is in work directory: {}'.format(work_mount))
     # Enforce file input standards
     require(all(x.startswith('/') for x in samples),
             "Sample inputs must point to a file's full path, "
             "e.g. '/full/path/to/sample1.tar'. You provided %s", str(samples))
     if samples == args.sample_tar:
         log.info('TARs to run: {}'.format('\t'.join(args.sample_tar)))
     if samples == args.sample_paired:
Example #21
0
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Dockerized Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General Usage:
    docker run -v $(pwd):$(pwd) -v /var/run/docker.sock:/var/run/docker.sock \
    quay.io/ucsc_cgl/rnaseq-cgl-pipeline --samples sample1.tar

    Please see the complete documentation located at:
    https://github.com/BD2KGenomics/cgl-docker-lib/tree/master/rnaseq-cgl-pipeline
    or inside the container at: /opt/rnaseq-pipeline/README.md


    Structure of RNA-Seq Pipeline (per sample)

                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 6 -- 8
                 \          |
                  7 ---------

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = Kallisto
    7 = FastQC
    8 = Consolidate output and upload to S3
    =======================================
    Dependencies
    Docker
    """
    # Define argument parser
    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--samples',
                        nargs='+',
                        required=True,
                        help='Absolute path(s) to sample tarballs.')
    parser.add_argument('--star',
                        type=str,
                        required=True,
                        help='Absolute path to STAR index tarball.')
    parser.add_argument('--rsem',
                        type=str,
                        required=True,
                        help='Absolute path to rsem reference tarball.')
    parser.add_argument('--kallisto',
                        type=str,
                        required=True,
                        help='Absolute path to kallisto index (.idx) file.')
    parser.add_argument(
        '--disable-cutadapt',
        action='store_true',
        default=False,
        help=
        'Cutadapt fails if samples are improperly paired. Use this flag to disable cutadapt.'
    )
    parser.add_argument(
        '--save-bam',
        action='store_true',
        default=False,
        help='If this flag is used, genome-aligned bam is written to output.')
    parser.add_argument(
        '--save-wiggle',
        action='store_true',
        default=False,
        help='If this flag is used, wiggle files (.bg) are written to output.')
    parser.add_argument(
        '--no-clean',
        action='store_true',
        help='If this flag is used, temporary work directory is not cleaned.')
    parser.add_argument(
        '--resume',
        type=str,
        default=None,
        help=
        'Pass the working directory that contains a job store to be resumed.')
    parser.add_argument(
        '--cores',
        type=int,
        default=None,
        help=
        'Will set a cap on number of cores to use, default is all available cores.'
    )
    args = parser.parse_args()
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Get name of most recent running container. If socket is mounted, should be this one.
    try:
        name = subprocess.check_output(
            ['docker', 'ps', '--format', '{{.Names}}']).split('\n')[0]
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            'No container detected, ensure Docker is being run with: '
            '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument. \n\n{}'
            .format(e.message))
    # Get name of mounted volume
    blob = json.loads(subprocess.check_output(['docker', 'inspect', name]))
    mounts = blob[0]['Mounts']
    # Ensure docker.sock is mounted correctly
    sock_mount = [
        x['Source'] == x['Destination'] for x in mounts
        if 'docker.sock' in x['Source']
    ]
    require(
        len(sock_mount) == 1, 'Missing socket mount. Requires the following: '
        'docker run -v /var/run/docker.sock:/var/run/docker.sock')
    '''
    # Ensure formatting of command for 2 mount points
    if len(mounts) == 2:
        require(all(x['Source'] == x['Destination'] for x in mounts),
                'Docker Src/Dst mount points, invoked with the -v argument, '
                'must be the same if only using one mount point aside from the docker socket.')
        work_mount = [x['Source'] for x in mounts if 'docker.sock' not in x['Source']]
    else:
        # Ensure only one mirror mount exists aside from docker.sock
        mirror_mounts = [x['Source'] for x in mounts if x['Source'] == x['Destination']]
        work_mount = [x for x in mirror_mounts if 'docker.sock' not in x]
        require(len(work_mount) == 1, 'Wrong number of mirror mounts provided, see documentation.')
    '''

    #    if "TMPDIR" in os.environ:
    #        log.info('Setting work mount to TMPDIR which is: {}'.format(os.environ['TMPDIR']))
    #        work_dir = os.environ['TMPDIR']
    #    else:
    #        log.info('TMPDIR not set; setting work mount to cwd which is: {}'.format(os.getcwd()))
    #        work_dir = os.getcwd()

    #    work_mount = list(os.getenv('TMPDIR', os.getcwd()))

    # workdir is the cwd so CWL can collect the output
    work_dir = os.getcwd()

    # If sample is given as relative path, assume it's in the work directory
    if not all(x.startswith('/') for x in args.samples):
        args.samples = [
            x if x.startswith('/') else os.path.join(work_dir, x)
            for x in args.samples
        ]
        log.info(
            '\nSample given as relative path, assuming sample is in work directory: {}'
            .format(work_dir))
    # Enforce file input standards
    require(
        all(x.startswith('/') for x in args.samples),
        "Sample inputs must point to a file's full path, "
        "e.g. '/full/path/to/sample1.tar'. You provided %s", str(args.samples))
    require(
        all(x.startswith('/') for x in [args.star, args.kallisto, args.rsem]),
        "Sample inputs must point to a file's full path, "
        "e.g. '/full/path/to/kallisto_hg38.idx'.")
    # Output log information
    log.info('The work mount is: {}'.format(work_dir))
    #    log.info('The work mount is: {}'.format(work_mount[0]))
    log.info('Samples to run: {}'.format('\t'.join(args.samples)))
    log.info('Pipeline input locations: \n{}\n{}\n{}'.format(
        args.star, args.rsem, args.kallisto))
    call_pipeline(work_dir, args)
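A small sketch of the relative-path handling used above: any sample not given as an absolute path is assumed to live in the work directory and is re-rooted there (the paths are illustrative only):

import os

work_dir = '/home/user/run'
samples = ['sample1.tar', '/data/sample2.tar']
samples = [x if x.startswith('/') else os.path.join(work_dir, x) for x in samples]
assert samples == ['/home/user/run/sample1.tar', '/data/sample2.tar']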
Example #22
0
 def resume(self):
     if not os.path.exists(self.jobStoreDir):
         raise NoSuchJobStoreException(self.jobStoreDir)
     require(os.path.isdir(self.jobStoreDir), "'%s' is not a directory", self.jobStoreDir)
     super(FileJobStore, self).resume()
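`require` is used throughout these examples but is not defined in them; a minimal stand-in consistent with how it is called here (raise with a %-formatted message when the condition is falsy) might look like this:

class UserError(Exception):
    pass

def require(expression, message, *args):
    # Raise a UserError, %-formatting the message with any extra args, if expression is falsy.
    if not expression:
        raise UserError(message % args if args else message)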
Example #23
0
def _docker(job,
            tool,
            parameters=None,
            workDir=None,
            dockerParameters=None,
            outfile=None,
            checkOutput=False,
            defer=None):
    """
    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools).
    :param list[str] parameters: Command line arguments to be passed to the tool.
           If list of lists: list[list[str]], then treat as successive commands chained with pipe.
    :param str workDir: Directory to mount into the container via `-v`. Destination convention is /data
    :param list[str] dockerParameters: Parameters to pass to Docker. Default parameters are `--rm`,
            `--log-driver none`, and the mountpoint `-v work_dir:/data` where /data is the destination convention.
             These defaults are removed if dockerParameters is passed, so be sure to pass them if they are desired.
    :param file outfile: Pipe output of Docker call to file handle
    :param bool checkOutput: When True, this function returns docker's output.
    :param int defer: What action should be taken on the container upon job completion?
           FORGO (0) will leave the container untouched.
           STOP (1) will attempt to stop the container with `docker stop` (useful for debugging).
           RM (2) will stop the container and then forcefully remove it from the system
           using `docker rm -f`. This is the default behavior if defer is set to None.
    """
    if parameters is None:
        parameters = []
    if workDir is None:
        workDir = os.getcwd()

    # Setup the outgoing subprocess call for docker
    baseDockerCall = ['docker', 'run']
    if dockerParameters:
        baseDockerCall += dockerParameters
    else:
        baseDockerCall += ['--rm', '--log-driver', 'none', '-v',
                           os.path.abspath(workDir) + ':/data']

    # Ensure the user has passed a valid value for defer
    require(defer in (None, FORGO, STOP, RM),
            'Please provide a valid value for defer.')

    # Get container name which is needed for _dockerKill
    try:
        if any('--name' in x for x in baseDockerCall):
            if any('--name=' in x for x in baseDockerCall):
                containerName = [x.split('=')[1] for x in baseDockerCall if '--name' in x][0]
            else:
                containerName = baseDockerCall[baseDockerCall.index('--name') + 1]
        else:
            containerName = _getContainerName(job)
            baseDockerCall.extend(['--name', containerName])
    except ValueError:
        containerName = _getContainerName(job)
        baseDockerCall.extend(['--name', containerName])
    except IndexError:
        raise RuntimeError("Couldn't parse Docker's `--name=` option, check parameters: " + str(dockerParameters))

    # Defer the container on-exit action
    if '--rm' in baseDockerCall and defer is None:
        defer = RM
    if '--rm' in baseDockerCall and defer is not RM:
        _logger.warn('--rm being passed to docker call but defer not set to dockerCall.RM, defer set to: ' + str(defer))
    job.defer(_dockerKill, containerName, action=defer)
    # Defer the permission fixing function which will run after this job concludes.
    # We call this explicitly later on in this function, but we defer it as well to handle unexpected job failure.
    job.defer(_fixPermissions, tool, workDir)

    # Make subprocess call

    # If parameters is list of lists, treat each list as separate command and chain with pipes
    if len(parameters) > 0 and type(parameters[0]) is list:
        # When piping, all arguments now get merged into a single string to bash -c.
        # We try to support spaces in paths by wrapping them all in quotes first.
        chain_params = [' '.join(p) for p in [list(map(pipes.quote, q)) for q in parameters]]
        # Use bash's set -eo pipefail to detect and abort on a failure in any command in the chain
        call = baseDockerCall + ['--entrypoint', '/bin/bash',  tool, '-c',
                                 'set -eo pipefail && {}'.format(' | '.join(chain_params))]
    else:
        call = baseDockerCall + [tool] + parameters
    _logger.info("Calling docker with " + repr(call))

    params = {}
    if outfile:
        params['stdout'] = outfile
    if checkOutput:
        callMethod = subprocess.check_output
    else:
        callMethod = subprocess.check_call

    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            out = callMethod(call, **params)

    _fixPermissions(tool=tool, workDir=workDir)
    return out
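A hedged usage sketch of `_docker` (the `job` object, image tag, and file paths are placeholders; `STOP` is the module-level defer constant referenced in the docstring): first a single command, then a piped chain that ends up inside `bash -c` as shown above.

# Single command: runs `samtools faidx /data/ref.fa` inside the container;
# with no dockerParameters the default `--rm ... -v workDir:/data` mount is used.
_docker(job, tool='quay.io/ucsc_cgl/samtools',
        parameters=['faidx', '/data/ref.fa'],
        workDir='/tmp/work')

# Piped chain with an explicit dockerParameters list (no --rm), so defer=STOP
# can keep the stopped container around for debugging. Each inner list is one
# command; they are joined with '|' under `set -eo pipefail`.
_docker(job, tool='quay.io/ucsc_cgl/samtools',
        parameters=[['samtools', 'view', '-b', '/data/input.sam'],
                    ['samtools', 'sort', '-', '/data/output']],
        workDir='/tmp/work',
        dockerParameters=['--log-driver', 'none', '-v', '/tmp/work:/data'],
        defer=STOP)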