Esempio n. 1
0
class PipelineCommandWrapper(PipelineCommand):
    """
    Build a program command line without subclassing

    This is a subclass of PipelineCommand which can be
    instantiated and used as-is, instead of requiring a
    dedicated subclass for each program.

    For example, wrapping 'ls' in a single step:

    >>> ls_command = PipelineCommandWrapper("List directory",'ls',dirn)

    The command line can also be assembled incrementally
    via the 'add_args' method:

    >>> ls_command = PipelineCommandWrapper("List directory",'ls')
    >>> ls_command.add_args(dirn)
    """
    def __init__(self, name, *args):
        """
        Create a new PipelineCommandWrapper instance

        Arguments:
          name (str): arbitrary name for the command
            (stored in its string form)
          args  (List): optional initial arguments making
            up the command; if none are given then no
            underlying command is created yet
        """
        PipelineCommand.__init__(self, *args)
        self._name = str(name)
        # The underlying Command only exists once there is at
        # least one argument to build it from
        self._cmd = Command(*args) if args else None

    def add_args(self, *args):
        """
        Extend the command line with further arguments

        Arguments:
          args  (List): one or more arguments to append; if
            no command exists yet then these arguments are
            used to create it
        """
        if self._cmd is not None:
            self._cmd.add_args(*args)
            return
        self._cmd = Command(*args)

    def init(self, *args):
        """
        Internal: placeholder 'init' which ignores its
        arguments and does nothing
        """

    def cmd(self):
        """
        Internal: implement the 'cmd' method

        Returns the stored command (None if no arguments
        have been supplied yet).
        """
        return self._cmd
Esempio n. 2
0
    def add_args(self, *args):
        """
        Append further arguments to the command under construction

        Arguments:
          args  (List): one or more arguments to append; when
            no command has been created yet they are used to
            create it instead
        """
        existing = self._cmd
        if existing is not None:
            existing.add_args(*args)
        else:
            # First arguments seen: these start the command
            self._cmd = Command(*args)
Esempio n. 3
0
 def info_func(p):
     """
     Collect the executable, upper-cased name and version for a program

     The program located via 'find_program' is run with no
     arguments, and the second element of the output tuple
     from 'subprocess_check_output' is searched for the first
     line starting with the program's base name. The second
     whitespace-separated field of that line is taken as the
     version.

     Arguments:
       p (str): name of, or path to, the program

     Returns:
       Tuple: (exe, NAME, version), where 'version' is an
         empty string if it couldn't be determined
     """
     name = os.path.basename(p)
     exe = find_program(p)
     program_output = Command(exe).subprocess_check_output()[1]
     # Only the first line beginning with the program name matters
     first_match = next((text for text in program_output.split('\n')
                         if text.startswith(name)), None)
     fields = first_match.split() if first_match is not None else []
     version = fields[1] if len(fields) > 1 else ''
     return (exe,name.upper(),version)
Esempio n. 4
0
    def __init__(self, name, *args):
        """
        Set up a new PipelineCommandWrapper

        Arguments:
          name (str): arbitrary name for the command
            (stored in its string form)
          args  (List): optional initial arguments making
            up the command
        """
        PipelineCommand.__init__(self, *args)
        self._name = str(name)
        # Without any arguments there is nothing to build the
        # command from yet, so leave it unset
        self._cmd = Command(*args) if args else None
Esempio n. 5
0
def fastq_screen_tag(conf_file,
                     fastq_in,
                     out_dir,
                     aligner=None,
                     threads=1,
                     tempdir=None):
    """
    Run 'fastq_screen' and output tagged fastq file

    The screen is run inside a temporary working directory,
    which is always removed before the function returns or
    raises.

    Arguments:
      conf_file (str): path to the fastq_screen .conf file
      fastq_in (str): path to the FASTQ file to screen
      out_dir (str): path to the output directory to put
        the tagged FASTQ in
      aligner (str): optional, name of the aligner to pass
        to fastq_screen (default: don't specify the aligner)
      threads (int): optional, the number of threads to
        use when running fastq_screen (default: 1)
      tempdir (str): optional, directory to create temporary
        working directories in when running fastq_screen

    Returns:
      String: path to the tagged output FASTQ file

    Raises:
      Exception: if fastq_screen finishes with a non-zero
        exit code, or if the expected tagged FASTQ file
        wasn't produced
    """
    # Make a temporary working directory
    work_dir = tempfile.mkdtemp(suffix='.fastq_screen', dir=tempdir)
    try:
        # Build fastq_screen command
        fastq_screen_cmd = Command('fastq_screen', '--subset', 0,
                                   '--threads', threads,
                                   '--conf', conf_file, '--tag',
                                   '--outdir', work_dir)
        if aligner is not None:
            fastq_screen_cmd.add_args('--aligner', aligner)
        fastq_screen_cmd.add_args(fastq_in)
        print("Running %s" % fastq_screen_cmd)
        # Run the command
        exit_code = fastq_screen_cmd.run_subprocess(working_dir=work_dir)
        if exit_code != 0:
            raise Exception("Screening %s against %s failed (exit code %d)"
                            % (fastq_in, conf_file, exit_code))
        # Handle the outputs
        tagged_fastq = os.path.basename(strip_ext(fastq_in, '.fastq')) \
                       + '.tagged.fastq'
        tagged_src = os.path.join(work_dir, tagged_fastq)
        if not os.path.exists(tagged_src):
            raise Exception("Failed to generated tagged fastq file %s" %
                            tagged_fastq)
        tagged_dest = os.path.join(out_dir, tagged_fastq)
        # shutil.move also copes with the working directory and
        # the output directory being on different filesystems
        shutil.move(tagged_src, tagged_dest)
    finally:
        # Clean up working directory regardless of the outcome
        shutil.rmtree(work_dir)
    # Return path to tagged file
    return tagged_dest
Esempio n. 6
0
def batch_fastqs(fastqs,batch_size,basename="batched",
                 out_dir=None):
    """
    Splits reads from one or more Fastqs into batches

    Concatenates input Fastq files (using 'zcat' if the
    first Fastq name ends with '.gz', 'cat' otherwise) and
    then splits the reads into smaller Fastqs using the
    external 'split' utility.

    Arguments:
      fastqs (list): list of paths to one or more Fastq
        files to take reads from
      batch_size (int): number of reads to allocate to
        each batch
      basename (str): optional basename to use for the
        output Fastq files (default: 'batched')
      out_dir (str): optional path to a directory where
        the batched Fastqs will be written (defaults to
        the current working directory)

    Returns:
      List: paths of the batched Fastq files, named
        '<basename>.B<NNN>.r<read_number>.fastq'

    Raises:
      Exception: if the batching script finishes with a
        non-zero exit code
    """
    # Fall back to the current directory for outputs
    if out_dir is None:
        out_dir = os.getcwd()

    # Determine number of batches (integer division rounded up,
    # so that a final partial batch is counted)
    nreads = get_read_count(fastqs)
    nbatches = nreads // batch_size
    if nbatches*batch_size < nreads:
        nbatches += 1
    print("Creating %d batches of %d reads" % (nbatches,
                                               batch_size))
    assert(batch_size*nbatches >= nreads)

    # Check if fastqs are compressed (judged on the first one)
    gzipped = fastqs[0].endswith('.gz')
    if gzipped:
        batch_cmd = Command('zcat')
    else:
        batch_cmd = Command('cat')

    # Get the read number
    read_number = get_read_number(fastqs[0])
    suffix = ".r%s.fastq" % read_number

    # Build and run the batching command; each read is 4 lines
    # of Fastq, hence the line count passed to 'split'
    batch_cmd.add_args(*fastqs)
    batch_cmd.add_args('|',
                       'split',
                       '-l',batch_size*4,
                       '-d',
                       '-a',3,
                       '--additional-suffix=%s' % suffix,
                       '-',
                       os.path.join(out_dir,"%s.B" % basename))
    # The pipeline is written to a script and run via bash
    batch_script = os.path.join(out_dir,"batch.sh")
    batch_cmd.make_wrapper_script("/bin/bash",
                                  batch_script)

    # Check for successful exit code
    retcode = Command("/bin/bash",
                      batch_script).run_subprocess(
                          working_dir=out_dir)
    if retcode != 0:
        raise Exception("Batching failed: exit code %s" % retcode)
    print("Batching completed")

    # Collect and return the batched Fastq names (these mirror
    # the 3-digit numeric suffixes requested from 'split')
    batched_fastqs = [os.path.join(out_dir,
                                   "%s.B%03d%s"
                                   % (basename,i,suffix))
                      for i in range(nbatches)]
    return batched_fastqs
Esempio n. 7
0
    print "QC report: %s" % out_file

    # Run the QC
    announce("Running QC")
    max_jobs = __settings.general.max_concurrent_jobs
    sched = SimpleScheduler(runner=qc_runner,
                            max_concurrent=max_jobs)
    sched.start()
    for sample in samples:
        print "Checking/setting up for sample '%s'" % sample.name
        for fq in sample.fastq:
            if sample.verify_qc(qc_dir,fq):
                print "-- %s: QC ok" % fq
            else:
                print "-- %s: setting up QC" % fq
                qc_cmd = Command('illumina_qc.sh',fq)
                if args.nthreads > 1:
                    qc_cmd.add_args('--threads',args.nthreads)
                qc_cmd.add_args('--subset',args.fastq_screen_subset,
                                '--qc_dir',qc_dir)
                job = sched.submit(qc_cmd,
                                   wd=project.dirn,
                                   name="%s.%s" % (qc_base,
                                                   os.path.basename(fq)),
                                   log_dir=log_dir)
                print "Job: %s" % job
    # Wait for the scheduler to run all jobs
    sched.wait()
    sched.stop()

    # Verify the QC
 def cmd(self):
     """
     Build the command: 'echo' applied to the stored text
     """
     echo_command = Command("echo", self._txt)
     return echo_command
Esempio n. 9
0
class PipelineTask(object):
    """
    Base class defining a 'task' to run as part of a pipeline

    A 'task' wraps one or more external programs which can
    be run concurrently, and which produces a set of outputs.
    Individual programs should be wrapped in instances of the
    'PipelineCommand' class.

    This class should be subclassed to implement the 'init',
    'setup', 'finish' (optionally) and 'output' methods.

    The 'add_cmd' method can be used within 'setup' to add one
    or more 'PipelineCommand' instances.

    """
    def __init__(self, _name, *args, **kws):
        """
        Create a new PipelineTask instance

        Arguments:
          _name (str): an arbitrary user-friendly name for the
            task instance
          args (List): list of arguments to be supplied to
            the subclass (must match those defined in the
            'init' method)
          kws (Dictionary): dictionary of keyword-value pairs
            to be supplied to the subclass (must match those
            defined in the 'init' method)

        Raises:
          Exception: re-raises whatever exception occurred
            when matching the supplied arguments against the
            signature of the 'init' method
        """
        self._name = str(_name)
        self._args = args
        self._kws = kws
        self._commands = []
        # Unique name: sanitized user-supplied name plus a UUID
        self._task_name = "%s.%s" % (sanitize_name(self._name), uuid.uuid4())
        self._completed = False
        self._stdout_files = []
        self._exit_code = 0
        # Working directory
        self._working_dir = None
        # Running jobs
        self._jobs = []
        self._groups = []
        # Deal with subclass arguments
        try:
            self._callargs = inspect.getcallargs(self.init, *args, **kws)
        except Exception as ex:
            logger.error("Exception setting up args for task '%s' (%s): %s" %
                         (self._name, self.__class__, ex))
            # Bare 'raise' keeps the original traceback intact
            raise
        # Drop the 'self' entry, if there is one
        self._callargs.pop('self', None)
        # Execute the init method
        self.invoke(self.init, self._args, self._kws)

    @property
    def args(self):
        """
        Fetch parameters supplied to the instance

        Returns:
          AttributeDictionary: the arguments mapped onto the
            parameter names of the 'init' method
        """
        return AttributeDictionary(**self._callargs)

    @property
    def completed(self):
        """
        Check if the task has completed
        """
        return self._completed

    @property
    def exit_code(self):
        """
        Get the exit code for completed task

        Returns:
          Integer: exit code, or 'None' if task hasn't completed
        """
        if not self.completed:
            return None
        return self._exit_code

    @property
    def stdout(self):
        """
        Get the standard output from the task

        Returns:
          String: standard output from the task (the
            concatenated contents of the log files collected
            from the task's jobs).
        """
        stdout = []
        for f in self._stdout_files:
            with open(f, 'r') as fp:
                stdout.append(fp.read())
        return ''.join(stdout)

    def name(self):
        """
        Get the name of the task within the pipeline

        Returns:
          String: a name consisting of a 'sanitized' version
            of the supplied name appended with a unique id
            code
        """
        return self._task_name

    def fail(self, exit_code=1, message=None):
        """
        Register the task as failing

        Intended to be invoked from the subclassed 'setup'
        or 'finish' methods, to terminate the task and
        indicate that it has failed.

        Arguments:
          exit_code (int): optional, specifies the exit code
            to return (defaults to 1)
          message (str): optional, error message to report to
            the pipeline user
        """
        if message:
            self.report("failed: %s" % message)
        self.report("failed: exit code set to %s" % exit_code)
        self._completed = True
        self._exit_code = exit_code

    def report(self, s):
        """
        Internal: report messages from the task

        Messages are printed prefixed with a timestamp and
        the task name.
        """
        print("%s [Task: %s] %s" % (time.strftime("%Y-%m-%d %H:%M:%S"),
                                    self._name, s))

    def invoke(self, f, args=None, kws=None):
        """
        Internal: invoke arbitrary method on the task

        The method is run from the task's working directory
        (if one has been set); the previous directory is
        always restored afterwards. A NotImplementedError
        from the method is silently ignored; any other
        exception is reported and increments the task's exit
        code.

        Arguments:
          f (function): method to invoke (e.g. 'self.init')
          args (list): arguments to invoke function with
          kws (dictionary): keyworded parameters to invoke
            function with (only used if 'args' is supplied)
        """
        # Switch to working directory, if defined
        previous_dir = None
        if self._working_dir is not None:
            previous_dir = os.getcwd()
            os.chdir(self._working_dir)
        # Invoke the requested method
        try:
            with Capturing() as output:
                if args is None:
                    f()
                else:
                    f(*args, **(kws or {}))
            self.report("done '%s'" % f.__name__)
            for line in output:
                self.report("%s STDOUT: %s" % (f.__name__, line))
        except NotImplementedError:
            pass
        except Exception as ex:
            self.report("exception invoking '%s': %s" % (f.__name__, ex))
            # print_exc reports the exception currently being
            # handled; its first parameter is a depth limit, so
            # the exception must not be passed in
            traceback.print_exc()
            self._exit_code += 1
        finally:
            # Switch back to original directory
            if previous_dir is not None:
                os.chdir(previous_dir)

    def task_completed(self, name, jobs, sched):
        """
        Internal: callback method

        This is a callback method which is invoked when
        scheduled jobs in the task finish

        Arguments:
          name (str): name for the callback
          jobs (list): list of SchedulerJob instances
          sched (SimpleScheduler): scheduler instance
        """
        for job in jobs:
            try:
                if job.exit_code != 0:
                    self._exit_code += 1
                self._stdout_files.append(job.log)
            except AttributeError:
                # Assume it's a group
                for j in job.jobs:
                    if j.exit_code != 0:
                        self._exit_code += 1
                    self._stdout_files.append(j.log)
        self.finish_task()

    def finish_task(self):
        """
        Internal: perform actions to finish the task

        The 'finish' method is only invoked if no failures
        have been recorded for the task.
        """
        if self._exit_code != 0:
            logger.critical("%s failed: exit code %s" %
                            (self._name, self._exit_code))
        else:
            # Execute 'finish', if implemented
            self.invoke(self.finish)
        # Flag job as completed
        self._completed = True
        self.report("%s completed" % self._name)

    def add_cmd(self, pipeline_job):
        """
        Add a PipelineCommand to the task

        Arguments:
           pipeline_job (PipelineCommand): a PipelineCommand
             instance to be executed by the task when it
             runs
        """
        self._commands.append(pipeline_job)

    def run(self,
            sched=None,
            runner=None,
            working_dir=None,
            log_dir=None,
            scripts_dir=None,
            wait_for=(),
            asynchronous=True,
            **kws):
        """
        Run the task

        This method is not normally invoked directly; instead
        it's called by the pipeline that the task has been
        added to.

        Arguments:
          sched (SimpleScheduler): scheduler to submit jobs to
          runner (JobRunner): job runner to use when running
            jobs via the scheduler
          working_dir (str): path to the working directory to use
            (defaults to the current working directory)
          log_dir (str): path to the directory to write logs to
            (defaults to the working directory)
          scripts_dir (str): path to the directory to write
            scripts to (defaults to the working directory)
          wait_for (list): deprecated: list of scheduler jobs to
            wait for before running jobs from this task
          asynchronous (bool): if False then block until the
            task has completed
          async (bool): legacy keyword-only alias for
            'asynchronous' ('async' can't be used as a
            parameter name from Python 3.7 onwards, so it is
            picked up from the keyword arguments instead)

        Returns:
          PipelineTask: the task instance itself

        Raises:
          TypeError: if unrecognised keyword arguments are
            supplied
        """
        # Handle the legacy 'async' keyword
        if 'async' in kws:
            asynchronous = kws.pop('async')
        if kws:
            raise TypeError("run() got unexpected keyword argument(s): %s"
                            % ', '.join(sorted(kws)))
        # Initialise
        if working_dir is None:
            working_dir = os.getcwd()
        self._working_dir = os.path.abspath(working_dir)
        if scripts_dir is None:
            scripts_dir = self._working_dir
        if log_dir is None:
            log_dir = self._working_dir
        # Do setup
        self.invoke(self.setup)
        # Generate commands to run: each command is written to
        # a wrapper script which is then executed via bash
        cmds = []
        for command in self._commands:
            self.report("%s" % command.cmd())
            script_file = command.make_wrapper_script(scripts_dir=scripts_dir)
            cmd = Command('/bin/bash', script_file)
            self.report("wrapper script %s" % script_file)
            cmds.append(cmd)
        # Run the commands
        if cmds:
            use_group = (len(cmds) != 1)
            if use_group:
                # Run as a group
                group = sched.group(self.name())
                for j, cmd in enumerate(cmds):
                    name = "%s#%s" % (self.name(), j)
                    group.add(cmd,
                              wd=self._working_dir,
                              name=name,
                              runner=runner,
                              log_dir=log_dir,
                              wait_for=wait_for)
                group.close()
                callback_name = group.name
                self._groups.append(group)
            else:
                # Run a single job
                cmd = cmds[0]
                name = self.name()
                job = sched.submit(cmd,
                                   wd=self._working_dir,
                                   name=name,
                                   runner=runner,
                                   log_dir=log_dir,
                                   wait_for=wait_for)
                callback_name = job.name
                self._jobs.append(job)
            # Set up a callback which the scheduler will invoke
            # in background when the jobs complete
            sched.callback("%s" % self._name,
                           self.task_completed,
                           wait_for=(callback_name, ))
            if not asynchronous:
                # Wait for job or group to complete before returning
                while not self.completed:
                    time.sleep(5)
        else:
            # No commands to execute
            self.finish_task()
        return self