def fastq_screen_tag(conf_file, fastq_in, out_dir, aligner=None, threads=1,
                     tempdir=None):
    """
    Run 'fastq_screen' and output tagged fastq file

    Raises an Exception in the event of an error.

    Arguments:
      conf_file (str): path to the fastq_screen .conf file
      fastq_in (str): path to the FASTQ file to screen
      out_dir (str): path to the output directory to put
        the tagged FASTQ in
      aligner (str): optional, name of the aligner to pass
        to fastq_screen (default: don't specify the aligner)
      threads (int): optional, the number of threads to use
        when running fastq_screen (default: 1)
      tempdir (str): optional, directory to create temporary
        working directories in when running fastq_screen

    Returns:
      String: path to the tagged output FASTQ file
    """
    # Make a temporary working directory
    work_dir = tempfile.mkdtemp(suffix='.fastq_screen', dir=tempdir)
    # Build fastq_screen command
    fastq_screen_cmd = Command('fastq_screen',
                               '--subset', 0,
                               '--threads', threads,
                               '--conf', conf_file,
                               '--tag',
                               '--outdir', work_dir)
    # Bug fix: use the 'aligner' parameter (the original referenced
    # the undefined name 'args.aligner', raising NameError here)
    if aligner is not None:
        fastq_screen_cmd.add_args('--aligner', aligner)
    fastq_screen_cmd.add_args(fastq_in)
    print("Running %s" % fastq_screen_cmd)
    # Run the command in the temporary working directory
    exit_code = fastq_screen_cmd.run_subprocess(working_dir=work_dir)
    if exit_code != 0:
        err_msg = "Screening %s against %s failed (exit code %d)" % \
                  (fastq_in, conf_file, exit_code)
    else:
        # Handle the outputs: fastq_screen --tag writes
        # <basename>.tagged.fastq into the output directory
        tagged_fastq = os.path.basename(strip_ext(fastq_in, '.fastq')) \
                       + '.tagged.fastq'
        if not os.path.exists(os.path.join(work_dir, tagged_fastq)):
            err_msg = "Failed to generate tagged fastq file %s" % \
                      tagged_fastq
            exit_code = 1
        else:
            # Move the tagged file out of the temporary dir
            os.rename(os.path.join(work_dir, tagged_fastq),
                      os.path.join(out_dir, tagged_fastq))
    # Clean up working directory
    shutil.rmtree(work_dir)
    # Raise exception if there was a problem
    if exit_code != 0:
        raise Exception(err_msg)
    # Return path to tagged file
    return os.path.join(out_dir, tagged_fastq)
class PipelineCommandWrapper(PipelineCommand):
    """
    Class for constructing program command lines

    Behaves like PipelineCommand but can be instantiated
    directly, without subclassing. For example, to wrap the
    'ls' command:

    >>> ls_command = PipelineCommandWrapper("List directory",'ls',dirn)

    Additional arguments can be appended later via the
    'add_args' method:

    >>> ls_command = PipelineCommandWrapper("List directory",'ls')
    >>> ls_command.add_args(dirn)
    """
    def __init__(self, name, *args):
        """
        Create a new PipelineCommandWrapper instance

        Arguments:
          name (str): arbitrary name for the command
          args (List): initial list of arguments making
            up the command
        """
        PipelineCommand.__init__(self, *args)
        self._name = str(name)
        # Only build the underlying Command if initial
        # arguments were supplied
        self._cmd = Command(*args) if args else None

    def add_args(self, *args):
        """
        Add additional arguments to extend the command being built

        Arguments:
          args (List): one or more arguments to append to
            the command
        """
        if self._cmd is not None:
            self._cmd.add_args(*args)
        else:
            # No command built yet: start one from these args
            self._cmd = Command(*args)

    def init(self, *args):
        """
        Internal: dummy init which does nothing
        """
        pass

    def cmd(self):
        """
        Internal: implement the 'cmd' method
        """
        return self._cmd
def batch_fastqs(fastqs, batch_size, basename="batched", out_dir=None):
    """
    Splits reads from one or more Fastqs into batches

    Concatenates input Fastq files and then splits reads into
    smaller Fastqs using the external 'split' utility.

    Arguments:
      fastqs (list): list of paths to one or more Fastq
        files to take reads from
      batch_size (int): number of reads to allocate to
        each batch
      basename (str): optional basename to use for the
        output Fastq files (default: 'batched')
      out_dir (str): optional path to a directory where
        the batched Fastqs will be written (default: the
        current working directory)

    Returns:
      List: paths to the batched output Fastq files.
    """
    # Robustness fix: original crashed on the default out_dir=None
    # (os.path.join(None,...) raises TypeError)
    if out_dir is None:
        out_dir = os.getcwd()
    # Determine number of batches; use explicit floor division so the
    # ceiling calculation stays correct under Python 3 (plain '/' would
    # yield a float there)
    nreads = get_read_count(fastqs)
    nbatches = nreads // batch_size
    if nbatches * batch_size < nreads:
        nbatches += 1
    print("Creating %d batches of %d reads" % (nbatches, batch_size))
    assert(batch_size * nbatches >= nreads)
    # Check if fastqs are compressed
    gzipped = fastqs[0].endswith('.gz')
    if gzipped:
        batch_cmd = Command('zcat')
    else:
        batch_cmd = Command('cat')
    # Get the read number (R1/R2) to build the output suffix
    read_number = get_read_number(fastqs[0])
    suffix = ".r%s.fastq" % read_number
    # Build and run the batching command: concatenate all inputs and
    # pipe into 'split' (4 lines per FASTQ read, hence batch_size*4)
    batch_cmd.add_args(*fastqs)
    batch_cmd.add_args('|',
                       'split',
                       '-l', batch_size * 4,
                       '-d',
                       '-a', 3,
                       '--additional-suffix=%s' % suffix,
                       '-',
                       os.path.join(out_dir, "%s.B" % basename))
    batch_script = os.path.join(out_dir, "batch.sh")
    batch_cmd.make_wrapper_script("/bin/bash", batch_script)
    # Check for successful exit code
    retcode = Command("/bin/bash", batch_script).run_subprocess(
        working_dir=out_dir)
    if retcode != 0:
        raise Exception("Batching failed: exit code %s" % retcode)
    print("Batching completed")
    # Collect and return the batched Fastq names ('split' numbers the
    # outputs 000,001,... because of '-d -a 3')
    batched_fastqs = [os.path.join(out_dir,
                                   "%s.B%03d%s" % (basename, i, suffix))
                      for i in range(nbatches)]
    return batched_fastqs
# Run the QC announce("Running QC") max_jobs = __settings.general.max_concurrent_jobs sched = SimpleScheduler(runner=qc_runner, max_concurrent=max_jobs) sched.start() for sample in samples: print "Checking/setting up for sample '%s'" % sample.name for fq in sample.fastq: if sample.verify_qc(qc_dir,fq): print "-- %s: QC ok" % fq else: print "-- %s: setting up QC" % fq qc_cmd = Command('illumina_qc.sh',fq) if args.nthreads > 1: qc_cmd.add_args('--threads',args.nthreads) qc_cmd.add_args('--subset',args.fastq_screen_subset, '--qc_dir',qc_dir) job = sched.submit(qc_cmd, wd=project.dirn, name="%s.%s" % (qc_base, os.path.basename(fq)), log_dir=log_dir) print "Job: %s" % job # Wait for the scheduler to run all jobs sched.wait() sched.stop() # Verify the QC announce("Verifying QC") qc_ok = True