def batch_fastqs(fastqs,batch_size,basename="batched",
                 out_dir=None):
    """
    Splits reads from one or more Fastqs into batches

    Concatenates the input Fastq files and then splits the
    reads into smaller Fastqs by piping them to the external
    'split' utility. The pipeline is written to a 'batch.sh'
    wrapper script in the output directory, which is then
    executed with '/bin/bash'.

    'zcat' is used for the concatenation if the name of the
    first Fastq ends with '.gz', otherwise 'cat' is used (only
    the first file is inspected).

    Arguments:
      fastqs (list): list of paths to one or more Fastq
        files to take reads from
      batch_size (int): number of reads to allocate to
        each batch (must be at least 1)
      basename (str): optional basename to use for the
        output Fastq files (default: 'batched')
      out_dir (str): optional path to a directory where
        the batched Fastqs will be written (defaults to
        the current working directory)

    Returns:
      List: paths to the batched Fastq files, named
        '<basename>.B<NNN>.r<read_number>.fastq' where
        NNN is the zero-padded batch index (starting
        from 000).

    Raises:
      ValueError: if no Fastqs are supplied, or if the
        batch size is less than 1.
      Exception: if the batching script finishes with a
        non-zero exit code.
    """
    # Reject unusable inputs up front, before doing any work
    if not fastqs:
        raise ValueError("No Fastq files supplied for batching")
    if batch_size < 1:
        raise ValueError("Batch size must be at least 1 (got %s)"
                         % batch_size)
    # Fall back to the current directory if none was specified
    # (os.path.join cannot handle None)
    if out_dir is None:
        out_dir = os.getcwd()
    # Determine number of batches (integer ceiling division, so
    # that a final partial batch is also counted)
    nreads = get_read_count(fastqs)
    nbatches = (nreads + batch_size - 1)//batch_size
    print("Creating %d batches of %d reads" % (nbatches,
                                               batch_size))
    # Check if fastqs are compressed
    gzipped = fastqs[0].endswith('.gz')
    if gzipped:
        batch_cmd = Command('zcat')
    else:
        batch_cmd = Command('cat')
    # Get the read number
    read_number = get_read_number(fastqs[0])
    suffix = ".r%s.fastq" % read_number
    # Build and run the batching command
    # Each Fastq record occupies 4 lines, hence the line count
    # passed to 'split' is 4 times the batch size
    batch_cmd.add_args(*fastqs)
    batch_cmd.add_args('|',
                       'split',
                       '-l',batch_size*4,
                       '-d',
                       '-a',3,
                       '--additional-suffix=%s' % suffix,
                       '-',
                       os.path.join(out_dir,"%s.B" % basename))
    batch_script = os.path.join(out_dir,"batch.sh")
    batch_cmd.make_wrapper_script("/bin/bash",
                                  batch_script)
    # Check for successful exit code
    retcode = Command("/bin/bash",
                      batch_script).run_subprocess(
                          working_dir=out_dir)
    if retcode != 0:
        raise Exception("Batching failed: exit code %s" % retcode)
    print("Batching completed")
    # Collect and return the batched Fastq names
    # (these mirror the names generated by 'split -d -a 3')
    batched_fastqs = [os.path.join(out_dir,
                                   "%s.B%03d%s" % (basename,i,suffix))
                      for i in range(nbatches)]
    return batched_fastqs
def test_get_read_number_r2(self):
    """get_read_number: check read number for R2 Fastq file

    The read number reported for the R2 Fastq fixture must
    be the integer 2.
    """
    # Look up the read number first, then compare it against
    # the expected value
    read_number = get_read_number(self.fastq_r2)
    self.assertEqual(read_number,2)