def run(self, assets, parameters=tuple()):
    """
    Align reads with GSNAP and write the alignment as BAM.

    The SAM output of the aligner is piped into `samtools view -bS -`
    and the result is written to `assets.target.alignment.name`.

    :param assets: a :class:`core.AssetsStep`. The attributes
                   `source.indexfilepattern`, `source.read1`,
                   `source.read2` (`None` for single reads) and
                   `target.alignment` are used.
    :param parameters: iterable of extra command-line arguments, inserted
                       right after the executable.

    :returns: a tuple `(cmd, returncode)`. `cmd` is a list describing the
              equivalent shell pipeline (it is logged, not executed as
              such) and `returncode` is the exit status of samtools.
    :raises ValueError: with paired reads, if one FASTQ file is
                        gzip-compressed and the other one is not.
    """
    # samtools is a dependency. In the absence of features in RRT that would
    # handle secondary dependencies, there is a simple assert here to prevent
    # cryptic failures if the software is not on the system.
    samtools = 'samtools'
    assert environment.Executable.ispresent(self._execpath)
    assert environment.Executable.ispresent(samtools)
    assert isinstance(assets, core.AssetsStep), \
        "The parameter 'assets' must be an %s" % core.AssetsStep.__name__

    source = assets.source
    # directory for the index
    basename_index = source.indexfilepattern.name

    # different strings for the command line
    read1 = source.read1
    read2 = source.read2
    gunzip = bool(read1.iscompressed)
    if read2 is None:
        # single reads
        cmd_sub = (read1.name, )
    else:
        cmd_sub = (read1.name, read2.name)
        # The check is symmetric: a compressed read2 with an uncompressed
        # read1 is just as invalid as the other way around.
        if gunzip != bool(read2.iscompressed):
            raise ValueError('With GSNAP FASTQ can either be all gzip-compressed or all gzip-uncompressed.')

    # build command line
    cmd_align = [self._execpath, ]
    cmd_align.extend(parameters)
    cmd_align.extend((
        '-d', 'reference',  # FIXME: move to parameters + way to have default parameters clearly exposed to higher-level layers
        '-A', 'sam',
        '-D', basename_index))
    if gunzip:
        cmd_align.append('--gunzip')
    cmd_align.extend(cmd_sub)

    cmd_sam2bam = (samtools, 'view', '-bS', '-')
    # Human-readable description of the pipeline ('|' and '>' are only
    # there for the log and the returned value).
    cmd = cmd_align + ['|', ] + list(cmd_sam2bam) + ['>', ] + [assets.target.alignment.name, ]
    logger.debug(subprocess.list2cmdline(cmd))

    with open(os.devnull, "w") as fnull, \
         open(assets.target.alignment.name, "w") as fh_out, \
         core.Popen(cmd_align, stdout=subprocess.PIPE, stderr=fnull) as p_align:
        # we are not done yet: the output should be BAM, not SAM
        with core.Popen(cmd_sam2bam,
                        stdin=p_align.stdout,
                        stdout=fh_out,
                        stderr=fnull) as p_sam2bam:
            # Drop the parent's copy of the pipe so the aligner is notified
            # if samtools exits early.
            p_align.stdout.close()
            p_sam2bam.communicate()
            # NOTE(review): only the exit status of samtools is reported;
            # a failure of the aligner itself is not reflected here.
            returncode = p_sam2bam.returncode
    return (cmd, returncode)
def run(self, assets, parameters=tuple()):
    """
    Align reads with `bwa mem` and write the alignment as BAM.

    The SAM output of the aligner is piped into `samtools view -bS -`
    and the result is written to `assets.target.alignment.name`.

    :param assets: object with the attributes `source.indexfilepattern`,
                   `source.read1`, `source.read2` (`None` for single
                   reads) and `target.alignment`.
    :param parameters: iterable of extra command-line arguments, inserted
                       between the subcommand `mem` and the file names.

    :returns: a tuple `(cmd, returncode)`. `cmd` is a list describing the
              equivalent shell pipeline (it is logged, not executed as
              such) and `returncode` is the exit status of samtools.
    :raises ValueError: if no index file is found, or if `read1` is not
                        a :class:`core.SavedEntityAbstract`.
    :raises NotImplementedError: if any of the FASTQ files is
                                 gzip-compressed.
    """
    # samtools is a dependency. In the absence of features in RRT that would
    # handle secondary dependencies, there is a simple assert here to prevent
    # cryptic failures if the software is not on the system.
    samtools = 'samtools'
    assert environment.Executable.ispresent(self._execpath)
    assert environment.Executable.ispresent(samtools)

    source = assets.source
    indexfiles = tuple(name for cls, name in source.indexfilepattern.iterlistfiles())
    if not indexfiles:
        raise ValueError("No BWA index files in %s" % source.indexfilepattern)

    # different strings for the command line
    if source.read1 is None or not isinstance(source.read1, core.SavedEntityAbstract):
        raise ValueError("Incorrect value %s for read1" % source.read1)
    if source.read2 is None:
        # single reads
        cmd_sub = ('mem', (source.indexfilepattern.name, source.read1.name))
    else:
        cmd_sub = ('mem', (source.indexfilepattern.name, source.read1.name, source.read2.name))

    # Gzip-compression not supported by BWA
    if source.read1.iscompressed or (source.read2 is not None and source.read2.iscompressed):
        raise NotImplementedError("BWA does not support gzip-compressed FASTQ file. On-the-fly might eventually be implemented in the future.")

    # build command line
    cmd_align = ['%s' % self._execpath]
    cmd_align.append(cmd_sub[0])
    cmd_align.extend(parameters)
    cmd_align.extend(cmd_sub[1])

    cmd_sam2bam = (samtools, 'view', '-bS', '-')
    # Human-readable description of the pipeline ('|' and '>' are only
    # there for the log and the returned value).
    cmd = cmd_align + ['|', ] + list(cmd_sam2bam) + ['>', ] + [assets.target.alignment.name, ]
    logger.debug(subprocess.list2cmdline(cmd))

    with open(os.devnull, "w") as fnull, \
         open(assets.target.alignment.name, "w") as fh_out, \
         core.Popen(cmd_align, stdout=subprocess.PIPE, stderr=fnull) as p_align:
        # we are not done yet: the output should be BAM, not SAM
        with core.Popen(cmd_sam2bam,
                        stdin=p_align.stdout,
                        stdout=fh_out,
                        stderr=fnull) as p_sam2bam:
            # Drop the parent's copy of the pipe so the aligner is notified
            # if samtools exits early.
            p_align.stdout.close()
            p_sam2bam.communicate()
            # NOTE(review): only the exit status of samtools is reported;
            # a failure of the aligner itself is not reflected here.
            returncode = p_sam2bam.returncode
    return (cmd, returncode)
def _split_mergedpairs(merged_fn, read1_fn, read2_fn): # split # (inspired by https://gist.github.com/nathanhaigh/3521724) inconsistent = False if read1_fn.endswith('.gz'): if read2_fn.endswith('.gz'): pipe_zip = '| gzip ' else: inconsistent = True elif read2_fn.endswith('.gz'): inconsistent = True else: pipe_zip = '' if inconsistent: # this is presumably a mistake. do not allow it raise ValueError('The output files can be either both with the extention .gz, or both without.') env = os.environ.copy() env['mergedreads'] = subprocess.list2cmdline((merged_fn,)) env['read1'] = subprocess.list2cmdline((read1_fn, )) env['read2'] = subprocess.list2cmdline((read2_fn,)) cmd_str = 'paste - - - - - - - - < $mergedreads | ' + \ 'tee >(cut -f 1-4 | tr "\\t" "\\n" %(pipe_zip)s > $read1) | cut -f 5-8 | tr "\\t" "\\n" %(pipe_zip)s > $read2' % locals() with core.Popen(('/bin/bash', '-c', cmd_str), env=env) as proc: proc.communicate() if proc.returncode != 0: raise RuntimeError()
def packageversion(self, name):
    """
    Return the version of an installed R package.

    R is first asked to load the package; if R answers `FALSE` an exception
    :class:`MissingSoftware` is raised. A second R call then prints the
    package version.

    :param name: name of the package

    :returns: the first line printed by R for the version, as bytes with
              trailing whitespace removed
    """
    probe = """res <- suppressMessages(suppressWarnings(require("%s", quietly=TRUE))); cat(res)"""
    probe_cmd = (self.path, '--slave', '--no-restore', '-e', probe % name)
    answer = subprocess.check_output(probe_cmd)
    if answer.rstrip() == b'FALSE':
        raise MissingSoftware("The R package '%s' is not installed" % name)

    query = 'res <- suppressMessages(packageVersion("%s")); cat(as.character(res))'
    query_cmd = (self.path, '--slave', '-e', query % name)
    with open(os.devnull, "w") as fnull:
        with core.Popen(query_cmd,
                        stdout=subprocess.PIPE,
                        stderr=fnull) as proc:
            version = proc.stdout.readline().rstrip()
            # read (and discard) whatever else R printed
            proc.stdout.read()
    logger.info('R package "%s" version is %s' % (name, version))
    return version
def getversion(self):
    """
    Return the version of R, as reported by `--version`.

    The executable is only called the first time; the result is cached on
    the instance. The value is the first line of the output (bytes) with
    the leading 'R version ' removed.
    """
    if self.__version is not None:
        return self.__version
    with core.Popen((self.path, '--version'),
                    stdout=subprocess.PIPE) as proc:
        first_line = proc.stdout.readline()
    self.__version = re.sub(b'R version (.+)', b'\\1', first_line)
    logger.info('R version is %s' % self.__version)
    return self.__version
def version(self):
    """
    Return the version of the executable `self._execpath`.

    The executable is run without arguments and the version is parsed from
    the first line it writes to stderr (pattern '<text> version <X> <text>').
    The result is cached in `self._version`, so the executable is only
    called once.

    :returns: the version, as bytes
    :raises UnifexError: if the executable cannot be started
    :raises RuntimeError: if the first line of stderr does not contain
                          a version number
    """
    if self._version is None:
        cmd = [self._execpath]
        logger.debug(subprocess.list2cmdline(cmd))
        with open(os.devnull, "w") as fnull:
            try:
                proc = core.Popen(cmd, stdout=fnull, stderr=subprocess.PIPE)
            except OSError as ose:
                raise UnifexError("""Command: %s %s""" % (' '.join(cmd), ose))
            # The context manager makes sure the pipe is closed and the
            # child process is waited for.
            with proc:
                # default value: the process may write nothing to stderr
                row = next(proc.stderr, b'')
        m = re.match(b'^.+ version ([^ ]+) .+$', row)
        if m is None:
            raise RuntimeError('Could not find the version number.')
        self._version = m.groups()[0]
    return self._version
def samtools_getversion(execpath):
    """
    Return the version of 'samtools'.

    The executable is run without arguments and its stderr is scanned for
    the first line starting with 'Version: '.

    :param execpath: path to the samtools executable

    :returns: the version, as bytes
    :raises RuntimeError: if no such line is found
    """
    cmd = [execpath, ]
    logging.debug(cmd)
    version_line = re.compile(b'^Version: ([^ \n]+).*$')
    with core.Popen(cmd, stderr=subprocess.PIPE) as proc:
        for row in proc.stderr:
            found = version_line.match(row)
            if found is not None:
                # stop reading at the first hit
                break
        else:
            found = None
    if found is None:
        raise RuntimeError('Could not find the version number.')
    return found.group(1)
def version(self):
    """
    Return the version of the executable `self._execpath`.

    The executable is run without arguments and its stdout is scanned for
    the first line matching '<text> version <digits and dashes>.'.
    The result is cached in `self._version`, so the executable is only
    called once.

    :returns: the version, as bytes
    :raises UnifexError: if the executable cannot be started
    :raises Exception: if no line of the output contains a version number
    """
    if self._version is None:
        cmd = [self._execpath]
        logger.debug(subprocess.list2cmdline(cmd))
        m = None
        with open(os.devnull, "w") as fnull:
            try:
                proc = core.Popen(cmd, stdout=subprocess.PIPE, stderr=fnull)
            except OSError as ose:
                raise UnifexError("""Command: %s %s""" % (' '.join(cmd), ose))
            # The context manager makes sure the pipe is closed and the
            # child process is waited for.
            with proc:
                for row in proc.stdout:
                    # raw literal: '\.' is a regex escape, not a bytes escape
                    m = re.match(br'^.+? version ([0-9-]+)\.$', row)
                    if m is not None:
                        break
        if m is None:
            raise Exception('The version number could not be extracted.')
        self._version = m.groups()[0]
    return self._version
def run(self, code, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """
    Run arbitrary R code in a subprocess.

    The code is sent to the standard input of `R --slave`. If R exits with
    a non-zero status, whatever was captured from its stderr is logged as
    a warning.

    :param code: a string with R code (it is encoded as ASCII)
    :param stdin: passed to the process constructor
    :param stdout: passed to the process constructor
    :param stderr: passed to the process constructor

    :returns: the return code for the child process.
    """
    cmd = (self.path, '--slave', )
    with core.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr) as proc:
        # distinct names: do not rebind the parameters stdout / stderr
        _, captured_err = proc.communicate(input=code.encode('ascii'))
        if proc.returncode != 0:
            logger.warning(captured_err)
    return proc.returncode
def run(self, assets, parameters=tuple()):
    """
    Align reads with bowtie and write the alignment as BAM.

    The SAM output of the aligner (option `--sam`) is piped into
    `samtools view -bS -` and the result is written to
    `assets.target.alignment.name`.

    :param assets: object with the attributes `source.indexfilepattern`,
                   `source.read1`, `source.read2` (`None` for single
                   reads) and `target.alignment`.
    :param parameters: iterable of extra command-line arguments, inserted
                       right after the executable.

    :returns: a tuple `(cmd, returncode)`. `cmd` is a list describing the
              equivalent shell pipeline (it is logged, not executed as
              such) and `returncode` is the exit status of samtools.
    :raises ValueError: if no index file is found, or if `read1` is not
                        a :class:`core.SavedEntityAbstract`.
    :raises NotImplementedError: if any of the FASTQ files is
                                 gzip-compressed.
    """
    # samtools is a secondary dependency: fail early, rather than
    # cryptically, if one of the executables is not on the system.
    samtools = 'samtools'
    assert environment.Executable.ispresent(self._execpath)
    assert environment.Executable.ispresent(samtools)

    source = assets.source
    index_names = tuple(fn for _, fn in source.indexfilepattern.iterlistfiles())
    if not index_names:
        raise ValueError("No bowtie index files in %s" % source.indexfilepattern)

    reads1 = source.read1
    if reads1 is None or not isinstance(reads1, core.SavedEntityAbstract):
        raise ValueError("Incorrect value %s for read1" % reads1)
    reads2 = source.read2
    paired = reads2 is not None
    if paired:
        reads_args = ('-1', reads1.name, '-2', reads2.name)
    else:
        reads_args = (reads1.name, )

    # Gzip-compression not supported by bowtie1
    if reads1.iscompressed or (paired and reads2.iscompressed):
        raise NotImplementedError("Bowtie(1) does not support gzip-compressed FASTQ file. On-the-fly might eventually be implemented in the future.")

    # Command line for the aligner.
    # (An earlier, disabled, variant also collected unaligned reads with
    # `--un` and merged them back with `samtools merge`.)
    aligner_argv = ['%s' % self._execpath]
    aligner_argv.extend(parameters)
    # --sam: otherwise a SAM-like format is produced; -q: FASTQ in input
    aligner_argv += ['--sam', source.indexfilepattern.name, '-q']
    aligner_argv.extend(reads_args)

    cmd_sam2bam = (samtools, 'view', '-bS', '-')
    aligned_fn = assets.target.alignment.name
    # description of the pipeline, for the log and the returned value
    cmd = aligner_argv + ['|'] + list(cmd_sam2bam) + ['>', aligned_fn]
    logger.debug(subprocess.list2cmdline(cmd))

    # the output should be BAM, not SAM: chain the aligner with samtools
    with open(os.devnull, "w") as fnull, \
         open(aligned_fn, "w") as fh_out, \
         core.Popen(aligner_argv,
                    stdout=subprocess.PIPE,
                    stderr=fnull) as p_align, \
         core.Popen(cmd_sam2bam,
                    stdin=p_align.stdout,
                    stdout=fh_out,
                    stderr=fnull) as p_sam2bam:
        p_align.stdout.close()
        p_sam2bam.communicate()
        returncode = p_sam2bam.returncode
    return (cmd, returncode)
def run(self, assets, parameters=tuple()):
    """
    Count reads per feature with htseq-count.

    The aligned reads are converted from BAM to SAM with `samtools view`
    and piped into the counting executable. Its tab-delimited output is
    rewritten as CSV, with a header row, in `assets.target.counts.name`.

    If `assets.source.alignedreads` is not a :class:`BAMFileSortedByID`,
    it is first sorted into a temporary BAM file created next to the
    output file; that temporary file is removed before returning, whether
    the step succeeds or not.

    :param assets: object with the attributes `source.alignedreads`,
                   `source.annotationfile` and `target.counts`.
    :param parameters: iterable of extra command-line arguments, inserted
                       right after the executable.

    :returns: a tuple `(cmd, returncode)`. `cmd` is a list describing the
              equivalent shell pipeline (it is logged, not executed as
              such).
    :raises subprocess.CalledProcessError: if the counting process exits
                                           with a non-zero status.
    """
    # FIXME: shouldn't strandedness be a better part of the model ?
    source = assets.source
    sortedbam = source.alignedreads
    # name of the temporary sorted BAM, if one gets created
    tmp_sorted_fn = None
    try:
        if not isinstance(source.alignedreads, BAMFileSortedByID):
            # htseq-count requires sorted entries
            warnings.warn(("The source asset '%s' should ideally be sorted by read IDs. " +
                           "We are sorting the file; use explicitly a '%s' rather than a '%s' " +
                           "for better performances, as well as for reproducibility issues " +
                           "(the sorting will use whatever 'samtools` is first found in the PATH)")
                          % ("alignedreads", BAMFileSortedByID.__name__, BAMFile.__name__))
            output_dir = os.path.dirname(assets.target.counts.name)
            # Only the name of the temporary file is needed: the handle is
            # closed right away (delete=False keeps the file on disk).
            with tempfile.NamedTemporaryFile(dir=output_dir,
                                             suffix=".bam",
                                             delete=False) as sortedbam_fh:
                tmp_sorted_fn = sortedbam_fh.name
            # -- sort
            sorter = SamtoolsSorterByID()
            sorter_assets = sorter.Assets(
                sorter.Assets.Source(source.alignedreads),
                sorter.Assets.Target(BAMFile(tmp_sorted_fn)))
            sorter.run(sorter_assets)
            # sanity check:
            if os.stat(sorter_assets.target.sortedbam.name).st_size == 0:
                warnings.warn('The sorted BAM file is empty.')
            sortedbam = sorter_assets.target.sortedbam

        # BAM to SAM
        cmd_bam2sam = ['samtools', 'view', sortedbam.name]
        # build command line
        cmd_count = [self._execpath, ]
        cmd_count.extend(parameters)
        # '-': read the SAM records from the standard input
        cmd_count.extend(['-', source.annotationfile.name])
        # description of the pipeline, for the log and the returned value
        cmd = cmd_bam2sam + ['|', ] + cmd_count
        logger.debug(subprocess.list2cmdline(cmd))

        with open(os.devnull, "w") as fnull, \
             open(assets.target.counts.name, 'w') as output_fh:
            csv_w = csv.writer(output_fh)
            # HTSeq-count does not use column names in its output,
            # unfortunately, so we correct that
            csv_w.writerow(['ID', 'count'])
            with core.Popen(cmd_bam2sam,
                            stdout=subprocess.PIPE,
                            stderr=fnull) as p_bam2sam:
                with p_bam2sam.stdout, core.Popen(
                        cmd_count,
                        stdin=p_bam2sam.stdout,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE) as p_htseq:
                    # drop the parent's copy of the pipe between the two
                    # child processes
                    p_bam2sam.stdout.close()
                    # read the output of HTSeq line-per-line
                    # NOTE(review): csv.reader needs text lines; presumably
                    # core.Popen provides them - confirm.
                    csv_r = csv.reader(p_htseq.stdout, delimiter='\t')
                    for row in csv_r:
                        csv_w.writerow(row)
                    p_htseq.stdout.flush()
                    # NOTE(review): stderr is only read here, after stdout
                    # has been consumed.
                    _, stderr = p_htseq.communicate()
                if p_htseq.returncode != 0:
                    logger.error(subprocess.list2cmdline(cmd),
                                 extra={'stderr': stderr})
                    raise subprocess.CalledProcessError(
                        p_htseq.returncode, cmd, None)
        return (cmd, p_htseq.returncode)
    finally:
        # The temporary sorted BAM must not outlive this call, whatever
        # happened above.
        if tmp_sorted_fn is not None and os.path.exists(tmp_sorted_fn):
            os.unlink(tmp_sorted_fn)