Exemple #1
0
    def run(self, assets, parameters=tuple()):
        """Align reads with GSNAP and store the alignments as BAM.

        The aligner is started with ``-d reference -A sam -D <index name>``
        (plus ``--gunzip`` for gzip-compressed FASTQ) and its standard output
        is piped into ``samtools view -bS -``, whose output is written to
        ``assets.target.alignment.name``.

        :param assets: a :class:`core.AssetsStep`. The attributes read are
            ``source.indexfilepattern``, ``source.read1``, ``source.read2``
            (``None`` for single reads) and ``target.alignment``.
        :param parameters: extra command-line arguments, inserted right after
            the executable.
        :returns: a tuple ``(cmd, returncode)``. ``cmd`` is a list describing
            the equivalent shell pipeline (it is only used for logging, the
            two processes are chained directly) and ``returncode`` is the
            return code of the ``samtools`` process.
        :raises ValueError: if only one of the two FASTQ files of a pair is
            gzip-compressed.
        """
        # samtools is a dependency. In the absence of features in RRT that would handle secondary dependencies,
        # there is a simple assert here to prevent cryptic failures if the software is not on the system.
        samtools = 'samtools'
        assert environment.Executable.ispresent(self._execpath)
        assert environment.Executable.ispresent(samtools)

        assert isinstance(assets, core.AssetsStep), "The parameter 'assets' must be an %s" % core.AssetsStep.__name__

        source = assets.source
        # directory for the index
        basename_index = source.indexfilepattern.name

        # different strings for the command line
        gunzip = bool(source.read1.iscompressed)
        if source.read2 is None:
            # single reads
            cmd_sub = (source.read1.name, )
        else:
            cmd_sub = (source.read1.name, source.read2.name)
            # '--gunzip' applies to the whole command line, so a pair must be
            # consistently compressed, whichever of the two files differs.
            if gunzip != bool(source.read2.iscompressed):
                raise ValueError('With GSNAP FASTQ can either be all gzip-compressed or all gzip-uncompressed.')

        # build command line
        cmd_align = [self._execpath, ]
        cmd_align.extend(parameters)
        cmd_align.extend((
            '-d', 'reference', #FIXME: move to parameters + way to have default parameters clearly exposed to higher-level layers
            '-A', 'sam',
            '-D', basename_index))
        if gunzip:
            cmd_align.append('--gunzip')
        cmd_align.extend(cmd_sub)

        cmd_sam2bam = (samtools, 'view', '-bS', '-')

        # shell-like description of the pipeline, for logging and reporting
        cmd = cmd_align + ['|', ] + list(cmd_sam2bam) + ['>', ] + [assets.target.alignment.name, ]
        logger.debug(subprocess.list2cmdline(cmd))
        # the target receives BAM (binary) data written by the child process
        with open(os.devnull, "w") as fnull, \
             open(assets.target.alignment.name, "wb") as fh_out, \
             core.Popen(cmd_align, stdout=subprocess.PIPE, stderr=fnull) as p_align:
            # we are not done yet: the output should be BAM, not SAM
            with core.Popen(cmd_sam2bam,
                            stdin=p_align.stdout,
                            stdout=fh_out,
                            stderr=fnull) as p_sam2bam:
                # drop our copy of the pipe so that the aligner is notified
                # if samtools exits early
                p_align.stdout.close()
                p_sam2bam.communicate()
                returncode = p_sam2bam.returncode
        return (cmd, returncode)
Exemple #2
0
    def run(self, assets, parameters=tuple()):
        """Align reads with ``bwa mem`` and store the alignments as BAM.

        The standard output of the aligner is piped into
        ``samtools view -bS -``, whose output is written to
        ``assets.target.alignment.name``.

        :param assets: assets for the step. The attributes read are
            ``source.indexfilepattern``, ``source.read1``, ``source.read2``
            (``None`` for single reads) and ``target.alignment``.
        :param parameters: extra command-line arguments, inserted between
            ``mem`` and the index name.
        :returns: a tuple ``(cmd, returncode)``. ``cmd`` is a list describing
            the equivalent shell pipeline (only used for logging) and
            ``returncode`` is the return code of the ``samtools`` process.
        :raises ValueError: if no index file is found or if ``read1`` is
            missing or not a :class:`core.SavedEntityAbstract`.
        :raises NotImplementedError: if any FASTQ file is gzip-compressed.
        """
        # samtools is a dependency. In the absence of features in RRT that would handle secondary dependencies,
        # there is a simple assert here to prevent cryptic failures if the software is not on the system.
        samtools = 'samtools'
        assert environment.Executable.ispresent(self._execpath)
        assert environment.Executable.ispresent(samtools)

        source = assets.source
        indexfiles = tuple(name for cls, name in source.indexfilepattern.iterlistfiles())
        if not indexfiles:
            raise ValueError("No BWA index files in %s" % source.indexfilepattern)

        # different strings for the command line
        if source.read1 is None or not isinstance(source.read1, core.SavedEntityAbstract):
            raise ValueError("Incorrect value %s for read1" % source.read1)
        if source.read2 is None:
            # single reads
            reads = (source.read1.name, )
        else:
            reads = (source.read1.name, source.read2.name)

        # Gzip-compression not supported by BWA
        if source.read1.iscompressed or (source.read2 is not None and source.read2.iscompressed):
            raise NotImplementedError("BWA does not support gzip-compressed FASTQ file. On-the-fly might eventually be implemented in the future.")

        # build command line
        cmd_align = ['%s' % self._execpath, 'mem']
        cmd_align.extend(parameters)
        cmd_align.append(source.indexfilepattern.name)
        cmd_align.extend(reads)

        cmd_sam2bam = (samtools, 'view', '-bS', '-')

        # shell-like description of the pipeline, for logging and reporting
        cmd = cmd_align + ['|', ] + list(cmd_sam2bam) + ['>', ] + [assets.target.alignment.name, ]
        logger.debug(subprocess.list2cmdline(cmd))
        # the target receives BAM (binary) data written by the child process
        with open(os.devnull, "w") as fnull, \
             open(assets.target.alignment.name, "wb") as fh_out, \
             core.Popen(cmd_align, stdout=subprocess.PIPE, stderr=fnull) as p_align:
            # we are not done yet: the output should be BAM, not SAM
            with core.Popen(cmd_sam2bam,
                            stdin=p_align.stdout,
                            stdout=fh_out,
                            stderr=fnull) as p_sam2bam:
                # drop our copy of the pipe so that the aligner is notified
                # if samtools exits early
                p_align.stdout.close()
                p_sam2bam.communicate()
                returncode = p_sam2bam.returncode
        return (cmd, returncode)
def _split_mergedpairs(merged_fn, read1_fn, read2_fn):
    # split
    # (inspired by https://gist.github.com/nathanhaigh/3521724)

    inconsistent = False
    if read1_fn.endswith('.gz'):
        if read2_fn.endswith('.gz'):
            pipe_zip = '| gzip '
        else:
            inconsistent = True
    elif read2_fn.endswith('.gz'):
        inconsistent = True
    else:
        pipe_zip = ''
    if inconsistent:
        # this is presumably a mistake. do not allow it
        raise ValueError('The output files can be either both with the extention .gz, or both without.')
        
    env = os.environ.copy()
    env['mergedreads'] = subprocess.list2cmdline((merged_fn,))
    env['read1'] = subprocess.list2cmdline((read1_fn, ))
    env['read2'] = subprocess.list2cmdline((read2_fn,))
    cmd_str = 'paste - - - - - - - - < $mergedreads | ' + \
              'tee >(cut -f 1-4 | tr "\\t" "\\n" %(pipe_zip)s > $read1) | cut -f 5-8 | tr "\\t" "\\n" %(pipe_zip)s > $read2' % locals()
    with core.Popen(('/bin/bash', '-c', 
                     cmd_str),
                    env=env) as proc:
        proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError()
    def packageversion(self, name):
        """ Return the version of an installed R package.

        R is first asked to load the package; if it answers ``FALSE``
        (generally because the package is not installed), an exception
        :class:`MissingSoftware` is raised. Otherwise R is asked for the
        package version, which is logged and returned.

        :param name: name of the package

        :returns: the first line printed by R for the version query,
            right-stripped (as read from the child's standard output).
        """
        r_load = """res <- suppressMessages(suppressWarnings(require("%s", quietly=TRUE))); cat(res)""" % name
        loaded = subprocess.check_output(
            (self.path, '--slave', '--no-restore', '-e', r_load)).rstrip()
        if loaded == b'FALSE':
            raise MissingSoftware("The R package '%s' is not installed" % name)

        r_version = 'res <- suppressMessages(packageVersion("%s")); cat(as.character(res))' % name
        with open(os.devnull, "w") as devnull, \
             core.Popen((self.path, '--slave', '-e', r_version),
                        stdout=subprocess.PIPE,
                        stderr=devnull) as proc:
            version = proc.stdout.readline().rstrip()
            # consume whatever follows the first line
            proc.stdout.read()

        logger.info('R package "%s" version is %s' % (name, version))
        return version
 def getversion(self):
     """ Return the version reported by the R executable.

     The executable is only queried (``--version``) when no version is
     cached yet. The cached value is the first line of output with the
     leading 'R version ' removed. The version is logged at every call.
     """
     if self.__version is None:
         query = (self.path, '--version')
         with core.Popen(query, stdout=subprocess.PIPE) as proc:
             first_line = proc.stdout.readline()
             self.__version = re.sub(b'R version (.+)', b'\\1', first_line)
     logger.info('R version is %s' % self.__version)
     return self.__version
Exemple #6
0
 def version(self):
     """ Return the version of the executable, querying it at most once.

     The executable is run without arguments and the first line it writes
     to standard error is matched against ``<...> version <X> <...>``.
     The result is cached in ``self._version``.

     :returns: the version, as captured from the child's output
     :raises UnifexError: if the executable cannot be started
     :raises RuntimeError: if the first line written to standard error
         does not contain a version number
     """
     if self._version is None:
         cmd = [self._execpath]
         with open(os.devnull, "w") as fnull:
             try:
                 logger.debug(subprocess.list2cmdline(cmd))
                 proc = core.Popen(cmd,
                                   stdout=fnull,
                                   stderr=subprocess.PIPE)
             except OSError as ose:
                 raise UnifexError("""Command: %s
                 %s""" % (' '.join(cmd), ose))
         # the context manager releases the pipe and the child process
         with proc:
             # only the first line is of interest; it may be missing
             row = next(proc.stderr, b'')
         m = re.match(b'^.+ version ([^ ]+) .+$', row)
         if m is None:
             raise RuntimeError('The version number could not be extracted.')
         self._version = m.groups()[0]
     return self._version
Exemple #7
0
def samtools_getversion(execpath):
    """ Return the version of 'samtools'.

    The executable is run without arguments and its standard error is
    scanned for the first line starting with 'Version: '.

    :param execpath: path to (or name of) the samtools executable
    :returns: the version found on that line
    :raises RuntimeError: if no such line is found
    """
    cmd = [execpath]
    logging.debug(cmd)
    version_line = re.compile(b'^Version: ([^ \n]+).*$')
    with core.Popen(cmd, stderr=subprocess.PIPE) as proc:
        for line in proc.stderr:
            hit = version_line.match(line)
            if hit is not None:
                return hit.group(1)
    raise RuntimeError('Could not find the version number.')
Exemple #8
0
 def version(self):
     """ Return the version of the executable, querying it at most once.

     The executable is run without arguments and its standard output is
     scanned for the first line ending with ``version <digits and dashes>.``
     The result is cached in ``self._version``.

     :returns: the version, as captured from the child's output
     :raises UnifexError: if the executable cannot be started
     :raises Exception: if no line with a version number is found
     """
     if self._version is None:
         cmd = [self._execpath]
         with open(os.devnull, "w") as fnull:
             try:
                 logger.debug(subprocess.list2cmdline(cmd))
                 proc = core.Popen(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=fnull)
             except OSError as ose:
                 raise UnifexError("""Command: %s
                 %s""" % (' '.join(cmd), ose))
         # raw literal: '\.' is not a valid escape in a regular bytes literal
         pattern = re.compile(br'^.+? version ([0-9-]+)\.$')
         m = None
         # the context manager releases the pipe and the child process
         with proc:
             for row in proc.stdout:
                 m = pattern.match(row)
                 if m is not None:
                     break
         if m is None:
             raise Exception('The version number could not be extracted.')
         self._version = m.groups()[0]
     return self._version
    def run(self,
            code,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE):
        """ Run arbitrary R code in a subprocess.

        The code is encoded as ASCII and sent to ``<self.path> --slave``
        through its standard input.

        :param code: a string with R code
        :param stdin: passed to the process constructor as `stdin`
        :param stdout: passed to the process constructor as `stdout`
        :param stderr: passed to the process constructor as `stderr`

        Returns the return code for the child process. When it is not zero,
        what was captured from the standard error is logged as a warning.
        """
        cmd = (
            self.path,
            '--slave',
        )
        with core.Popen(cmd, stdin=stdin,
                        stdout=stdout,
                        stderr=stderr) as proc:
            # the captured standard output is not used
            _, captured_err = proc.communicate(input=code.encode('ascii'))
            if proc.returncode != 0:
                logger.warning(captured_err)
            return proc.returncode
Exemple #10
0
    def run(self, assets, parameters=tuple()):
        """Align reads with bowtie and store the alignments as BAM.

        The aligner is run with ``--sam <index name> -q`` followed by the
        FASTQ file(s); its standard output is piped into
        ``samtools view -bS -``, whose output is written to
        ``assets.target.alignment.name``.

        :param assets: assets for the step. The attributes read are
            ``source.indexfilepattern``, ``source.read1``, ``source.read2``
            (``None`` for single reads) and ``target.alignment``.
        :param parameters: extra command-line arguments, inserted right
            after the executable.
        :returns: a tuple ``(cmd, returncode)``. ``cmd`` is a list describing
            the equivalent shell pipeline (only used for logging) and
            ``returncode`` is the return code of the ``samtools`` process.
        :raises ValueError: if no index file is found or if ``read1`` is
            missing or not a :class:`core.SavedEntityAbstract`.
        :raises NotImplementedError: if any FASTQ file is gzip-compressed.
        """
        # samtools is a secondary dependency; fail early (and clearly) when
        # either executable is missing from the system.
        samtools = 'samtools'
        assert environment.Executable.ispresent(self._execpath)
        assert environment.Executable.ispresent(samtools)

        source = assets.source
        index = source.indexfilepattern
        indexfiles = tuple(name for cls, name in index.iterlistfiles())
        if not indexfiles:
            raise ValueError("No bowtie index files in %s" % source.indexfilepattern)

        read1 = source.read1
        if read1 is None or not isinstance(read1, core.SavedEntityAbstract):
            raise ValueError("Incorrect value %s for read1" % source.read1)

        # FASTQ arguments: one positional file for single reads,
        # '-1'/'-2' for pairs
        paired = source.read2 is not None
        if paired:
            cmd_sub = ('-1', source.read1.name, '-2', source.read2.name)
        else:
            cmd_sub = (source.read1.name, )

        # Gzip-compression not supported by bowtie1
        if source.read1.iscompressed or (paired and source.read2.iscompressed):
            raise NotImplementedError("Bowtie(1) does not support gzip-compressed FASTQ file. On-the-fly might eventually be implemented in the future.")

        # Command line for the aligner.
        #    --sam : otherwise a SAM-like format is produced
        #    -q    : FASTQ in input
        # NOTE: writing unaligned reads to a separate file ('--un') and
        # merging it back with 'samtools merge' is currently disabled.
        cmd_align = (['%s' % self._execpath]
                     + list(parameters)
                     + ['--sam', source.indexfilepattern.name, '-q']
                     + list(cmd_sub))

        cmd_sam2bam = (samtools, 'view', '-bS', '-')

        aligned_fn = assets.target.alignment.name
        # shell-like description of the pipeline, for logging and reporting
        cmd = cmd_align + ['|'] + list(cmd_sam2bam) + ['>', aligned_fn]
        logger.debug(subprocess.list2cmdline(cmd))
        # The aligner emits SAM; samtools converts it to BAM on the fly.
        with open(os.devnull, "w") as fnull, \
             open(aligned_fn, "w") as fh_out, \
             core.Popen(cmd_align, stdout=subprocess.PIPE, stderr=fnull) as p_align, \
             core.Popen(cmd_sam2bam,
                        stdin=p_align.stdout,
                        stdout=fh_out,
                        stderr=fnull) as p_sam2bam:
            p_align.stdout.close()
            p_sam2bam.communicate()
            returncode = p_sam2bam.returncode

        return (cmd, returncode)
    def run(self, assets, parameters=tuple()):
        """Count reads with htseq-count and write the counts as CSV.

        The aligned reads are converted to SAM with ``samtools view`` and
        piped into the counting executable. The tab-delimited output is
        rewritten to ``assets.target.counts.name`` as CSV, preceded by a
        header row ``ID,count``.

        If ``assets.source.alignedreads`` is not a
        :class:`BAMFileSortedByID`, it is first sorted into a temporary BAM
        file created next to the target; that file is removed before
        returning, whether the step succeeds or fails.

        :param assets: assets for the step. The attributes read are
            ``source.alignedreads``, ``source.annotationfile`` and
            ``target.counts``.
        :param parameters: extra command-line arguments, inserted right
            after the executable.
        :returns: a tuple ``(cmd, returncode)``. ``cmd`` is a list describing
            the equivalent shell pipeline (only used for logging).
        :raises subprocess.CalledProcessError: if the counting process
            returns a non-zero code.
        """
        # FIXME: shouldn't strandedness be a better part of the model ?
        source = assets.source
        sortedbam = source.alignedreads
        # name of the temporary sorted BAM file, if one has to be created
        tmp_sorted_fn = None
        try:
            if not isinstance(source.alignedreads, BAMFileSortedByID):
                # htseq-count requires sorted entries
                warnings.warn(("The source asset '%s' should ideally be sorted by read IDs. " +\
                               "We are sorting the file; use explicitly a '%s' rather than a '%s' "+\
                               "for better performances, as well as for reproducibility issues "+\
                               "(the sorting will use whatever 'samtools` is first found in the PATH)") \
                              % ("alignedreads", BAMFileSortedByID.__name__, BAMFile.__name__))
                output_dir = os.path.dirname(assets.target.counts.name)
                # temp file name for the sorted output
                sortedbam_fh = tempfile.NamedTemporaryFile(dir=output_dir,
                                                           suffix=".bam",
                                                           delete=False)
                tmp_sorted_fn = sortedbam_fh.name
                # only the name is needed: the sorter writes the file itself
                sortedbam_fh.close()
                # -- sort
                sorter = SamtoolsSorterByID()
                sorter_assets = sorter.Assets(
                    sorter.Assets.Source(source.alignedreads),
                    sorter.Assets.Target(BAMFile(tmp_sorted_fn)))
                sorter.run(sorter_assets)
                # sanity check:
                if os.stat(sorter_assets.target.sortedbam.name).st_size == 0:
                    warnings.warn('The sorted BAM file is empty.')
                sortedbam = sorter_assets.target.sortedbam

            # BAM to SAM
            cmd_bam2sam = ['samtools', 'view', sortedbam.name]

            # build command line
            cmd_count = [self._execpath, ]
            cmd_count.extend(parameters)
            cmd_count.extend(['-', source.annotationfile.name])
            # shell-like description of the pipeline, for logging and reporting
            cmd = cmd_bam2sam + ['|', ] + cmd_count

            logger.debug(subprocess.list2cmdline(cmd))
            with open(os.devnull, "w") as fnull, \
                 open(assets.target.counts.name, 'w') as output_fh:
                csv_w = csv.writer(output_fh)
                # HTSeq-count does not use column names in its output, unfortunately,
                # so we correct that
                csv_w.writerow(['ID', 'count'])
                with core.Popen(cmd_bam2sam, stdout=subprocess.PIPE,
                                stderr=fnull) as p_bam2sam:
                    with p_bam2sam.stdout, core.Popen(
                            cmd_count,
                            stdin=p_bam2sam.stdout,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE) as p_htseq:
                        p_bam2sam.stdout.close()
                        # read the output of HTSeq line-per-line
                        csv_r = csv.reader(p_htseq.stdout, delimiter='\t')
                        for row in csv_r:
                            csv_w.writerow(row)
                        # collect what is left, in particular the standard error
                        stdout, stderr = p_htseq.communicate()
                    if p_htseq.returncode != 0:
                        logger.error(subprocess.list2cmdline(cmd),
                                     extra={'stderr': stderr})
                        raise subprocess.CalledProcessError(
                            p_htseq.returncode, cmd, None)
            return (cmd, p_htseq.returncode)
        finally:
            # single cleanup point for the temporary sorted BAM file
            if tmp_sorted_fn is not None:
                os.unlink(tmp_sorted_fn)