Example #1
def execute(statement, **kwargs):
    '''execute a statement locally.

    This method implements the same parameter interpolation
    as the function :func:`run`.

    Arguments
    ---------
    statement : string
        Command line statement to run.

    Returns
    -------
    stdout : string
        Data sent to standard output by command
    stderr : string
        Data sent to standard error by command
    '''

    if not kwargs:
        kwargs = getCallerLocals()

    kwargs = dict(list(PARAMS.items()) + list(kwargs.items()))

    E.info("running %s" % (statement % kwargs))

    if "cwd" not in kwargs:
        cwd = PARAMS["workingdir"]
    else:
        cwd = kwargs["cwd"]

    # cleaning up of statement
    # remove new lines and superfluous spaces and tabs
    statement = " ".join(re.sub("\t+", " ", statement).split("\n")).strip()
    if statement.endswith(";"):
        statement = statement[:-1]

    os.environ.update(
        {'BASH_ENV': os.path.join(os.environ['HOME'], '.bashrc')})
    process = subprocess.Popen(statement % kwargs,
                               cwd=cwd,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               env=os.environ.copy())

    # process.stdin.close()
    stdout, stderr = process.communicate()

    if process.returncode != 0:
        raise OSError("Child was terminated by signal %i: \n"
                      "The stderr was: \n%s\n%s\n" %
                      (-process.returncode, stderr, statement))

    return stdout, stderr
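
The core pattern above is %-interpolation of a shell statement from a parameter dictionary followed by local execution through subprocess. A minimal stand-alone sketch of that pattern, with an illustrative dictionary standing in for PARAMS merged with the caller's locals:

# sketch of the interpolate-then-execute pattern used by execute();
# 'params' and the statement below are illustrative, not from the module
import subprocess

params = {"message": "hello world"}
statement = "echo %(message)s | tr a-z A-Z" % params
proc = subprocess.Popen(statement, shell=True,
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()
if proc.returncode != 0:
    raise OSError("child failed with return code %i:\n%s" %
                  (proc.returncode, stderr))
print(stdout.decode("utf-8"))   # HELLO WORLD
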
Example #2
def execute(statement, **kwargs):
    '''execute a statement locally.

    This method implements the same parameter interpolation
    as the function :func:`run`.

    Arguments
    ---------
    statement : string
        Command line statement to run.

    Returns
    -------
    stdout : string
        Data sent to standard output by command
    stderr : string
        Data sent to standard error by command
    '''

    if not kwargs:
        kwargs = getCallerLocals()

    kwargs = dict(list(PARAMS.items()) + list(kwargs.items()))

    E.info("running %s" % (statement % kwargs))

    if "cwd" not in kwargs:
        cwd = PARAMS["workingdir"]
    else:
        cwd = kwargs["cwd"]

    # cleaning up of statement
    # remove new lines and superfluous spaces and tabs
    statement = " ".join(re.sub("\t+", " ", statement).split("\n")).strip()
    if statement.endswith(";"):
        statement = statement[:-1]

    os.environ.update(
        {'BASH_ENV': os.path.join(os.environ['HOME'], '.bashrc')})
    process = subprocess.Popen(statement % kwargs,
                               cwd=cwd,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               env=os.environ.copy())

    # process.stdin.close()
    stdout, stderr = process.communicate()

    if process.returncode != 0:
        raise OSError(
            "Child was terminated by signal %i: \n"
            "The stderr was: \n%s\n%s\n" %
            (-process.returncode, stderr, statement))

    return stdout, stderr
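
The clean-up step in both variants collapses newlines and tabs into single spaces and drops a trailing semicolon before the statement is interpolated and executed. In isolation (the command below is illustrative):

# statement clean-up as performed above; the statement itself is made up
import re

statement = "zcat in.gz |\n\tgrep -v '^#' |\n\tgzip > out.gz;"
statement = " ".join(re.sub("\t+", " ", statement).split("\n")).strip()
if statement.endswith(";"):
    statement = statement[:-1]
print(statement)   # newlines and tabs collapsed, trailing ';' removed
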
Example #3
def run(**kwargs):
    """run a command line statement.

    The method runs a single or multiple statements on the cluster
    using drmaa. The cluster is bypassed if:

        * ``to_cluster`` is set to None in the context of the
          calling function.

        * ``--local`` has been specified on the command line
          and the option ``without_cluster`` has been set as
          a result.

        * no libdrmaa is present

        * the global session is not initialized (GLOBAL_SESSION is
          None)

    To decide which statement to run, the method works by examining
    the context of the calling function for a variable called
    ``statement`` or ``statements``.

    If ``statements`` is defined, multiple job scripts are created and
    sent to the cluster. If ``statement`` is defined, a single job
    script is created and sent to the cluster. Additionally, if
    ``job_array`` is defined, the single statement will be submitted
    as an array job.

    Troubleshooting:

       1. DRMAA creates sessions and there is a limited number
          of sessions available. If there are too many, or sessions
          become unavailable after failed jobs, use ``qconf -secl``
          to list sessions and ``qconf -kec #`` to delete sessions.

       2. Memory: 1G of free memory can be requested using the job_memory
          variable: ``job_memory = "1G"``
          If there are error messages like "no available queue", then the
          problem could be that a particular complex attribute has
          not been defined (the code should be ``hc`` for ``host:complex``
          and not ``hl`` for ``host:local``). Note that qrsh/qsub directly
          still works.

    """

    # combine options using correct preference
    options = dict(list(PARAMS.items()))
    options.update(list(getCallerLocals().items()))
    options.update(list(kwargs.items()))

    # insert legacy synonyms
    options['without_cluster'] = options.get('without_cluster')
    getParallelEnvironment(options)

    # enforce highest priority for cluster options in command-line
    if "cli_cluster_memory_default" in PARAMS:
        options["cluster_memory_default"] = PARAMS[
            "cli_cluster_memory_default"]
    if "cli_cluster_memory_resource" in PARAMS:
        options["cluster_memory_resource"] = PARAMS[
            "cli_cluster_memory_resource"]
    if "cli_cluster_num_jobs" in PARAMS:
        options["cluster_num_jobs"] = PARAMS["cli_cluster_num_jobs"]
    if "cli_cluster_options" in PARAMS:
        options["cluster_options"] = PARAMS["cli_cluster_options"]
    if "cli_cluster_parallel_environment" in PARAMS:
        options["cluster_parallel_environment"] = PARAMS[
            "cli_cluster_parallel_environment"]
    if "cli_cluster_priority" in PARAMS:
        options["cluster_priority"] = PARAMS["cli_cluster_priority"]
    if "cli_cluster_queue" in PARAMS:
        options["cluster_queue"] = PARAMS["cli_cluster_queue"]
    if "cli_cluster_queue_manager" in PARAMS:
        options["cluster_queue_manager"] = PARAMS["cli_cluster_queue_manager"]

    # if the command-line has not been used
    # get information from the legacy job_options
    if options["cluster_options"] == "":
        options["cluster_options"] = options.get("job_options",
                                                 options["cluster_options"])

    # get the memory requirement for the job
    job_memory = getJobMemory(options, PARAMS)

    # get the queue manager
    queue_manager = PARAMS["cluster_queue_manager"]

    shellfile = os.path.join(PARAMS["workingdir"], "shell.log")

    pid = os.getpid()
    E.debug('task: pid = %i' % pid)

    # connect to global session
    session = GLOBAL_SESSION
    E.debug('task: pid %i: sge session = %s' % (pid, str(session)))

    ignore_pipe_errors = options.get('ignore_pipe_errors', False)
    ignore_errors = options.get('ignore_errors', False)

    # run on cluster if:
    # * to_cluster is not defined or set to True
    # * command line option without_cluster is set to False
    # * an SGE session is present
    run_on_cluster = ("to_cluster" not in options or
                      options.get("to_cluster")) and \
        not options["without_cluster"] and \
        GLOBAL_SESSION is not None

    # SGE compatible job_name
    job_name = re.sub("[:]", "_",
                      os.path.basename(options.get("outfile", "ruffus")))

    def _writeJobScript(statement, job_memory, job_name, shellfile):
        # disabled - problems with quoting
        # tmpfile.write( '''echo 'statement=%s' >> %s\n''' %
        # (shellquote(statement), shellfile) )
        # module list outputs to stderr, so merge stderr and stdout

        script = '''#!/bin/bash -e \n
                    echo "%(job_name)s : START -> ${0}" >> %(shellfile)s
                    set | sed 's/^/%(job_name)s : /' &>> %(shellfile)s
                    set +o errexit
                    module list 2>&1 | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    set -o errexit
                    hostname | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    cat /proc/meminfo | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    echo "%(job_name)s : END -> ${0}" >> %(shellfile)s
                 ''' % locals()

        # restrict virtual memory
        # Note that there are resources in SGE which could do this directly
        # such as v_hmem.
        # Note that limiting resident set sizes (RSS) with ulimit is not
        # possible in newer kernels.
        script += "ulimit -v %i\n" % IOTools.human2bytes(job_memory)
        script += expandStatement(statement,
                                  ignore_pipe_errors=ignore_pipe_errors)
        script += "\n"

        job_path = getTempFilename(dir=PARAMS["workingdir"])

        with open(job_path, "w") as script_file:
            script_file.write(script)

        return (job_path)

    if run_on_cluster:
        # run multiple jobs
        if options.get("statements"):

            statement_list = []
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))

            if options.get("dryrun", False):
                return

            jt = setupDrmaaJobTemplate(session, options, job_name, job_memory)
            E.debug("Job spec is: %s" % jt.nativeSpecification)

            job_ids, filenames = [], []

            for statement in statement_list:
                E.info("running statement:\n%s" % statement)

                job_path = _writeJobScript(statement, job_memory, job_name,
                                           shellfile)

                jt, stdout_path, stderr_path = setDrmaaJobPaths(jt, job_path)

                job_id = session.runJob(jt)

                job_ids.append(job_id)
                filenames.append((job_path, stdout_path, stderr_path))

                E.debug("job has been submitted with job_id %s" % str(job_id))

            E.debug("waiting for %i jobs to finish " % len(job_ids))

            session.synchronize(job_ids, drmaa.Session.TIMEOUT_WAIT_FOREVER,
                                False)

            # collect and clean up
            for job_id, statement, paths in zip(job_ids, statement_list,
                                                filenames):
                job_path, stdout_path, stderr_path = paths
                collectSingleJobFromCluster(session,
                                            job_id,
                                            statement,
                                            stdout_path,
                                            stderr_path,
                                            job_path,
                                            ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)

        # run single job on cluster - this can be an array job
        else:

            statement = buildStatement(**options)
            E.info("running statement:\n%s" % statement)

            if options.get("dryrun", False):
                return

            jt = setupDrmaaJobTemplate(session, options, job_name, job_memory)
            E.debug("Job spec is: %s" % jt.nativeSpecification)

            job_path = _writeJobScript(statement, job_memory, job_name,
                                       shellfile)
            jt, stdout_path, stderr_path = setDrmaaJobPaths(jt, job_path)

            if "job_array" in options and options["job_array"] is not None:
                # run an array job
                start, end, increment = options.get("job_array")
                E.debug("starting an array job: %i-%i,%i" %
                        (start, end, increment))
                # sge works with 1-based, closed intervals
                job_ids = session.runBulkJobs(jt, start + 1, end, increment)
                E.debug("%i array jobs have been submitted as job_id %s" %
                        (len(job_ids), job_ids[0]))
                retval = session.synchronize(
                    job_ids, drmaa.Session.TIMEOUT_WAIT_FOREVER, True)

                stdout, stderr = getStdoutStderr(stdout_path, stderr_path)

            else:
                # run a single job
                job_id = session.runJob(jt)
                E.debug("job has been submitted with job_id %s" % str(job_id))

                collectSingleJobFromCluster(session,
                                            job_id,
                                            statement,
                                            stdout_path,
                                            stderr_path,
                                            job_path,
                                            ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)
    else:
        # run job locally
        statement_list = []
        if options.get("statements"):
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))
        else:
            statement_list.append(buildStatement(**options))

        if options.get("dryrun", False):
            return

        for statement in statement_list:
            E.info("running statement:\n%s" % statement)

            # process substitution <() and >() does not
            # work through subprocess directly. Thus,
            # the statement needs to be wrapped in
            # /bin/bash -c '...' in order for bash
            # to interpret the substitution correctly.
            if "<(" in statement or ">(" in statement:
                shell = os.environ.get('SHELL', "/bin/bash")
                if "bash" not in shell:
                    raise ValueError(
                        "require bash for advanced shell syntax: <()")
                # Note: pipes.quote is deprecated in Py3, use shlex.quote
                # (not present in Py2.7).
                statement = pipes.quote(statement)
                statement = "%s -c %s" % (shell, statement)

            process = subprocess.Popen(expandStatement(
                statement, ignore_pipe_errors=ignore_pipe_errors),
                                       cwd=PARAMS["workingdir"],
                                       shell=True,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)

            # process.stdin.close()
            stdout, stderr = process.communicate()

            if process.returncode != 0 and not ignore_errors:
                raise OSError("---------------------------------------\n"
                              "Child was terminated by signal %i: \n"
                              "The stderr was: \n%s\n%s\n"
                              "-----------------------------------------" %
                              (-process.returncode, stderr, statement))
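
Because run() inspects the caller's local variables via getCallerLocals(), a pipeline task only needs to define the statement and any job_* options as plain locals before calling it. A hypothetical task sketch (the function, file names and option values are made up; run is assumed to be imported from this module):

def compressFile(infile, outfile):
    # picked up by run() through getCallerLocals(); names are illustrative
    job_memory = "1G"
    job_threads = 1
    statement = "gzip -c %(infile)s > %(outfile)s"
    run()
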
Example #4
def peekParameters(workingdir,
                   pipeline,
                   on_error_raise=None,
                   prefix=None,
                   update_interface=False,
                   restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in workingdir, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : Bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the parameters
       are added to the configuration dictionary of the calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    '''
    caller_locals = getCallerLocals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not isTest() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    # Attempt to locate directory with pipeline source code. This is a
    # patch as pipelines might be called within the repository
    # directory or from an installed location
    dirname = PARAMS["pipelinedir"]

    # called without a directory, use current directory
    if dirname == "":
        dirname = os.path.abspath(".")
    else:
        # if not exists, assume we want version located
        # in directory of calling script.
        if not os.path.exists(dirname):
            # directory is path of calling script
            dirname = os.path.dirname(caller_locals['__file__'])

    pipeline = os.path.join(dirname, pipeline)
    if not os.path.exists(pipeline):
        if on_error_raise:
            raise ValueError(
                "can't find pipeline at %s" % (pipeline))
        else:
            return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified
    # working dir is set to "?!"
    if "config" in sys.argv or "check" in sys.argv or "clone" in sys.argv and workingdir == "?!":
        workingdir = os.path.join(PARAMS.get("pipelinedir"),
                                  IOTools.snip(pipeline, ".py"))

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError(
                "can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "python %s -f -v 0 dump" % pipeline

    os.environ.update(
        {'BASH_ENV': os.path.join(os.environ['HOME'], '.bashrc')})
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               env=os.environ.copy())

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") %
            (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()
    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]
    if len(stdout) > 1:
        raise ValueError("received multiple configurations")
    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                dump[key] = os.path.join(workingdir, value)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())])

    return dump
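
The tail of peekParameters applies three small transformations to the dumped dictionary: interface values are prefixed with the external working directory, the dictionary is optionally restricted to the interface section, and all keys are optionally prefixed. A stand-alone sketch with an illustrative dictionary:

import os

dump = {"interface_counts": "counts.tsv.gz", "genome": "hg38"}
workingdir, prefix = "/data/external_pipeline", "ext_"
dump = {k: (os.path.join(workingdir, v) if k.startswith("interface") else v)
        for k, v in dump.items()}
dump = {k: v for k, v in dump.items() if k.startswith("interface")}
dump = {"%s%s" % (prefix, k): v for k, v in dump.items()}
print(dump)
# {'ext_interface_counts': '/data/external_pipeline/counts.tsv.gz'}
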
Example #5
def getParameters(filenames=["pipeline.ini", ],
                  defaults=None,
                  site_ini=True,
                  user_ini=True,
                  default_ini=True,
                  only_import=None):
    '''read a config file and return as a dictionary.

    Sections and keys are combined with an underscore. If a key
    without section does not exist, it will be added plain.

    For example::

       [general]
       input=input1.file

       [special]
       input=input2.file

    will be entered as { 'general_input' : "input1.file",
    'input: "input1.file", 'special_input' : "input2.file" }

    This function also updates the module-wide parameter map.

    The section [DEFAULT] is equivalent to [general].

    The order of initialization is as follows:

    1. hard-coded defaults
    2. pipeline specific default file in the CGAT code installation
    3. :file:`.cgat` in the user's home directory
    4. files supplied by the user in the order given

    If the same configuration value appears in multiple
    files, later configuration files will overwrite the
    settings from earlier files.

    Path names are expanded to the absolute pathname to avoid
    ambiguity with relative path names. Path names are updated
    for parameters that end in the suffix "dir" and start with
    a "." such as "." or "../data".

    Arguments
    ---------
    filenames : list
       List of filenames of the configuration files to read.
    defaults : dict
       Dictionary with default values. These will overwrite
       any hard-coded parameters, but will be overwritten by user
       specified parameters in the configuration files.
    default_ini : bool
       If set, the default initialization file will be read from
       'CGATPipelines/configuration/pipeline.ini'
    user_ini : bool
       If set, configuration files will also be read from a
       file called :file:`.cgat` in the user's home directory.
    only_import : bool
       If set to a boolean, the parameter dictionary will be a
       defaultcollection. This is useful for pipelines that are
       imported (for example for documentation generation) but not
       executed as there might not be an appropriate .ini file
       available. If `only_import` is None, it will be set to the
       default, which is to raise an exception unless the calling
       script is imported or the option ``--is-test`` has been passed
       at the command line.

    Returns
    -------
    config : dict
       Dictionary with configuration values.
    '''

    global CONFIG
    global PARAMS
    old_id = id(PARAMS)

    caller_locals = getCallerLocals()

    # check if this is only for import
    if only_import is None:
        only_import = isTest() or \
            "__name__" not in caller_locals or \
            caller_locals["__name__"] != "__main__"

    # important: only update the PARAMS variable as
    # it is referenced in other modules. Thus the type
    # needs to be fixed at import. Raise error where this
    # is not the case.
    # Note: Parameter sharing in the Pipeline module needs
    # to be reorganized.
    if only_import:
        # turn on default dictionary
        TriggeredDefaultFactory.with_default = True

    # Clear up ini files on the list that do not exist.
    # Please note the use of list(filenames) to create
    # a clone to iterate over as we remove items from
    # the original list (to avoid unexpected results)
    for fn in list(filenames):
        if not os.path.exists(fn):
            filenames.remove(fn)

    if site_ini:
        # read configuration from /etc/cgat/pipeline.ini
        fn = "/etc/cgat/pipeline.ini"
        if os.path.exists(fn):
            filenames.insert(0, fn)

    if default_ini:
        # The link between CGATPipelines and Pipeline.py
        # needs to be severed at some point.
        # 1. config files into CGAT module directory?
        # 2. Pipeline.py into CGATPipelines module directory?
        filenames.insert(
            0,
            os.path.join(CGATPIPELINES_PIPELINE_DIR, 'configuration',
                         'pipeline.ini'))

    if user_ini:
        # read configuration from a users home directory
        fn = os.path.join(os.path.expanduser("~"), ".cgat")
        if os.path.exists(fn):
            if 'pipeline.ini' in filenames:
                index = filenames.index('pipeline.ini')
                filenames.insert(index, fn)
            else:
                filenames.append(fn)

    # IMS: Several legacy scripts call this with a string as input
    # rather than a list. Check for this and correct

    if isinstance(filenames, str):
        filenames = [filenames]

    PARAMS['pipeline_ini'] = filenames

    try:
        CONFIG.read(filenames)
        p = configToDictionary(CONFIG)
    except configparser.InterpolationSyntaxError as ex:
        # Do not log, as called before logging module is initialized -
        # this will mess up logging configuration in Control.py and Experiment.py
        # E.debug(
        #     "InterpolationSyntaxError when reading configuration file, "
        #     "likely due to use of '%'. "
        #     "Please quote '%' if ini interpolation is required. "
        #     "Orginal error: {}".format(str(ex)))
        CONFIG = configparser.RawConfigParser()
        CONFIG.read(filenames)
        p = configToDictionary(CONFIG)

    # update with hard-coded PARAMS
    PARAMS.update(HARDCODED_PARAMS)

    if defaults:
        PARAMS.update(defaults)
    PARAMS.update(p)

    # interpolate some params with other parameters
    for param in INTERPOLATE_PARAMS:
        try:
            PARAMS[param] = PARAMS[param] % PARAMS
        except TypeError as msg:
            raise TypeError('could not interpolate %s: %s' %
                            (PARAMS[param], msg))

    # expand pathnames
    for param, value in list(PARAMS.items()):
        if param.endswith("dir"):
            if value.startswith("."):
                PARAMS[param] = os.path.abspath(value)

    # make sure that the dictionary reference has not changed
    assert id(PARAMS) == old_id

    return PARAMS
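
configToDictionary is not shown in these examples, but the section/key flattening described in the docstring can be sketched with configparser alone, assuming the behaviour is as documented (keys are prefixed with their section; [general] keys are additionally added unprefixed):

import configparser

ini = """\
[general]
input=input1.file
[special]
input=input2.file
"""
cfg = configparser.ConfigParser()
cfg.read_string(ini)
params = {}
for section in cfg.sections():
    for key, value in cfg.items(section):
        params["%s_%s" % (section, key)] = value
        if section == "general":
            params[key] = value
print(params)
# {'general_input': 'input1.file', 'input': 'input1.file',
#  'special_input': 'input2.file'}
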
Example #6
def getParameters(filenames=["pipeline.ini", ],
                  defaults=None,
                  site_ini=True,
                  user_ini=True,
                  default_ini=True,
                  only_import=None):
    '''read a config file and return as a dictionary.

    Sections and keys are combined with an underscore. If a key
    without section does not exist, it will be added plain.

    For example::

       [general]
       input=input1.file

       [special]
       input=input2.file

    will be entered as { 'general_input' : "input1.file",
    'input: "input1.file", 'special_input' : "input2.file" }

    This function also updates the module-wide parameter map.

    The section [DEFAULT] is equivalent to [general].

    The order of initialization is as follows:

    1. hard-coded defaults
    2. pipeline specific default file in the CGAT code installation
    3. :file:`.cgat` in the user's home directory
    4. files supplied by the user in the order given

    If the same configuration value appears in multiple
    files, later configuration files will overwrite the
    settings from earlier files.

    Path names are expanded to the absolute pathname to avoid
    ambiguity with relative path names. Path names are updated
    for parameters that end in the suffix "dir" and start with
    a "." such as "." or "../data".

    Arguments
    ---------
    filenames : list
       List of filenames of the configuration files to read.
    defaults : dict
       Dictionary with default values. These will overwrite
       any hard-coded parameters, but will be overwritten by user
       specified parameters in the configuration files.
    default_ini : bool
       If set, the default initialization file will be read from
       'CGATPipelines/configuration/pipeline.ini'
    user_ini : bool
       If set, configuration files will also be read from a
       file called :file:`.cgat` in the user's home directory.
    only_import : bool
       If set to a boolean, the parameter dictionary will be a
       defaultcollection. This is useful for pipelines that are
       imported (for example for documentation generation) but not
       executed as there might not be an appropriate .ini file
       available. If `only_import` is None, it will be set to the
       default, which is to raise an exception unless the calling
       script is imported or the option ``--is-test`` has been passed
       at the command line.

    Returns
    -------
    config : dict
       Dictionary with configuration values.
    '''

    global CONFIG
    global PARAMS
    old_id = id(PARAMS)

    caller_locals = getCallerLocals()

    # check if this is only for import
    if only_import is None:
        only_import = isTest() or \
            "__name__" not in caller_locals or \
            caller_locals["__name__"] != "__main__"

    # important: only update the PARAMS variable as
    # it is referenced in other modules. Thus the type
    # needs to be fixed at import. Raise error where this
    # is not the case.
    # Note: Parameter sharing in the Pipeline module needs
    # to be reorganized.
    if only_import:
        # turn on default dictionary
        TriggeredDefaultFactory.with_default = True

    # Clear up ini files on the list that do not exist.
    # Please note the use of list(filenames) to create
    # a clone to iterate over as we remove items from
    # the original list (to avoid unexpected results)
    for fn in list(filenames):
        if not os.path.exists(fn):
            filenames.remove(fn)

    if site_ini:
        # read configuration from /etc/cgat/pipeline.ini
        fn = "/etc/cgat/pipeline.ini"
        if os.path.exists(fn):
            filenames.insert(0, fn)

    if default_ini:
        # The link between CGATPipelines and Pipeline.py
        # needs to be severed at some point.
        # 1. config files into CGAT module directory?
        # 2. Pipeline.py into CGATPipelines module directory?
        filenames.insert(0,
                         os.path.join(CGATPIPELINES_PIPELINE_DIR,
                                      'configuration',
                                      'pipeline.ini'))

    if user_ini:
        # read configuration from a users home directory
        fn = os.path.join(os.path.expanduser("~"),
                          ".cgat")
        if os.path.exists(fn):
            if 'pipeline.ini' in filenames:
                index = filenames.index('pipeline.ini')
                filenames.insert(index, fn)
            else:
                filenames.append(fn)

    # IMS: Several legacy scripts call this with a string as input
    # rather than a list. Check for this and correct

    if isinstance(filenames, str):
        filenames = [filenames]

    PARAMS['pipeline_ini'] = filenames

    try:
        CONFIG.read(filenames)
        p = configToDictionary(CONFIG)
    except configparser.InterpolationSyntaxError as ex:
        # Do not log, as called before logging module is initialized -
        # this will mess up logging configuration in Control.py and Experiment.py
        # E.debug(
        #     "InterpolationSyntaxError when reading configuration file, "
        #     "likely due to use of '%'. "
        #     "Please quote '%' if ini interpolation is required. "
        #     "Orginal error: {}".format(str(ex)))
        CONFIG = configparser.RawConfigParser()
        CONFIG.read(filenames)
        p = configToDictionary(CONFIG)
    
    # update with hard-coded PARAMS
    PARAMS.update(HARDCODED_PARAMS)

    if defaults:
        PARAMS.update(defaults)
    PARAMS.update(p)

    # interpolate some params with other parameters
    for param in INTERPOLATE_PARAMS:
        try:
            PARAMS[param] = PARAMS[param] % PARAMS
        except TypeError as msg:
            raise TypeError('could not interpolate %s: %s' %
                            (PARAMS[param], msg))

    # expand pathnames
    for param, value in list(PARAMS.items()):
        if param.endswith("dir"):
            if value.startswith("."):
                PARAMS[param] = os.path.abspath(value)

    # make sure that the dictionary reference has not changed
    assert id(PARAMS) == old_id

    return PARAMS
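
The pathname expansion applied to "*dir" parameters at the end of the function can also be shown in isolation (illustrative values):

# relative "dir" parameters such as "." or "../export" are made absolute
import os

PARAMS = {"workingdir": ".", "exportdir": "../export", "genome": "hg38"}
for param, value in list(PARAMS.items()):
    if param.endswith("dir") and value.startswith("."):
        PARAMS[param] = os.path.abspath(value)
print(PARAMS["workingdir"], PARAMS["exportdir"])
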
Example #7
def getParameters(filenames=["pipeline.ini", ],
                  defaults=None,
                  user_ini=True,
                  default_ini=True,
                  only_import=None):
    '''read a config file and return as a dictionary.

    Sections and keys are combined with an underscore. If a key
    without section does not exist, it will be added plain.

    For example::

       [general]
       input=input1.file

       [special]
       input=input2.file

    will be entered as { 'general_input' : "input1.file",
    'input: "input1.file", 'special_input' : "input2.file" }

    This function also updates the module-wide parameter map.

    The section [DEFAULT] is equivalent to [general].

    The order of initialization is as follows:

    1. hard-coded defaults
    2. pipeline specific default file in the CGAT code installation
    3. :file:`.cgat` in the user's home directory
    4. files supplied by the user in the order given

    If the same configuration value appears in multiple
    files, later configuration files will overwrite the
    settings from earlier files.

    Path names are expanded to the absolute pathname to avoid
    ambiguity with relative path names. Path names are updated
    for parameters that end in the suffix "dir" and start with
    a "." such as "." or "../data".

    Arguments
    ---------
    filenames : list
       List of filenames of the configuration files to read.
    defaults : dict
       Dictionary with default values. These will overwrite
       any hard-coded parameters, but will be overwritten by user
       specified parameters in the configuration files.
    default_ini : bool
       If set, the default initialization file will be read from
       'CGATPipelines/configuration/pipeline.ini'
    user_ini : bool
       If set, configuration files will also be read from a
       file called :file:`.cgat` in the user's home directory.
    only_import : bool
       If set to a boolean, the parameter dictionary will be a
       defaultcollection. This is useful for pipelines that are
       imported (for example for documentation generation) but not
       executed as there might not be an appropriate .ini file
       available. If `only_import` is None, it will be set to the
       default, which is to raise an exception unless the calling
       script is imported or the option ``--is-test`` has been passed
       at the command line.

    Returns
    -------
    config : dict
       Dictionary with configuration values.
    '''

    global CONFIG
    global PARAMS
    old_id = id(PARAMS)

    caller_locals = getCallerLocals()

    # check if this is only for import
    if only_import is None:
        only_import = isTest() or \
            "__name__" not in caller_locals or \
            caller_locals["__name__"] != "__main__"

    # important: only update the PARAMS variable as
    # it is referenced in other modules. Thus the type
    # needs to be fixed at import. Raise error where this
    # is not the case.
    # Note: Parameter sharing in the Pipeline module needs
    # to be reorganized.
    if only_import:
        # turn on default dictionary
        TriggeredDefaultFactory.with_default = True

    if user_ini:
        # read configuration from a users home directory
        fn = os.path.join(os.path.expanduser("~"),
                          ".cgat")
        if os.path.exists(fn):
            filenames.insert(0, fn)

    # IMS: Several legacy scripts call this with a string as input
    # rather than a list. Check for this and correct

    if isinstance(filenames, basestring):
        filenames = [filenames]

    if default_ini:
        # The link between CGATPipelines and Pipeline.py
        # needs to be severed at some point.
        # 1. config files into CGAT module directory?
        # 2. Pipeline.py into CGATPipelines module directory?
        filenames.insert(0,
                         os.path.join(CGATPIPELINES_PIPELINE_DIR,
                                      'configuration',
                                      'pipeline.ini'))

    CONFIG.read(filenames)

    p = configToDictionary(CONFIG)

    # update with hard-coded PARAMS
    PARAMS.update(HARDCODED_PARAMS)

    if defaults:
        PARAMS.update(defaults)
    PARAMS.update(p)

    # interpolate some params with other parameters
    for param in INTERPOLATE_PARAMS:
        try:
            PARAMS[param] = PARAMS[param] % PARAMS
        except TypeError as msg:
            raise TypeError('could not interpolate %s: %s' %
                            (PARAMS[param], msg))
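
The interpolation loop above lets one parameter reference another through %-placeholders. A stand-alone sketch (the contents of INTERPOLATE_PARAMS and the key names are illustrative):

PARAMS = {"datadir": "/data", "genome_file": "%(datadir)s/hg38.fa"}
for param in ["genome_file"]:      # stands in for INTERPOLATE_PARAMS
    PARAMS[param] = PARAMS[param] % PARAMS
print(PARAMS["genome_file"])       # /data/hg38.fa
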
Example #8
def run(**kwargs):
    """run a command line statement.

    The method runs a single or multiple statements on the cluster
    using drmaa. The cluster is bypassed if:

        * ``to_cluster`` is set to None in the context of the
          calling function.

        * ``--local`` has been specified on the command line
          and the option ``without_cluster`` has been set as
          a result.

        * no libdrmaa is present

        * the global session is not initialized (GLOBAL_SESSION is
          None)

    To decide which statement to run, the method works by examining
    the context of the calling function for a variable called
    ``statement`` or ``statements``.

    If ``statements`` is defined, multiple job scripts are created and
    sent to the cluster. If ``statement`` is defined, a single job
    script is created and sent to the cluster. Additionally, if
    ``job_array`` is defined, the single statement will be submitted
    as an array job.

    Troubleshooting:

       1. DRMAA creates sessions and there is a limited number
          of sessions available. If there are too many, or sessions
          become unavailable after failed jobs, use ``qconf -secl``
          to list sessions and ``qconf -kec #`` to delete sessions.

       2. Memory: 1G of free memory can be requested using the job_memory
          variable: ``job_memory = "1G"``
          If there are error messages like "no available queue", then the
          problem could be that a particular complex attribute has
          not been defined (the code should be ``hc`` for ``host:complex``
          and not ``hl`` for ``host:local``). Note that qrsh/qsub directly
          still works.

    """

    # combine options using correct preference
    options = dict(PARAMS.items())
    options.update(getCallerLocals().items())
    options.update(kwargs.items())

    # insert a few legacy synonyms
    options['cluster_options'] = options.get('job_options',
                                             options['cluster_options'])
    options['cluster_queue'] = options.get('job_queue',
                                           options['cluster_queue'])
    options['without_cluster'] = options.get('without_cluster')

    # get the memory requirement for the job
    job_memory = getJobMemory(options, PARAMS)

    # get the queue manager
    queue_manager = PARAMS["cluster_queue_manager"]

    shellfile = os.path.join(PARAMS["workingdir"], "shell.log")

    pid = os.getpid()
    E.debug('task: pid = %i' % pid)

    # connect to global session
    session = GLOBAL_SESSION
    E.debug('task: pid %i: sge session = %s' % (pid, str(session)))

    ignore_pipe_errors = options.get('ignore_pipe_errors', False)
    ignore_errors = options.get('ignore_errors', False)

    # run on cluster if:
    # * to_cluster is not defined or set to True
    # * command line option without_cluster is set to False
    # * an SGE session is present
    run_on_cluster = ("to_cluster" not in options or
                      options.get("to_cluster")) and \
        not options["without_cluster"] and \
        GLOBAL_SESSION is not None

    # SGE compatible job_name
    job_name = re.sub(
        "[:]", "_",
        os.path.basename(options.get("outfile", "ruffus")))

    def _writeJobScript(statement, job_memory, job_name, shellfile):
        # disabled - problems with quoting
        # tmpfile.write( '''echo 'statement=%s' >> %s\n''' %
        # (shellquote(statement), shellfile) )
        # module list outputs to stderr, so merge stderr and stdout

        script = '''#!/bin/bash\n
                    echo "%(job_name)s : START -> ${0}" >> %(shellfile)s
                    set | sed 's/^/%(job_name)s : /' &>> %(shellfile)s
                    module list 2>&1 | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    hostname | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    cat /proc/meminfo | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    echo "%(job_name)s : END -> ${0}" >> %(shellfile)s
                 ''' % locals()

        # restrict virtual memory
        # Note that there are resources in SGE which could do this directly
        # such as v_hmem.
        # Note that limiting resident set sizes (RSS) with ulimit is not
        # possible in newer kernels.
        script += "ulimit -v %i\n" % IOTools.human2bytes(job_memory)
        script += expandStatement(statement,
                                  ignore_pipe_errors=ignore_pipe_errors)
        script += "\n"

        job_path = getTempFilename(dir=PARAMS["workingdir"])

        with open(job_path, "w") as script_file:
            script_file.write(script)

        return(job_path)

    if run_on_cluster:
        # run multiple jobs
        if options.get("statements"):

            statement_list = []
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))

            if options.get("dryrun", False):
                return

            jt = setupDrmaaJobTemplate(session, options, job_name, job_memory)
            E.debug("Job spec is: %s" % jt.nativeSpecification)

            job_ids, filenames = [], []

            for statement in statement_list:
                E.debug("running statement:\n%s" % statement)

                job_path = _writeJobScript(statement, job_memory, job_name, shellfile)

                jt, stdout_path, stderr_path = setDrmaaJobPaths(jt, job_path)

                job_id = session.runJob(jt)

                job_ids.append(job_id)
                filenames.append((job_path, stdout_path, stderr_path))

                E.debug("job has been submitted with job_id %s" % str(job_id))

            E.debug("waiting for %i jobs to finish " % len(job_ids))

            session.synchronize(job_ids, drmaa.Session.TIMEOUT_WAIT_FOREVER,
                                False)

            # collect and clean up
            for job_id, statement, paths in zip(job_ids, statement_list,
                                                filenames):
                job_path, stdout_path, stderr_path = paths
                collectSingleJobFromCluster(session, job_id,
                                             statement,
                                             stdout_path,
                                             stderr_path,
                                             job_path,
                                             ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)

        # run single job on cluster - this can be an array job
        else:

            statement = buildStatement(**options)
            E.debug("running statement:\n%s" % statement)

            if options.get("dryrun", False):
                return

            jt = setupDrmaaJobTemplate(session, options, job_name, job_memory)
            E.debug("Job spec is: %s" % jt.nativeSpecification)

            job_path = _writeJobScript(statement, job_memory, job_name, shellfile)
            jt, stdout_path, stderr_path = setDrmaaJobPaths(jt, job_path)

            if "job_array" in options and options["job_array"] is not None:
                # run an array job
                start, end, increment = options.get("job_array")
                E.debug("starting an array job: %i-%i,%i" %
                        (start, end, increment))
                # sge works with 1-based, closed intervals
                job_ids = session.runBulkJobs(jt, start + 1, end, increment)
                E.debug("%i array jobs have been submitted as job_id %s" %
                        (len(job_ids), job_ids[0]))
                retval = session.synchronize(
                    job_ids, drmaa.Session.TIMEOUT_WAIT_FOREVER, True)

                stdout, stderr = getStdoutStderr(stdout_path, stderr_path)

            else:
                # run a single job
                job_id = session.runJob(jt)
                E.debug("job has been submitted with job_id %s" % str(job_id))

                collectSingleJobFromCluster(session, job_id,
                                             statement,
                                             stdout_path,
                                             stderr_path,
                                             job_path,
                                             ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)
    else:
        # run job locally
        statement_list = []
        if options.get("statements"):
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))
        else:
            statement_list.append(buildStatement(**options))

        if options.get("dryrun", False):
            return

        for statement in statement_list:
            E.debug("running statement:\n%s" % statement)

            # process substitution <() and >() does not
            # work through subprocess directly. Thus,
            # the statement needs to be wrapped in
            # /bin/bash -c '...' in order for bash
            # to interpret the substitution correctly.
            if "<(" in statement or ">(" in statement:
                shell = os.environ.get('SHELL', "/bin/bash")
                if "bash" not in shell:
                    raise ValueError(
                        "require bash for advanced shell syntax: <()")
                # Note: pipes.quote is deprecated in Py3, use shlex.quote
                # (not present in Py2.7).
                statement = pipes.quote(statement)
                statement = "%s -c %s" % (shell, statement)

            process = subprocess.Popen(
                expandStatement(
                    statement,
                    ignore_pipe_errors=ignore_pipe_errors),
                cwd=PARAMS["workingdir"],
                shell=True,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)

            # process.stdin.close()
            stdout, stderr = process.communicate()

            if process.returncode != 0 and not ignore_errors:
                raise OSError(
                    "---------------------------------------\n"
                    "Child was terminated by signal %i: \n"
                    "The stderr was: \n%s\n%s\n"
                    "-----------------------------------------" %
                    (-process.returncode, stderr, statement))
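
The bash -c wrapping used for process substitution can be sketched on its own; shlex.quote is the Python 3 counterpart of the pipes.quote call used above (the statement is illustrative):

import shlex
import subprocess

statement = "cat <(printf 'a\\nb\\n') | wc -l"
if "<(" in statement or ">(" in statement:
    # process substitution needs bash, not the default /bin/sh
    statement = "/bin/bash -c %s" % shlex.quote(statement)
proc = subprocess.Popen(statement, shell=True,
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, _ = proc.communicate()
print(stdout.decode("utf-8").strip())   # 2
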
Example #9
def peekParameters(workingdir,
                   pipeline,
                   on_error_raise=None,
                   prefix=None,
                   update_interface=False,
                   restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in workingdir, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : Bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the parameters
       are added to the configuration dictionary of the calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    '''
    caller_locals = getCallerLocals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not isTest() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    # Attempt to locate directory with pipeline source code. This is a
    # patch as pipelines might be called within the repository
    # directory or from an installed location
    dirname = PARAMS["pipelinedir"]

    # called without a directory, use current directory
    if dirname == "":
        dirname = os.path.abspath(".")
    else:
        # if not exists, assume we want version located
        # in directory of calling script.
        if not os.path.exists(dirname):
            # directory is path of calling script
            dirname = os.path.dirname(caller_locals['__file__'])

    pipeline = os.path.join(dirname, pipeline)
    if not os.path.exists(pipeline):
        if on_error_raise:
            raise ValueError("can't find pipeline at %s" % (pipeline))
        else:
            return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified
    # working dir is set to "?!"
    if "config" in sys.argv or "check" in sys.argv or "clone" in sys.argv and workingdir == "?!":
        workingdir = os.path.join(PARAMS.get("pipelinedir"),
                                  IOTools.snip(pipeline, ".py"))

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError("can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "python %s -f -v 0 dump" % pipeline
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") % (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()
    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]
    if len(stdout) > 1:
        raise ValueError("received multiple configurations")
    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                dump[key] = os.path.join(workingdir, value)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())])

    return dump
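
Extracting the configuration from the child process boils down to dropping log lines and parsing the single JSON line; a stand-alone sketch with illustrative output:

import json

stdout = b'INFO: starting\n{"genome": "hg38", "interface_counts": "counts.tsv"}\n'
lines = stdout.decode("utf-8").splitlines()
lines = [x for x in lines if x.startswith("{")]
if len(lines) > 1:
    raise ValueError("received multiple configurations")
dump = json.loads(lines[0])
print(dump["genome"])   # hg38
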
Example #10
def run(**kwargs):
    """run a command line statement.

    The method runs a single or multiple statements on the cluster
    using drmaa. The cluster is bypassed if:

        * ``to_cluster`` is set to None in the context of the
          calling function.

        * ``--local`` has been specified on the command line
          and the option ``without_cluster`` has been set as
          a result.

        * no libdrmaa is present

        * the global session is not initialized (GLOBAL_SESSION is
          None)

    To decide which statement to run, the method works by examining
    the context of the calling function for a variable called
    ``statement`` or ``statements``.

    If ``statements`` is defined, multiple job scripts are created and
    sent to the cluster. If ``statement`` is defined, a single job
    script is created and sent to the cluster. Additionally, if
    ``job_array`` is defined, the single statement will be submitted
    as an array job.

    Troubleshooting:

       1. DRMAA creates sessions and there is a limited number
          of sessions available. If there are too many, or sessions
          become unavailable after failed jobs, use ``qconf -secl``
          to list sessions and ``qconf -kec #`` to delete sessions.

       2. Memory: 1G of free memory can be requested using the job_memory
          variable: ``job_memory = "1G"``
          If there are error messages like "no available queue", then the
          problem could be that a particular complex attribute has
          not been defined (the code should be ``hc`` for ``host:complex``
          and not ``hl`` for ``host:local``). Note that qrsh/qsub directly
          still works.

    """

    # combine options using correct preference
    options = dict(PARAMS.items())
    options.update(getCallerLocals().items())
    options.update(kwargs.items())

    # insert a few legacy synonyms
    options['cluster_options'] = options.get('job_options',
                                             options['cluster_options'])
    options['cluster_queue'] = options.get('job_queue',
                                           options['cluster_queue'])
    options['without_cluster'] = options.get('without_cluster')

    job_memory = None

    if 'job_memory' in options:
        job_memory = options['job_memory']

    elif "mem_free" in options["cluster_options"] and \
         PARAMS.get("cluster_memory_resource", False):

        E.warn("use of mem_free in job options is deprecated, please"
               " set job_memory local var instead")

        o = options["cluster_options"]
        x = re.search("-l\s*mem_free\s*=\s*(\S+)", o)
        if x is None:
            raise ValueError(
                "expecting mem_free in '%s'" % o)

        job_memory = x.groups()[0]

        # remove memory spec from job options
        options["cluster_options"] = re.sub(
            "-l\s*mem_free\s*=\s*(\S+)", "", o)
    else:
        job_memory = PARAMS.get("cluster_memory_default", "2G")

    def setupJob(session, options, job_memory, job_name):

        jt = session.createJobTemplate()
        jt.workingDirectory = PARAMS["workingdir"]
        jt.jobEnvironment = {'BASH_ENV': '~/.bashrc'}
        jt.args = []
        if not re.match("[a-zA-Z]", job_name[0]):
            job_name = "_" + job_name

        spec = [
            "-V",
            "-p %(cluster_priority)i",
            "-N %s" % job_name,
            "%(cluster_options)s"]

        for resource in PARAMS["cluster_memory_resource"].split(","):
            spec.append("-l %s=%s" % (resource, job_memory))

        # if process has multiple threads, use a parallel environment
        if 'job_threads' in options:
            spec.append(
                "-pe %(cluster_parallel_environment)s %(job_threads)i -R y")
        if "cluster_pe_queue" in options and 'job_threads' in options:
                spec.append(
                    "-q %(cluster_pe_queue)s")
        else:
            spec.append("-q %(cluster_queue)s")

        jt.nativeSpecification = " ".join(spec) % options
        # keep stdout and stderr separate
        jt.joinFiles = False

        E.debug("Job spec is: %s" % jt.nativeSpecification)

        return jt
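
    # A resulting native specification might look like (hypothetical
    # values): "-V -p 0 -N mytask -l mem_free=2G -q all.q"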

    shellfile = os.path.join(PARAMS["workingdir"], "shell.log")

    pid = os.getpid()
    E.debug('task: pid = %i' % pid)

    # connect to global session
    session = GLOBAL_SESSION
    E.debug('task: pid %i: sge session = %s' % (pid, str(session)))

    ignore_pipe_errors = options.get('ignore_pipe_errors', False)
    ignore_errors = options.get('ignore_errors', False)

    # run on cluster if:
    # * to_cluster is not defined or set to True
    # * command line option without_cluster is set to False
    # * an SGE session is present
    run_on_cluster = ("to_cluster" not in options or
                      options.get("to_cluster")) and \
        not options["without_cluster"] and \
        GLOBAL_SESSION is not None

    # SGE compatible job_name
    job_name = re.sub(
        "[:]", "_",
        os.path.basename(options.get("outfile", "ruffus")))

    def buildJobScript(statement, job_memory, job_name):
        '''build job script from statement.

        returns (name_of_script, stdout_path, stderr_path)
        '''

        tmpfile = getTempFile(dir=PARAMS["workingdir"])
        # disabled: -l -O expand_aliases\n" )
        tmpfile.write("#!/bin/bash\n")
        tmpfile.write(
            'echo "%s : START -> %s" >> %s\n' %
            (job_name, tmpfile.name, shellfile))
        # disabled - problems with quoting
        # tmpfile.write( '''echo 'statement=%s' >> %s\n''' %
        # (shellquote(statement), shellfile) )
        tmpfile.write("set | sed 's/^/%s : /' &>> %s\n" %
                      (job_name, shellfile))
        # module list outputs to stderr, so merge stderr and stdout
        tmpfile.write("module list 2>&1 | sed 's/^/%s: /' &>> %s\n" %
                      (job_name, shellfile))
        tmpfile.write("hostname | sed 's/^/%s: /' &>> %s\n" %
                      (job_name, shellfile))
        tmpfile.write("cat /proc/meminfo | sed 's/^/%s: /' &>> %s\n" %
                      (job_name, shellfile))
        tmpfile.write(
            'echo "%s : END -> %s" >> %s\n' %
            (job_name, tmpfile.name, shellfile))

        # restrict virtual memory
        # Note that there are resources in SGE which could do this directly
        # such as v_hmem.
        # Note that limiting resident set sizes (RSS) with ulimit is not
        # possible in newer kernels.
        tmpfile.write("ulimit -v %i\n" % IOTools.human2bytes(job_memory))

        tmpfile.write(
            expandStatement(
                statement,
                ignore_pipe_errors=ignore_pipe_errors) + "\n")
        tmpfile.close()

        job_path = os.path.abspath(tmpfile.name)
        stdout_path = job_path + ".stdout"
        stderr_path = job_path + ".stderr"

        os.chmod(job_path, stat.S_IRWXG | stat.S_IRWXU)

        return (job_path, stdout_path, stderr_path)
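
    # The generated job script is roughly (names are placeholders):
    #   #!/bin/bash
    #   echo "<job_name> : START -> <script>" >> shell.log
    #   set / module list / hostname / meminfo, each piped through
    #     sed and appended to shell.log
    #   echo "<job_name> : END -> <script>" >> shell.log
    #   ulimit -v <job_memory in bytes>
    #   <expanded statement>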

    if run_on_cluster:
        # run multiple jobs
        if options.get("statements"):

            statement_list = []
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))

            if options.get("dryrun", False):
                return

            jt = setupJob(session, options, job_memory, job_name)

            job_ids, filenames = [], []
            for statement in statement_list:
                E.debug("running statement:\n%s" % statement)

                job_path, stdout_path, stderr_path = buildJobScript(statement,
                                                                    job_memory,
                                                                    job_name)

                jt.remoteCommand = job_path
                jt.outputPath = ":" + stdout_path
                jt.errorPath = ":" + stderr_path

                os.chmod(job_path, stat.S_IRWXG | stat.S_IRWXU)

                job_id = session.runJob(jt)
                job_ids.append(job_id)
                filenames.append((job_path, stdout_path, stderr_path))

                E.debug("job has been submitted with job_id %s" % str(job_id))

            E.debug("waiting for %i jobs to finish " % len(job_ids))
            session.synchronize(job_ids, drmaa.Session.TIMEOUT_WAIT_FOREVER,
                                False)

            # collect and clean up
            for job_id, statement, paths in zip(job_ids, statement_list,
                                                filenames):
                job_path, stdout_path, stderr_path = paths
                _collectSingleJobFromCluster(session, job_id,
                                             statement,
                                             stdout_path,
                                             stderr_path,
                                             job_path,
                                             ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)

        # run single job on cluster - this can be an array job
        else:

            statement = buildStatement(**options)
            E.debug("running statement:\n%s" % statement)

            if options.get("dryrun", False):
                return

            job_path, stdout_path, stderr_path = buildJobScript(statement,
                                                                job_memory,
                                                                job_name)

            jt = setupJob(session, options, job_memory, job_name)

            jt.remoteCommand = job_path
            # later: allow redirection of stdout and stderr to files;
            # can even be across hosts?
            jt.outputPath = ":" + stdout_path
            jt.errorPath = ":" + stderr_path

            if "job_array" in options and options["job_array"] is not None:
                # run an array job
                start, end, increment = options.get("job_array")
                E.debug("starting an array job: %i-%i,%i" %
                        (start, end, increment))
                # sge works with 1-based, closed intervals
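                # e.g. a caller setting job_array = (0, 10, 1)
                # (hypothetical values) submits tasks 1..10 below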
                job_ids = session.runBulkJobs(jt, start + 1, end, increment)
                E.debug("%i array jobs have been submitted as job_id %s" %
                        (len(job_ids), job_ids[0]))
                retval = session.synchronize(
                    job_ids, drmaa.Session.TIMEOUT_WAIT_FOREVER, True)

                stdout, stderr = getStdoutStderr(stdout_path, stderr_path)

            else:
                # run a single job
                job_id = session.runJob(jt)
                E.debug("job has been submitted with job_id %s" % str(job_id))

                _collectSingleJobFromCluster(session, job_id,
                                             statement,
                                             stdout_path,
                                             stderr_path,
                                             job_path,
                                             ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)
    else:
        # run the job(s) locally
        statement_list = []
        if options.get("statements"):
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))
        else:
            statement_list.append(buildStatement(**options))

        if options.get("dryrun", False):
            return

        for statement in statement_list:
            E.debug("running statement:\n%s" % statement)

            # process substitution <() and >() do not
            # work through subprocess directly. Thus,
            # the statement needs to be wrapped in
            # /bin/bash -c '...' in order for bash
            # to interpret the substitution correctly.
            if "<(" in statement:
                shell = os.environ.get('SHELL', "/bin/bash")
                if "bash" not in shell:
                    raise ValueError(
                        "require bash for advanced shell syntax: <()")
                # Note: pipes.quote is deprecated in Py3, use shlex.quote
                # (not present in Py2.7).
                statement = pipes.quote(statement)
                statement = "%s -c %s" % (shell, statement)

            process = subprocess.Popen(
                expandStatement(
                    statement,
                    ignore_pipe_errors=ignore_pipe_errors),
                cwd=PARAMS["workingdir"],
                shell=True,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)

            # process.stdin.close()
            stdout, stderr = process.communicate()

            if process.returncode != 0 and not ignore_errors:
                raise OSError(
                    "---------------------------------------\n"
                    "Child was terminated by signal %i: \n"
                    "The stderr was: \n%s\n%s\n"
                    "-----------------------------------------" %
                    (-process.returncode, stderr, statement))