Example #1
def execute(statement, **kwargs):
    '''execute a statement locally.

    This method implements the same parameter interpolation
    as the function :func:`run`.

    Arguments
    ---------
    statement : string
        Command line statement to run.

    Returns
    -------
    stdout : string
        Data sent to standard output by command
    stderr : string
        Data sent to standard error by command
    '''

    if not kwargs:
        kwargs = get_caller_locals()

    kwargs = dict(list(get_params().items()) + list(kwargs.items()))

    logger = get_logger()
    logger.info("running %s" % (statement % kwargs))

    if "cwd" not in kwargs:
        cwd = get_params()["work_dir"]
    else:
        cwd = kwargs["cwd"]

    # cleaning up of statement
    # remove new lines and superfluous spaces and tabs
    statement = " ".join(re.sub("\t+", " ", statement).split("\n")).strip()
    if statement.endswith(";"):
        statement = statement[:-1]

    # always use bash
    os.environ.update(
        {'BASH_ENV': os.path.join(os.environ['HOME'], '.bashrc')})
    process = subprocess.Popen(statement % kwargs,
                               cwd=cwd,
                               shell=True,
                               stdin=sys.stdin,
                               stdout=sys.stdout,
                               stderr=sys.stderr,
                               env=os.environ.copy(),
                               executable="/bin/bash")

    # process.stdin.close()
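    # note: stdout and stderr are passed through to the parent process's
    # streams above (not subprocess.PIPE), so communicate() returns
    # (None, None) here and the command's output appears directly on the
    # console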
    stdout, stderr = process.communicate()

    if process.returncode != 0:
        raise OSError(
            "Child was terminated by signal %i: \n"
            "The stderr was: \n%s\n%s\n" %
            (-process.returncode, stderr, statement))

    return stdout, stderr
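
A minimal usage sketch (hypothetical helper, assuming the surrounding
pipeline module has been configured via get_parameters()): placeholders in
the statement are interpolated from the keyword arguments (or, if none are
given, from the caller's locals) before the command is run through bash.

def list_directory(directory="."):
    # "directory" fills the %(directory)s placeholder in the statement
    execute("ls %(directory)s", directory=directory)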
Example #2
def get_parameters(filenames=None,
                   defaults=None,
                   site_ini=True,
                   user=True,
                   only_import=None):
    '''read one or more config files and build global PARAMS configuration
    dictionary.

    Arguments
    ---------
    filenames : list
       List of filenames of the configuration files to read.
    defaults : dict
       Dictionary with default values. These will overwrite
       any hard-coded parameters, but will be overwritten by user-
       specified parameters in the configuration files.
    site_ini : bool
       If set, configuration is also read from the site-wide file
       :file:`/etc/cgat/pipeline.yml` if it exists.
    user : bool
       If set, configuration files will also be read from a
       file called :file:`.cgat.yml` in the user's
       home directory.
    only_import : bool
       If set to True, the parameter dictionary will be a default
       collection. This is useful for pipelines that are
       imported (for example for documentation generation) but not
       executed as there might not be an appropriate .yml file
       available. If `only_import` is None, it will be set to the
       default, which is to raise an exception unless the calling
       script is imported or the option ``--is-test`` has been passed
       at the command line.

    Returns
    -------
    params : dict
       Global configuration dictionary.
    '''
    global PARAMS, HAVE_INITIALIZED
    # only execute function once
    if HAVE_INITIALIZED:
        return PARAMS

    if filenames is None:
        filenames = ["pipeline.yml"]

    if isinstance(filenames, str):
        filenames = [filenames]

    old_id = id(PARAMS)

    caller_locals = get_caller_locals()

    # check if this is only for import
    if only_import is None:
        only_import = is_test() or "__name__" not in caller_locals or \
                      caller_locals["__name__"] != "__main__"

    # important: only update the PARAMS variable as
    # it is referenced in other modules. Thus the type
    # needs to be fixed at import. Raise error where this
    # is not the case.
    # Note: Parameter sharing in the pipeline module needs
    # to be reorganized.
    if only_import:
        # turn on default dictionary
        TriggeredDefaultFactory.with_default = True

    # check if the pipeline is in testing mode
    found = False
    if 'argv' in caller_locals and caller_locals['argv'] is not None:
        for e in caller_locals['argv']:
            if 'template_pipeline.py' in e:
                found = True
    PARAMS['testing'] = 'self' in caller_locals or found

    if site_ini:
        # read configuration from /etc/cgat/pipeline.yml
        fn = "/etc/cgat/pipeline.yml"
        if os.path.exists(fn):
            filenames.insert(0, fn)

    if user:
        # read configuration from the user's home directory
        fn = os.path.join(os.path.expanduser("~"),
                          ".cgat.yml")
        if os.path.exists(fn):
            if 'pipeline.yml' in filenames:
                index = filenames.index('pipeline.yml')
                filenames.insert(index, fn)
            else:
                filenames.append(fn)

    filenames = [x.strip() for x in filenames if os.path.exists(x)]

    # save list of config files
    PARAMS["pipeline_yml"] = filenames

    # update with hard-coded PARAMS
    nested_update(PARAMS, HARDCODED_PARAMS)
    if defaults:
        nested_update(PARAMS, defaults)

    # record the working directory. Stored in PARAMS to avoid repeated
    # calls to os.getcwd(), which can fail if the network is busy
    PARAMS["start_dir"] = os.path.abspath(os.getcwd())
    # location of pipelines - set via location of top frame (cgatflow command)
    if '__file__' in caller_locals:
        PARAMS["pipelinedir"] = os.path.dirname(caller_locals["__file__"])
    else:
        PARAMS["pipelinedir"] = 'unknown'

    for filename in filenames:
        if not os.path.exists(filename):
            continue
        get_logger().info("reading config from file {}".format(
            filename))

        with open(filename) as inf:
            p = yaml.load(inf, Loader=yaml.FullLoader)
            if p:
                nested_update(PARAMS, p)

    # for backwards compatibility - normalize dictionaries
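    # e.g. a nested entry such as {"database": {"url": "sqlite:///./csvdb"}}
    # (illustrative values) is additionally exposed under the flat key
    # "database_url"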
    p = {}
    for k, v in PARAMS.items():
        if isinstance(v, collections.abc.Mapping):
            for kk, vv in v.items():
                new_key = "{}_{}".format(k, kk)
                if new_key in p:
                    raise ValueError(
                        "key {} already exists".format(new_key))
                p[new_key] = vv
    nested_update(PARAMS, p)

    # interpolate some params with other parameters
    for param in INTERPOLATE_PARAMS:
        try:
            PARAMS[param] = PARAMS[param] % PARAMS
        except TypeError as msg:
            raise TypeError('could not interpolate %s: %s' %
                            (PARAMS[param], msg))

    # expand directory pathnames
    for param, value in list(PARAMS.items()):
        if (param.endswith("dir") and isinstance(value, str) and value.startswith(".")):
            PARAMS[param] = os.path.abspath(value)

    # make sure that the dictionary reference has not changed
    assert id(PARAMS) == old_id
    HAVE_INITIALIZED = True
    return PARAMS
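
A usage sketch (hypothetical file name and default values): the function
merges hard-coded defaults, site-wide and user configuration files and the
given YAML files into the global PARAMS dictionary and returns it.

config = get_parameters(filenames=["pipeline.yml"],
                        defaults={"job_memory": "4G"})
# nested sections are flattened as well, so an (illustrative) YAML entry
#   database:
#       url: sqlite:///./csvdb
# would be reachable both as config["database"]["url"] and as
# config["database_url"]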
Example #3
def peek_parameters(workingdir,
                    pipeline,
                    on_error_raise=None,
                    prefix=None,
                    update_interface=False,
                    restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in workingdir, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the parameters
       are added to the configuration dictionary of the calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    '''
    caller_locals = get_caller_locals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not is_test() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified
    # working dir is set to "?!"
    if ("config" in sys.argv or "check" in sys.argv
            or "clone" in sys.argv and workingdir == "?!"):
        workingdir = os.path.join(get_params()["pipelinedir"],
                                  "pipeline_" + pipeline)

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError("can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "cgatflow {} -v 0 dump".format(pipeline)

    os.environ.update(
        {'BASH_ENV': os.path.join(os.environ['HOME'], '.bashrc')})
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               env=os.environ.copy())

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") % (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()
    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]
    if len(stdout) > 1:
        raise ValueError("received multiple configurations")
    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                if isinstance(value, str):
                    dump[key] = os.path.join(workingdir, value)
                elif isinstance(value, collections.abc.Mapping):
                    for kkey, vvalue in list(value.items()):
                        value[kkey] = os.path.join(workingdir, vvalue)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())])

    return dump
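
A usage sketch (hypothetical pipeline name and directory): peek at the
configuration of an external pipeline and import only its interface entries,
prefixed and with their paths rooted in that pipeline's working directory.

annotations_config = peek_parameters("/path/to/annotations/run",
                                     "annotations",
                                     prefix="annotations_",
                                     update_interface=True,
                                     restrict_interface=True)
# an "interface_geneset" entry of the external pipeline (illustrative key)
# would now appear as "annotations_interface_geneset", with its value joined
# to /path/to/annotations/run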
Example #4
def run(statement, **kwargs):
    """run a command line statement.

    This function runs a single or multiple statements either locally
    or on the cluster using drmaa. How a statement is executed or how
    it is modified depends on the context.

    The context is provided by keyword arguments provided as named
    function arguments ('kwargs') but also from defaults (see
    below). The following keyword arguments are recognized:

    job_memory
        memory to use for the job per thread. Memory specification should be in a
        format that is accepted by the job scheduler. Note that memory
        is per thread. If you have 6 threads and the total memory is
        6Gb, use 1G as job_memory.
    job_total_memory
        total memory to use for a job. This will be divided by the number of
        threads.
    job_threads
        number of threads to request for the job.
    job_options
        options to the job scheduler.
    job_condaenv
        conda environment to use for the job.
    job_array
        if set, run the statement as an array job. ``job_array`` should be
        a tuple of (start, end, increment).

    In addition, any additional variables will be used to interpolate
    the command line string using python's '%' string interpolation
    operator.
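
    For example, a (hypothetical) call such as
    ``run("gzip < %(infile)s > %(outfile)s", infile="a.txt", outfile="a.txt.gz")``
    would interpolate ``infile`` and ``outfile`` into the statement before
    execution.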

    The context is built in a hierarchical manner, with successive
    operations overwriting previous values.

    1. Global variables
       The context is initialized
       with system-wide defaults stored in the global PARAMS
       singleton.
    2. Context of caller
       The context of the calling function is examined
       and any local variables defined in this context are added.
    3. kwargs
       Any options given explicitly as keyword arguments to the run()
       method are added.
    4. params
       If the context of the calling function contains a params
       variable, its contents are added to the context. This permits
       setting variables in configuration files in TaskLibrary
       functions.

    By default, a job is sent to the cluster, unless:

        * ``to_cluster`` is present and set to None.

        * ``without_cluster`` is True.

        * ``--local`` has been specified on the command line
          and the option ``without_cluster`` has been set as
          a result.

        * no libdrmaa is present

        * the global session is not initialized (GLOBAL_SESSION is
          None)

    Troubleshooting:

       1. DRMAA creates sessions and there is a limited number
          of sessions available. If there are too many sessions, or
          sessions become unavailable after failed jobs, use ``qconf -secl``
          to list sessions and ``qconf -kec #`` to delete sessions.

       2. Memory: 1G of free memory can be requested using the job_memory
          variable: ``job_memory = "1G"``
          If there are error messages like "no available queue", then the
          problem could be that a particular complex attribute has
          not been defined (the code should be ``hc`` for ``host:complex``
          and not ``hl`` for ``host:local``). Note that using qrsh/qsub
          directly still works.

    The job will be executed within PARAMS["work_dir"], unless
    PARAMS["work_dir"] is not local. In that case, the job will
    be executed in a shared temporary directory.

    Arguments
    ---------
    statement : string or list of strings
        A command line statement or a list of command line statements
        to be executed.
    kwargs : dictionary
        Context for job. The context is used to interpolate the command
        line statement.

    """
    logger = get_logger()

    # combine options using priority
    options = dict(list(get_params().items()))
    caller_options = get_caller_locals()
    options.update(list(caller_options.items()))

    if "self" in options:
        del options["self"]
    options.update(list(kwargs.items()))

    # inject params named tuple from TaskLibrary functions into option
    # dict. This allows overriding options set in the code with options set
    # in a .yml file
    if "params" in options:
        try:
            options.update(options["params"]._asdict())
        except AttributeError:
            pass

    # insert parameters supplied through simplified interface such
    # as job_memory, job_options, job_queue
    options['cluster']['options'] = options.get('job_options',
                                                options['cluster']['options'])
    options['cluster']['queue'] = options.get('job_queue',
                                              options['cluster']['queue'])
    options['without_cluster'] = options.get('without_cluster')

    # SGE compatible job_name
    name_substrate = str(options.get("outfile", "cgatcore"))
    if os.path.basename(name_substrate).startswith("result"):
        name_substrate = os.path.basename(os.path.dirname(name_substrate))
    else:
        name_substrate = os.path.basename(name_substrate)

    options["job_name"] = re.sub("[:]", "_", name_substrate)
    try:
        calling_module = get_caller().__name__
    except AttributeError:
        calling_module = "unknown"

    options["task_name"] = calling_module + "." + get_calling_function()

    # build statements using parameter interpolation
    if isinstance(statement, list):
        statement_list = []
        for stmt in statement:
            statement_list.append(interpolate_statement(stmt, options))
    else:
        statement_list = [interpolate_statement(statement, options)]

    if len(statement_list) == 0:
        logger.warning("no statements found - no execution")
        return []

    if options.get("dryrun", False):
        for statement in statement_list:
            logger.info("dry-run: {}".format(statement))
        return []

    # execute statement list
    runner = make_runner(**options)
    with runner as r:
        benchmark_data = r.run(statement_list)

    # log benchmark_data
    for data in benchmark_data:
        logger.info(json.dumps(data))

    BenchmarkData = collections.namedtuple('BenchmarkData',
                                           sorted(benchmark_data[0]))
    return [BenchmarkData(**d) for d in benchmark_data]
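
A usage sketch (hypothetical statement and resource values, assuming a
configured scheduler or local execution): resource requests such as
job_memory and job_threads can be defined as local variables in the calling
function, from where run() picks them up together with the variables used
for statement interpolation.

def compress_file(infile, outfile):
    job_memory = "4G"
    job_threads = 1
    # infile/outfile and the job_* variables are taken from the caller's locals
    statement = "gzip < %(infile)s > %(outfile)s"
    run(statement)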