Example #1
def main(argv=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    argv : list
        List of command line arguments. If not given, ``sys.argv`` is used.

    """

    if argv is None:
        argv = sys.argv

    if GLOBAL_OPTIONS is None:
        options, args = initialize(caller=get_caller().__file__)
    else:
        options, args = GLOBAL_OPTIONS, GLOBAL_ARGS

    run_workflow(options, args)
Example #2
def get_version():
    # get script that has called P.main()
    code_location = os.path.abspath(os.path.dirname(get_caller(1).__file__))
    # try git for runs from repository
    stdout = E.run("git rev-parse HEAD 2> /dev/null",
                   cwd=code_location,
                   return_stdout=True,
                   on_error="ignore").strip()
    return VersionData(code_location=code_location, version=stdout)
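
For reference, ``VersionData`` is not defined in these examples; the call above and the unpacking in Example #3 (``code_location, version = get_version()``) suggest a simple named tuple. A minimal sketch, assuming it carries only those two fields:

import collections

# assumed definition: a named tuple with exactly the two fields used above
VersionData = collections.namedtuple("VersionData", ("code_location", "version"))

# hypothetical usage
version_data = get_version()
print(version_data.code_location, version_data.version)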
Example #3
def initialize(argv=None, caller=None, defaults=None, **kwargs):
    """setup the pipeline framework.

    Arguments
    ---------
    argv : list
        List of command line arguments. If not given, ``sys.argv`` is used.
    caller : string
        Path of the calling script, used to locate default configuration
        files. If not given, it is derived from the calling frame.
    defaults : dictionary
        Dictionary with default values to be added to the global
        parameters dictionary.

    Additional keyword arguments will be passed to the
    :func:`~.parse_commandline` function to set command-line defaults.

    """
    if argv is None:
        argv = sys.argv

    # load default options from config files
    if caller:
        path = os.path.splitext(caller)[0]
    else:
        try:
            path = os.path.splitext(get_caller().__file__)[0]
        except AttributeError:
            path = "unknown"

    options, args = parse_commandline(argv, **kwargs)

    get_parameters(
        [os.path.join(path, "pipeline.yml"),
         "../pipeline.yml",
         options.config_file],
        defaults=defaults)

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS
    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    logger = logging.getLogger("cgatcore.pipeline")

    logger.info("started in directory: {}".format(
        get_params().get("start_dir")))

    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.
    update_params_with_commandline_options(get_params(), options)

    code_location, version = get_version()
    logger.info("code location: {}".format(code_location))
    logger.info("code version: {}".format(version))

    logger.info("working directory is: {}".format(
        get_params().get("work_dir")))
    work_dir = get_params().get("work_dir")
    if not os.path.exists(work_dir):
        E.info(
            "working directory {} does not exist - creating".format(work_dir))
        os.makedirs(work_dir)
    logger.info("changing directory to {}".format(work_dir))
    os.chdir(work_dir)

    logger.info("pipeline has been initialized")

    return options, args
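
As a usage illustration (not taken from the library's documentation), initialize() and run_workflow() can also be called directly, mirroring main() above; a minimal sketch, assuming the parameter name "genome" is made up for the example:

import sys

if __name__ == "__main__":
    # hypothetical direct use; defaults are merged into the global parameters
    options, args = initialize(argv=sys.argv,
                               caller=__file__,
                               defaults={"genome": "hg38"})
    run_workflow(options, args)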
Example #4
def run_workflow(options, args, pipeline=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    options : object
        Container for command line options.
    args : list
        Positional command line arguments. The first element selects the
        pipeline action, the remainder are pipeline targets.
    pipeline : object
        Pipeline to run. If not given, all ruffus pipelines are run.

    """
    logger = logging.getLogger("cgatcore.pipeline")

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    if options.force_run:
        if options.force_run == "all":
            forcedtorun_tasks = ruffus.pipeline_get_task_names()
        else:
            forcedtorun_tasks = options.pipeline_targets
    else:
        forcedtorun_tasks = []

    # create the local scratch directory if it does not already exist.
    # Note that the directory itself will not be deleted, although its
    # contents should be cleaned up.
    if not os.path.exists(get_params()["tmpdir"]):
        logger.warning(
            "local temporary directory {} did not exist - created".format(
                get_params()["tmpdir"]))
        try:
            os.makedirs(get_params()["tmpdir"])
        except OSError:
            # file exists
            pass

    logger.debug("temporary directory is {}".format(get_params()["tmpdir"]))

    # set multiprocess to a sensible setting if there is no cluster
    run_on_cluster = HAS_DRMAA is True and not options.without_cluster
    if options.multiprocess is None:
        if not run_on_cluster:
            options.multiprocess = int(
                math.ceil(multiprocessing.cpu_count() / 2.0))
        else:
            options.multiprocess = 40

    # see inputValidation function in Parameters.py
    if options.input_validation:
        input_validation(get_params(), sys.argv[0])

    elif options.pipeline_action == "debug":
        # create the session proxy
        start_session()

        method_name = options.pipeline_targets[0]
        caller = get_caller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "state", "svg", "plot",
                                     "dot", "touch", "regenerate"):

        messenger = None
        try:
            with cache_os_functions():
                if options.pipeline_action == "make":

                    if (not options.without_cluster and not HAS_DRMAA
                            and not get_params()['testing']):
                        E.critical(
                            "DRMAA API not found so cannot talk to a cluster.")
                        E.critical("Please use --local to run the pipeline"
                                   " on this host: {}".format(os.uname()[1]))
                        sys.exit(-1)

                    # get tasks to be done. This essentially replicates
                    # the state information within ruffus.
                    stream = StringIO()
                    ruffus.pipeline_printout(
                        stream,
                        options.pipeline_targets,
                        verbose=5,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                    messenger = LoggingFilterProgress(stream.getvalue())
                    logger.addFilter(messenger)

                    global task
                    if options.without_cluster:
                        # use a thread pool so that the pipeline controller
                        # does not occupy multiple CPUs.
                        opts = {"multithread": options.multiprocess}
                    else:
                        # use cooperative multitasking instead of multiprocessing.
                        opts = {
                            "multiprocess": options.multiprocess,
                            "pool_manager": "gevent"
                        }
                        # create the session proxy
                        start_session()

                    logger.info("current directory is {}".format(os.getcwd()))

                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        logger=logger,
                        verbose=options.loglevel,
                        log_exceptions=options.log_exceptions,
                        exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                        checksum_level=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        one_second_per_job=False,
                        **opts)

                    close_session()

                elif options.pipeline_action == "show":
                    ruffus.pipeline_printout(
                        options.stdout,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "touch":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=True,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "regenerate":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        verbose=options.loglevel)

                elif options.pipeline_action == "svg":
                    ruffus.pipeline_printout_graph(
                        options.stdout.buffer,
                        options.pipeline_format,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "state":
                    ruffus.ruffus_return_dag(
                        options.stdout,
                        target_tasks=options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "plot":
                    outf, filename = tempfile.mkstemp()
                    ruffus.pipeline_printout_graph(
                        os.fdopen(outf, "wb"),
                        options.pipeline_format,
                        options.pipeline_targets,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)
                    execute("inkscape %s" % filename)
                    os.unlink(filename)

        except ruffus.ruffus_exceptions.RethrownJobError as ex:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(ex.args))
                for idx, e in enumerate(ex.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # these seem to be errors originating within ruffus
                        # itself, such as a missing dependency;
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.pipeline_logfile)

                logger.error("start of all error messages")
                logger.error(ex)
                logger.error("end of all error messages")

                raise ValueError("pipeline failed with %i errors" %
                                 len(ex.args)) from ex
            else:
                raise

    elif options.pipeline_action == "dump":
        options.stdout.write((json.dumps(get_params())) + "\n")

    elif options.pipeline_action == "printconfig":
        E.info("printing out pipeline parameters: ")
        p = get_params()
        for k in sorted(get_params()):
            print(k, "=", p[k])
        print_config_files()

    elif options.pipeline_action == "config":
        # Level needs to be 2:
        # 0th level -> cgatflow.py
        # 1st level -> Control.py
        # 2nd level -> pipeline_xyz.py
        f = sys._getframe(2)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        write_config_files(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clone_pipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.stop(logger=get_logger())
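
To illustrate the actions dispatched above, a few example invocations, assuming a pipeline script wired up as in Example #1 (the script name pipeline_example.py and the target "full" are made up; the action is the first positional argument, targets follow):

python pipeline_example.py show full       # print the tasks that would be run
python pipeline_example.py make full       # run the pipeline up to target "full"
python pipeline_example.py make full --local   # run without the cluster
python pipeline_example.py touch full      # update timestamps without running jobs
python pipeline_example.py config          # write skeleton configuration files
python pipeline_example.py dump            # print the parameter dictionary as JSON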
Example #5
def run(statement, **kwargs):
    """run a command line statement.

    This function runs a single or multiple statements either locally
    or on the cluster using drmaa. How a statement is executed or how
    it is modified depends on the context.

    The context is provided by keyword arguments provided as named
    function arguments ('kwargs') but also from defaults (see
    below). The following keyword arguments are recognized:

    job_memory
        Memory to use for the job, per thread. The memory specification
        should be in a format accepted by the job scheduler. Note that
        memory is per thread: if a job uses 6 threads and 6Gb in total,
        set job_memory to 1G.
    job_total_memory
        total memory to use for a job. This will be divided by the number of
        threads.
    job_threads
        number of threads to request for the job.
    job_options
        options to the job scheduler.
    job_condaenv
        conda environment to use for the job.
    job_array
        If set, run the statement as an array job. job_array should be a
        tuple of (start, end, increment).

    Any additional variables will be used to interpolate the command
    line string using python's '%' string interpolation operator.

    The context is built hierarchically, with successive operations
    overwriting previous values.

    1. Global variables
       The context is initialized
       with system-wide defaults stored in the global PARAMS
       singleton.
    2. Context of caller
       The context of the calling function is examined
       and any local variables defined in this context are added.
    3. kwargs
       Any options given explicitly as arguments to the run() method
       are added.
    4. params
       If the context of the calling function contains a params
       variable, its contents are added to the context. This permits
       setting variables in configuration files in TaskLibrary
       functions.

    By default, a job is sent to the cluster, unless:

        * ``to_cluster`` is present and set to None.

        * ``without_cluster`` is True.

        * ``--local`` has been specified on the command line
          and the option ``without_cluster`` has been set as
          a result.

        * no libdrmaa is present

        * the global session is not initialized (GLOBAL_SESSION is
          None)

    Troubleshooting:

       1. DRMAA creates sessions and there is a limited number
          of sessions available. If there are too many sessions, or
          sessions become unavailable after failed jobs, use ``qconf -secl``
          to list sessions and ``qconf -kec #`` to delete sessions.

       2. Memory: 1G of free memory can be requested using the job_memory
          variable: ``job_memory = "1G"``.
          If there are error messages like "no available queue", the
          problem could be that a particular complex attribute has not
          been defined (the code should be ``hc`` for ``host:complex``
          and not ``hl`` for ``host:local``). Note that qrsh/qsub used
          directly still works.

    The job will be executed within PARAMS["work_dir"], unless
    PARAMS["work_dir"] is not local. In that case, the job will
    be executed in a shared temporary directory.

    Arguments
    ---------
    statement : string or list of strings
        A command line statement or a list of command line statements
        to be executed.
    kwargs : dictionary
        Context for job. The context is used to interpolate the command
        line statement.

    """
    logger = get_logger()

    # combine options using priority
    options = dict(list(get_params().items()))
    caller_options = get_caller_locals()
    options.update(list(caller_options.items()))

    if "self" in options:
        del options["self"]
    options.update(list(kwargs.items()))

    # inject params named tuple from TaskLibrary functions into option
    # dict. This allows overriding options set in the code with options set
    # in a .yml file
    if "params" in options:
        try:
            options.update(options["params"]._asdict())
        except AttributeError:
            pass

    # insert parameters supplied through simplified interface such
    # as job_memory, job_options, job_queue
    options['cluster']['options'] = options.get('job_options',
                                                options['cluster']['options'])
    options['cluster']['queue'] = options.get('job_queue',
                                              options['cluster']['queue'])
    options['without_cluster'] = options.get('without_cluster')

    # SGE compatible job_name
    name_substrate = str(options.get("outfile", "cgatcore"))
    if os.path.basename(name_substrate).startswith("result"):
        name_substrate = os.path.basename(os.path.dirname(name_substrate))
    else:
        name_substrate = os.path.basename(name_substrate)

    options["job_name"] = re.sub("[:]", "_", name_substrate)
    try:
        calling_module = get_caller().__name__
    except AttributeError:
        calling_module = "unknown"

    options["task_name"] = calling_module + "." + get_calling_function()

    # build statements using parameter interpolation
    if isinstance(statement, list):
        statement_list = []
        for stmt in statement:
            statement_list.append(interpolate_statement(stmt, options))
    else:
        statement_list = [interpolate_statement(statement, options)]

    if len(statement_list) == 0:
        logger.warn("no statements found - no execution")
        return []

    if options.get("dryrun", False):
        for statement in statement_list:
            logger.info("dry-run: {}".format(statement))
        return []

    # execute statement list
    runner = make_runner(**options)
    with runner as r:
        benchmark_data = r.run(statement_list)

    # log benchmark_data
    for data in benchmark_data:
        logger.info(json.dumps(data))

    BenchmarkData = collections.namedtuple('BenchmarkData',
                                           sorted(benchmark_data[0]))
    return [BenchmarkData(**d) for d in benchmark_data]
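
As a usage illustration of the context building described in the docstring, a minimal sketch of a ruffus task calling run(); the task, file names and resource settings are made up for the example:

from ruffus import transform, suffix

@transform("*.fastq.gz", suffix(".fastq.gz"), ".counts")
def count_reads(infile, outfile):
    # caller locals such as infile/outfile and the job_* settings below
    # are picked up by run() and used for '%' interpolation and job setup
    job_memory = "4G"    # per-thread memory, in a format the scheduler accepts
    job_threads = 2
    statement = "zcat %(infile)s | wc -l > %(outfile)s"
    run(statement)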