def main(argv=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    argv : list
        List of command line arguments.
    """
    if argv is None:
        argv = sys.argv

    if GLOBAL_OPTIONS is None:
        options, args = initialize(caller=get_caller().__file__)
    else:
        options, args = GLOBAL_OPTIONS, GLOBAL_ARGS

    run_workflow(options, args)
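# Example (a sketch, not part of this module): a minimal pipeline script
# driven by main(). The script name, task function and file suffixes are
# illustrative only.
#
#   # pipeline_example.py (hypothetical)
#   import sys
#   from ruffus import transform, suffix
#   import CGAT.pipeline as P
#
#   @transform("*.txt", suffix(".txt"), ".wordcount")
#   def countWords(infile, outfile):
#       # P.run() interpolates %(infile)s / %(outfile)s from locals
#       statement = "wc -w < %(infile)s > %(outfile)s"
#       P.run(statement)
#
#   if __name__ == "__main__":
#       sys.exit(P.main(sys.argv))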
def get_version():
    """return a VersionData tuple with the location of the calling
    code and its git version."""
    # get script that has called P.main()
    code_location = os.path.abspath(
        os.path.dirname(get_caller(1).__file__))
    # try git for runs from repository
    stdout = E.run(
        "git rev-parse HEAD 2> /dev/null",
        cwd=code_location,
        return_stdout=True,
        on_error="ignore").strip()
    return VersionData(code_location=code_location, version=stdout)
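# Usage sketch: get_version() unpacks into a (code_location, version)
# pair, as done in initialize() below. The version string is empty when
# the code does not live in a git repository, since git's stderr is
# discarded and run errors are ignored.
#
#   code_location, version = get_version()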
def initialize(argv=None, caller=None, defaults=None, **kwargs):
    """setup the pipeline framework.

    Arguments
    ---------
    argv : list
        List of command line arguments.
    caller : string
        Path of the calling script. If not given, it is deduced from
        the call stack.
    defaults : dictionary
        Dictionary with default values to be added to global
        parameters dictionary.

    Additional keyword arguments will be passed to the
    :func:`~.parse_commandline` function to set command-line defaults.
    """
    if argv is None:
        argv = sys.argv

    # load default options from config files
    if caller:
        path = os.path.splitext(caller)[0]
    else:
        try:
            path = os.path.splitext(get_caller().__file__)[0]
        except AttributeError:
            path = "unknown"

    options, args = parse_commandline(argv, **kwargs)

    get_parameters(
        [os.path.join(path, "pipeline.yml"),
         "../pipeline.yml",
         options.config_file],
        defaults=defaults)

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS
    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args

    logger = logging.getLogger("cgatcore.pipeline")
    logger.info("started in directory: {}".format(
        get_params().get("start_dir")))

    # At this point, the PARAMS dictionary has already been built. It
    # now needs to be updated with selected command line options as
    # these should always take precedence over configuration files.
    update_params_with_commandline_options(get_params(), options)

    code_location, version = get_version()
    logger.info("code location: {}".format(code_location))
    logger.info("code version: {}".format(version))

    work_dir = get_params().get("work_dir")
    logger.info("working directory is: {}".format(work_dir))
    if not os.path.exists(work_dir):
        E.info(
            "working directory {} does not exist - creating".format(work_dir))
        os.makedirs(work_dir)
    logger.info("changing directory to {}".format(work_dir))
    os.chdir(work_dir)

    logger.info("pipeline has been initialized")

    return options, args
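# Sketch of driving the framework without P.main(): initialize() parses
# the command line and loads pipeline.yml, run_workflow() then executes
# the requested action. The defaults shown are illustrative only.
#
#   options, args = initialize(
#       sys.argv, caller=__file__,
#       defaults={"min_mapping_quality": 10})
#   run_workflow(options, args)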
def run_workflow(options, args, pipeline=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    options : object
        Container with parsed command line options.
    args : list
        List of positional command line arguments. The first argument
        selects the pipeline action, any further arguments are added
        to the pipeline targets.
    pipeline : object
        Pipeline to run. If not given, all ruffus pipelines are run.
    """
    logger = logging.getLogger("cgatcore.pipeline")

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    if options.force_run:
        if options.force_run == "all":
            forcedtorun_tasks = ruffus.pipeline_get_task_names()
        else:
            forcedtorun_tasks = options.pipeline_targets
    else:
        forcedtorun_tasks = []

    # create local scratch if it does not already exist. Note that the
    # directory itself will not be deleted, but its contents should be
    # cleaned up.
    if not os.path.exists(get_params()["tmpdir"]):
        logger.warning(
            "local temporary directory {} did not exist - created".format(
                get_params()["tmpdir"]))
        try:
            os.makedirs(get_params()["tmpdir"])
        except OSError:
            # ignore race condition - directory already exists
            pass

    logger.debug("temporary directory is {}".format(get_params()["tmpdir"]))

    # set multiprocess to a sensible setting if there is no cluster
    run_on_cluster = HAS_DRMAA is True and not options.without_cluster
    if options.multiprocess is None:
        if not run_on_cluster:
            options.multiprocess = int(
                math.ceil(multiprocessing.cpu_count() / 2.0))
        else:
            options.multiprocess = 40

    # see input_validation function in Parameters.py
    if options.input_validation:
        input_validation(get_params(), sys.argv[0])

    elif options.pipeline_action == "debug":
        # create the session proxy
        start_session()

        method_name = options.pipeline_targets[0]
        caller = get_caller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "state", "svg", "plot",
                                     "dot", "touch", "regenerate"):

        messenger = None
        try:
            with cache_os_functions():
                if options.pipeline_action == "make":

                    if (not options.without_cluster and
                            not HAS_DRMAA and
                            not get_params()['testing']):
                        E.critical(
                            "DRMAA API not found so cannot talk to "
                            "a cluster.")
                        E.critical("Please use --local to run the pipeline"
                                   " on this host: {}".format(os.uname()[1]))
                        sys.exit(-1)

                    # get tasks to be done. This essentially replicates
                    # the state information within ruffus.
                    stream = StringIO()
                    ruffus.pipeline_printout(
                        stream,
                        options.pipeline_targets,
                        verbose=5,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                    messenger = LoggingFilterProgress(stream.getvalue())
                    logger.addFilter(messenger)

                    global task

                    if options.without_cluster:
                        # use ThreadPool to avoid taking multiple CPU
                        # for the pipeline controller.
                        opts = {"multithread": options.multiprocess}
                    else:
                        # use cooperative multitasking instead of
                        # multiprocessing.
                        opts = {"multiprocess": options.multiprocess,
                                "pool_manager": "gevent"}

                    # create the session proxy
                    start_session()

                    logger.info("current directory is {}".format(
                        os.getcwd()))

                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        logger=logger,
                        verbose=options.loglevel,
                        log_exceptions=options.log_exceptions,
                        exceptions_terminate_immediately=(
                            options.exceptions_terminate_immediately),
                        checksum_level=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        one_second_per_job=False,
                        **opts)

                    close_session()

                elif options.pipeline_action == "show":
                    ruffus.pipeline_printout(
                        options.stdout,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "touch":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=True,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "regenerate":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        verbose=options.loglevel)

                elif options.pipeline_action == "svg":
                    ruffus.pipeline_printout_graph(
                        options.stdout.buffer,
                        options.pipeline_format,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "state":
                    ruffus.ruffus_return_dag(
                        options.stdout,
                        target_tasks=options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "plot":
                    outf, filename = tempfile.mkstemp()
                    ruffus.pipeline_printout_graph(
                        os.fdopen(outf, "wb"),
                        options.pipeline_format,
                        options.pipeline_targets,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)
                    execute("inkscape %s" % filename)
                    os.unlink(filename)

        except ruffus.ruffus_exceptions.RethrownJobError as ex:
            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(ex.args))
                for idx, e in enumerate(ex.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within
                        # ruffus such as a missing dependency.
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.pipeline_logfile)

                logger.error("start of all error messages")
                logger.error(ex)
                logger.error("end of all error messages")

                raise ValueError("pipeline failed with %i errors" %
                                 len(ex.args)) from ex
            else:
                raise

    elif options.pipeline_action == "dump":
        options.stdout.write((json.dumps(get_params())) + "\n")

    elif options.pipeline_action == "printconfig":
        E.info("printing out pipeline parameters: ")
        p = get_params()
        for k in sorted(get_params()):
            print(k, "=", p[k])
        print_config_files()

    elif options.pipeline_action == "config":
        # The frame level needs to be 2:
        # 0th level -> cgatflow.py
        # 1st level -> Control.py
        # 2nd level -> pipeline_xyz.py
        f = sys._getframe(2)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        write_config_files(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clone_pipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.stop(logger=get_logger())
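# Sketch: the pipeline action can also be set programmatically through
# the positional arguments, mirroring a command line such as
# "python pipeline_example.py show full" (script name hypothetical):
#
#   options, args = initialize(caller=__file__)
#   run_workflow(options, ["show", "full"])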
def run(statement, **kwargs):
    """run a command line statement.

    This function runs a single or multiple statements either locally
    or on the cluster using drmaa. How a statement is executed or how
    it is modified depends on the context.

    The context is provided by keyword arguments given as named
    function arguments ('kwargs'), but also from defaults (see
    below). The following keyword arguments are recognized:

    job_memory
        memory to use for the job per thread. Memory specification
        should be in a format that is accepted by the job scheduler.
        Note that memory is per thread. If you have 6 threads and
        the total memory is 6Gb, use 1G as job_memory.
    job_total_memory
        total memory to use for a job. This will be divided by the
        number of threads.
    job_threads
        number of threads to request for the job.
    job_options
        options to the job scheduler.
    job_condaenv
        conda environment to use for the job.
    job_array
        if set, run the statement as an array job. job_array should
        be a tuple with start, end, and increment.

    In addition, any additional variables will be used to interpolate
    the command line string using python's '%' string interpolation
    operator.

    The context is built in a hierarchical manner with successive
    operations overwriting previous values.

    1. Global variables
       The context is initialized with system-wide defaults stored
       in the global PARAMS singleton.

    2. Context of caller
       The context of the calling function is examined and any local
       variables defined in this context are added.

    3. kwargs
       Any options given explicitly as options to the run() method
       are added.

    4. params
       If the context of the calling function contains a params
       variable, its contents are added to the context. This permits
       setting variables in configuration files in TaskLibrary
       functions.

    By default, a job is sent to the cluster, unless:

    * ``to_cluster`` is present and set to None.
    * ``without_cluster`` is True.
    * ``--local`` has been specified on the command line and the
      option ``without_cluster`` has been set as a result.
    * no libdrmaa is present.
    * the global session is not initialized (GLOBAL_SESSION is None).

    Troubleshooting:

    1. DRMAA creates sessions and only a limited number of sessions
       is available. If there are too many sessions, or sessions
       become unavailable after failed jobs, use ``qconf -secl`` to
       list sessions and ``qconf -kec #`` to delete sessions.

    2. Memory: 1G of free memory can be requested using the
       job_memory variable: ``job_memory = "1G"``. If there are error
       messages like "no available queue", then the problem could be
       that a particular complex attribute has not been defined (the
       code should be ``hc`` for ``host:complex`` and not ``hl`` for
       ``host:local``). Note that qrsh/qsub directly still works.

    The job will be executed within PARAMS["work_dir"], unless
    PARAMS["work_dir"] is not local. In that case, the job will be
    executed in a shared temporary directory.

    Arguments
    ---------
    statement : string or list of strings
        A command line statement or a list of command line statements
        to be executed.
    kwargs : dictionary
        Context for the job. The context is used to interpolate the
        command line statement.

    """
    logger = get_logger()

    # combine options using priority
    options = dict(list(get_params().items()))
    caller_options = get_caller_locals()
    options.update(list(caller_options.items()))

    if "self" in options:
        del options["self"]
    options.update(list(kwargs.items()))

    # inject the params named tuple from TaskLibrary functions into
    # the option dict. This allows overriding options set in the code
    # with options set in a .yml file
    if "params" in options:
        try:
            options.update(options["params"]._asdict())
        except AttributeError:
            pass

    # insert parameters supplied through the simplified interface such
    # as job_memory, job_options, job_queue
    options['cluster']['options'] = options.get(
        'job_options', options['cluster']['options'])
    options['cluster']['queue'] = options.get(
        'job_queue', options['cluster']['queue'])
    options['without_cluster'] = options.get('without_cluster')

    # build an SGE compatible job name
    name_substrate = str(options.get("outfile", "cgatcore"))
    if os.path.basename(name_substrate).startswith("result"):
        name_substrate = os.path.basename(os.path.dirname(name_substrate))
    else:
        name_substrate = os.path.basename(name_substrate)

    options["job_name"] = re.sub("[:]", "_", name_substrate)
    try:
        calling_module = get_caller().__name__
    except AttributeError:
        calling_module = "unknown"

    options["task_name"] = calling_module + "." + get_calling_function()

    # build statements using parameter interpolation
    if isinstance(statement, list):
        statement_list = []
        for stmt in statement:
            statement_list.append(interpolate_statement(stmt, options))
    else:
        statement_list = [interpolate_statement(statement, options)]

    if len(statement_list) == 0:
        logger.warning("no statements found - no execution")
        return []

    if options.get("dryrun", False):
        for statement in statement_list:
            logger.info("dry-run: {}".format(statement))
        return []

    # execute the statement list
    runner = make_runner(**options)
    with runner as r:
        benchmark_data = r.run(statement_list)

    # log benchmark_data
    for data in benchmark_data:
        logger.info(json.dumps(data))

    BenchmarkData = collections.namedtuple(
        'BenchmarkData', sorted(benchmark_data[0]))
    return [BenchmarkData(**d) for d in benchmark_data]
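# Usage sketch inside a ruffus task: resource requests such as
# job_memory and job_threads are picked up from the caller's local
# variables, and %(...)s placeholders in the statement are interpolated
# from PARAMS, locals and kwargs. File names and the sort invocation
# are illustrative only.
#
#   @transform("*.bed", suffix(".bed"), ".sorted.bed")
#   def sortBed(infile, outfile):
#       job_memory = "2G"   # per thread, so 4G in total here
#       job_threads = 2
#       statement = ("sort -k1,1 -k2,2n --parallel=%(job_threads)s "
#                    "%(infile)s > %(outfile)s")
#       run(statement)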