def execute(statement, **kwargs):
    '''Run a shell statement on the local machine and capture its output.

    The statement is interpolated with ``%`` against a dictionary that
    is built from the module-wide ``PARAMS`` overlaid with the keyword
    arguments. If no keyword arguments are supplied, the mapping
    returned by ``getCallerLocals()`` is used in their place.

    Arguments
    ---------
    statement : string
        Command line statement to run. Tabs and line breaks are
        collapsed to blanks and a single trailing ``;`` is removed
        before the statement is handed to the shell.
    **kwargs
        Interpolation values. If the merged dictionary contains
        ``cwd``, it is used as the working directory of the child
        process; otherwise ``PARAMS["workingdir"]`` is used.

    Returns
    -------
    stdout :
        Data sent to standard output by the command, as returned by
        ``Popen.communicate()``.
    stderr :
        Data sent to standard error by the command, as returned by
        ``Popen.communicate()``.

    Raises
    ------
    OSError
        If the command finishes with a non-zero return code.
    '''

    values = kwargs or getCallerLocals()

    # later entries win: explicit values override the global PARAMS
    params = dict(PARAMS.items())
    params.update(values.items())

    # log the statement as given (before whitespace clean-up)
    E.info("running %s" % (statement % params))

    workdir = params["cwd"] if "cwd" in params else PARAMS["workingdir"]

    # squeeze runs of tabs, turn line breaks into blanks and trim
    command = re.sub("\t+", " ", statement).replace("\n", " ").strip()
    if command.endswith(";"):
        command = command[:-1]

    child = subprocess.Popen(command % params,
                             cwd=workdir,
                             shell=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)

    stdout, stderr = child.communicate()

    if child.returncode == 0:
        return stdout, stderr

    raise OSError(
        "Child was terminated by signal %i: \n"
        "The stderr was: \n%s\n%s\n" %
        (-child.returncode, stderr, command))
def run(**kwargs):
    """run a command line statement.

    The method runs a single or multiple statements on the cluster
    using drmaa. The cluster is bypassed if:

        * ``to_cluster`` is present in the options but evaluates to
          false (for example, it is set to None in the context of the
          calling function).

        * ``--local`` has been specified on the command line
          and the option ``without_cluster`` has been set as
          a result.

        * no libdrmaa is present

        * the global session is not initialized (GLOBAL_SESSION is
          None)

    To decide which statement to run, the method works by examining
    the context of the calling function for a variable called
    ``statement`` or ``statements``.

    If ``statements`` is defined, multiple job scripts are created and
    sent to the cluster. If ``statement`` is defined, a single job
    script is created and sent to the cluster. Additionally, if
    ``job_array`` is defined, the single statement will be submitted
    as an array job.

    Arguments
    ---------
    **kwargs
        Options for this call. They take precedence over the local
        variables of the calling function, which in turn take
        precedence over the module-wide ``PARAMS``. Any
        ``cli_cluster_*`` entries in ``PARAMS`` override the
        corresponding ``cluster_*`` option regardless.

    Returns
    -------
    None
        The function returns early, without running anything, if the
        option ``dryrun`` is true.

    Raises
    ------
    ValueError
        When running locally, if a statement uses process substitution
        (``<(`` or ``>(``) and ``$SHELL`` does not contain ``bash``.
    OSError
        When running locally, if a statement finishes with a non-zero
        return code and ``ignore_errors`` is not set.

    Troubleshooting:

       1. DRMAA creates sessions and there is a limited number
          of sessions available. If there are too many or sessions
          become not available after failed jobs, use ``qconf -secl``
          to list sessions and ``qconf -kec #`` to delete sessions.

       2. Memory: 1G of free memory can be requested using the job_memory
          variable: ``job_memory = "1G"``
          If there are error messages like "no available queue", then the
          problem could be that a particular complex attribute has
          not been defined (the code should be ``hc`` for ``host:complex``
          and not ``hl`` for ``host:local``). Note that qrsh/qsub directly
          still works.

    """

    # combine options using correct preference:
    # PARAMS < locals of the calling function < explicit kwargs
    options = dict(list(PARAMS.items()))
    options.update(list(getCallerLocals().items()))
    options.update(list(kwargs.items()))

    # insert legacy synonyms
    # (guarantees that the key exists - as None if it was not set - so
    # that the lookup options["without_cluster"] below cannot fail)
    options['without_cluster'] = options.get('without_cluster')

    # NOTE(review): return value is ignored; presumably this updates
    # `options` in place - confirm against getParallelEnvironment.
    getParallelEnvironment(options)

    # enforce highest priority for cluster options in command-line
    if "cli_cluster_memory_default" in PARAMS:
        options["cluster_memory_default"] = PARAMS[
            "cli_cluster_memory_default"]
    if "cli_cluster_memory_resource" in PARAMS:
        options["cluster_memory_resource"] = PARAMS[
            "cli_cluster_memory_resource"]
    if "cli_cluster_num_jobs" in PARAMS:
        options["cluster_num_jobs"] = PARAMS["cli_cluster_num_jobs"]
    if "cli_cluster_options" in PARAMS:
        options["cluster_options"] = PARAMS["cli_cluster_options"]
    if "cli_cluster_parallel_environment" in PARAMS:
        options["cluster_parallel_environment"] = PARAMS[
            "cli_cluster_parallel_environment"]
    if "cli_cluster_priority" in PARAMS:
        options["cluster_priority"] = PARAMS["cli_cluster_priority"]
    if "cli_cluster_queue" in PARAMS:
        options["cluster_queue"] = PARAMS["cli_cluster_queue"]
    if "cli_cluster_queue_manager" in PARAMS:
        options["cluster_queue_manager"] = PARAMS["cli_cluster_queue_manager"]

    # if the command-line has not been used
    # get information from the legacy job_options
    if options["cluster_options"] == "":
        options["cluster_options"] = options.get("job_options",
                                                 options["cluster_options"])

    # get the memory requirement for the job
    job_memory = getJobMemory(options, PARAMS)

    # get the queue manager
    # NOTE(review): queue_manager is not referenced again in this function.
    queue_manager = PARAMS["cluster_queue_manager"]

    # log file to which every job script appends its environment
    shellfile = os.path.join(PARAMS["workingdir"], "shell.log")

    pid = os.getpid()
    E.debug('task: pid = %i' % pid)

    # connect to global session
    session = GLOBAL_SESSION
    E.debug('task: pid %i: sge session = %s' % (pid, str(session)))

    ignore_pipe_errors = options.get('ignore_pipe_errors', False)
    ignore_errors = options.get('ignore_errors', False)

    # run on cluster if:
    # * to_cluster is not defined or set to True
    # * command line option without_cluster is set to False
    # * an SGE session is present
    run_on_cluster = ("to_cluster" not in options or
                      options.get("to_cluster")) and \
        not options["without_cluster"] and \
        GLOBAL_SESSION is not None

    # SGE compatible job_name
    # (colons in the basename of the output file are replaced by "_")
    job_name = re.sub(
        "[:]", "_",
        os.path.basename(options.get("outfile", "ruffus")))

    def _writeJobScript(statement, job_memory, job_name, shellfile):
        """write `statement` into a new bash job script.

        The script first appends diagnostic output, prefixed with
        `job_name`, to `shellfile`, then applies a ``ulimit -v``
        derived from `job_memory`, and finally contains the expanded
        statement.

        Returns the path of the script, a temporary file created in
        ``PARAMS["workingdir"]``.
        """
        # disabled - problems with quoting
        # tmpfile.write( '''echo 'statement=%s' >> %s\n''' %
        # (shellquote(statement), shellfile) )
        # module list outputs to stderr, so merge stderr and stdout

        # the template is filled from the local variables of this
        # function (job_name and shellfile)
        script = '''#!/bin/bash -e \n
                    echo "%(job_name)s : START -> ${0}" >> %(shellfile)s
                    set | sed 's/^/%(job_name)s : /' &>> %(shellfile)s
                    set +o errexit
                    module list 2>&1 | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    set -o errexit
                    hostname | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    cat /proc/meminfo | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    echo "%(job_name)s : END -> ${0}" >> %(shellfile)s
                 ''' % locals()

        # restrict virtual memory
        # Note that there are resources in SGE which could do this directly
        # such as v_hmem.
        # Note that limiting resident set sizes (RSS) with ulimit is not
        # possible in newer kernels.
        script += "ulimit -v %i\n" % IOTools.human2bytes(job_memory)

        script += expandStatement(statement,
                                  ignore_pipe_errors=ignore_pipe_errors)
        script += "\n"

        job_path = getTempFilename(dir=PARAMS["workingdir"])

        with open(job_path, "w") as script_file:
            script_file.write(script)

        return (job_path)

    if run_on_cluster:
        # run multiple jobs
        if options.get("statements"):

            # build all statements up front so that a failure while
            # building happens before anything is submitted
            statement_list = []
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))

            if options.get("dryrun", False):
                return

            # a single job template is used for all statements
            jt = setupDrmaaJobTemplate(session, options, job_name, job_memory)
            E.debug("Job spec is: %s" % jt.nativeSpecification)

            job_ids, filenames = [], []

            for statement in statement_list:
                E.info("running statement:\n%s" % statement)

                job_path = _writeJobScript(statement, job_memory,
                                           job_name, shellfile)

                jt, stdout_path, stderr_path = setDrmaaJobPaths(jt, job_path)
                job_id = session.runJob(jt)
                job_ids.append(job_id)
                filenames.append((job_path, stdout_path, stderr_path))

                E.debug("job has been submitted with job_id %s" %
                        str(job_id))

            E.debug("waiting for %i jobs to finish " % len(job_ids))
            session.synchronize(job_ids, drmaa.Session.TIMEOUT_WAIT_FOREVER,
                                False)

            # collect and clean up
            for job_id, statement, paths in zip(job_ids,
                                                statement_list,
                                                filenames):
                job_path, stdout_path, stderr_path = paths
                collectSingleJobFromCluster(session, job_id,
                                            statement,
                                            stdout_path,
                                            stderr_path,
                                            job_path,
                                            ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)

        # run single job on cluster - this can be an array job
        else:

            statement = buildStatement(**options)
            E.info("running statement:\n%s" % statement)

            if options.get("dryrun", False):
                return

            jt = setupDrmaaJobTemplate(session, options, job_name, job_memory)
            E.debug("Job spec is: %s" % jt.nativeSpecification)

            job_path = _writeJobScript(statement, job_memory,
                                       job_name, shellfile)

            jt, stdout_path, stderr_path = setDrmaaJobPaths(jt, job_path)

            if "job_array" in options and options["job_array"] is not None:
                # run an array job
                start, end, increment = options.get("job_array")
                E.debug("starting an array job: %i-%i,%i" %
                        (start, end, increment))
                # sge works with 1-based, closed intervals
                job_ids = session.runBulkJobs(jt, start + 1, end, increment)
                E.debug("%i array jobs have been submitted as job_id %s" %
                        (len(job_ids), job_ids[0]))
                # NOTE(review): retval is not used after this point.
                retval = session.synchronize(
                    job_ids, drmaa.Session.TIMEOUT_WAIT_FOREVER, True)

                # NOTE(review): stdout and stderr are read here but not
                # used afterwards in this branch.
                stdout, stderr = getStdoutStderr(stdout_path, stderr_path)

            else:
                # run a single job
                job_id = session.runJob(jt)
                E.debug("job has been submitted with job_id %s" % str(job_id))

                collectSingleJobFromCluster(session, job_id,
                                            statement,
                                            stdout_path,
                                            stderr_path,
                                            job_path,
                                            ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)
    else:
        # run job locally (cluster bypassed)
        statement_list = []
        if options.get("statements"):
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))
        else:
            statement_list.append(buildStatement(**options))

        if options.get("dryrun", False):
            return

        # statements are executed one after the other
        for statement in statement_list:
            E.info("running statement:\n%s" % statement)

            # process substitution <() and >() does not
            # work through subprocess directly. Thus,
            # the statement needs to be wrapped in
            # /bin/bash -c '...' in order for bash
            # to interpret the substitution correctly.
            if "<(" in statement or ">(" in statement:
                shell = os.environ.get('SHELL', "/bin/bash")
                if "bash" not in shell:
                    raise ValueError(
                        "require bash for advanced shell syntax: <()")
                # Note: pipes.quote is deprecated in Py3, use shlex.quote
                # (not present in Py2.7).
                statement = pipes.quote(statement)
                statement = "%s -c %s" % (shell, statement)

            process = subprocess.Popen(
                expandStatement(
                    statement,
                    ignore_pipe_errors=ignore_pipe_errors),
                cwd=PARAMS["workingdir"],
                shell=True,
                executable="/bin/bash",
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)

            # process.stdin.close()
            stdout, stderr = process.communicate()

            # any non-zero return code is reported with this message
            if process.returncode != 0 and not ignore_errors:
                raise OSError("---------------------------------------\n"
                              "Child was terminated by signal %i: \n"
                              "The stderr was: \n%s\n%s\n"
                              "-----------------------------------------" %
                              (-process.returncode, stderr, statement))
def peekParameters(workingdir,
                   pipeline,
                   on_error_raise=None,
                   prefix=None,
                   update_interface=False,
                   restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in `workingdir` with the ``dump`` command and
    reads the JSON configuration it prints into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    An empty dictionary is also returned, without peeking, if
    ``--help`` or ``-h`` is present in ``sys.argv``.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in. An empty string selects the current directory.
    pipeline : string
       Name of the pipeline script. The script is looked up in
       ``PARAMS["pipelinedir"]``; if that is empty the current
       directory is used, and if it does not exist the directory of
       the calling script is used.
    on_error_raise : Bool
       If set to a boolean, an error will be raised (or not) if
       `pipeline` or `workingdir` can not be found. If
       `on_error_raise` is None, errors are raised only if
       ``isTest()`` is false and the caller's ``__name__`` is
       ``"__main__"``.
    prefix : string
       Add a prefix to all parameters. This is useful if the
       parameters are added to the configuration dictionary of the
       calling pipeline.
    update_interface : bool
       If True, this method will prefix the value of every parameter
       whose name starts with ``interface`` with `workingdir`. This
       allows transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only parameters whose name starts with ``interface``
       will be returned.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    Raises
    ------
    ValueError
        If `pipeline` or `workingdir` can not be found and
        `on_error_raise` is true, or if the pipeline printed no
        configuration or more than one configuration.
    OSError
        If the pipeline process finishes with a non-zero return code.

    '''
    caller_locals = getCallerLocals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not isTest() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    # Attempt to locate directory with pipeline source code. This is a
    # patch as pipelines might be called within the repository
    # directory or from an installed location
    dirname = PARAMS["pipelinedir"]

    if dirname == "":
        # called without a directory, use current directory
        dirname = os.path.abspath(".")
    elif not os.path.exists(dirname):
        # if not exists, assume we want version located
        # in directory of calling script.
        dirname = os.path.dirname(caller_locals['__file__'])

    pipeline = os.path.join(dirname, pipeline)
    if not os.path.exists(pipeline):
        if on_error_raise:
            raise ValueError(
                "can't find pipeline at %s" % (pipeline))
        return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified
    # working dir is set to "?!"
    # NOTE(review): `and` binds tighter than `or`, so the "?!" test only
    # qualifies the "clone" target while "config" and "check" always
    # redirect the working directory. The parentheses spell out that
    # existing behaviour - confirm that it is the intended one.
    if "config" in sys.argv or "check" in sys.argv or \
            ("clone" in sys.argv and workingdir == "?!"):
        workingdir = os.path.join(PARAMS.get("pipelinedir"),
                                  IOTools.snip(pipeline, ".py"))

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError(
                "can't find working dir %s" % workingdir)
        return {}

    statement = "python %s -f -v 0 dump" % pipeline
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") %
            (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    # Only lines starting with "{" are kept; this removes any log
    # messages.
    configurations = [line
                      for line in stdout.decode("utf-8").splitlines()
                      if line.startswith("{")]

    # fail with a clear message instead of an IndexError below
    if not configurations:
        raise ValueError(
            "received no configuration from statement: %s" % statement)
    if len(configurations) > 1:
        raise ValueError("received multiple configurations")
    dump = json.loads(configurations[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                dump[key] = os.path.join(workingdir, value)

    # keep only interface if so required
    if restrict_interface:
        dump = {key: value
                for key, value in dump.items()
                if key.startswith("interface")}

    # prefix all parameters
    if prefix is not None:
        dump = {"%s%s" % (prefix, key): value
                for key, value in dump.items()}

    return dump
def getParameters(filenames=None,
                  defaults=None,
                  site_ini=True,
                  user_ini=True,
                  default_ini=True,
                  only_import=None):
    '''read a config file and return as a dictionary.

    Sections and keys are combined with an underscore. If a key
    without section does not exist, it will be added plain.

    For example::

       [general]
       input=input1.file

       [special]
       input=input2.file

    will be entered as { 'general_input' : "input1.file",
    'input' : "input1.file", 'special_input' : "input2.file" }

    This function also updates the module-wide parameter map.

    The section [DEFAULT] is equivalent to [general].

    Values are applied in the following order, later ones overwriting
    earlier ones:

    1. hard-coded defaults (``HARDCODED_PARAMS``)
    2. the dictionary given as `defaults`
    3. the configuration files, which are read in this order:

       a. the default :file:`pipeline.ini` of the code installation
          (if `default_ini` is set)
       b. :file:`.cgat` in the user's home directory
          (if `user_ini` is set and the file exists)
       c. :file:`/etc/cgat/pipeline.ini`
          (if `site_ini` is set and the file exists)
       d. the files given in `filenames`, in the order given

    If the same configuration value appears in multiple files, later
    configuration files will overwrite the settings from earlier
    files.

    Path names are expanded to the absolute pathname to avoid
    ambiguity with relative path names. Path names are updated
    for parameters that end in the suffix "dir" and whose value is a
    string starting with a "." such as "." or "../data".

    Arguments
    ---------
    filenames : list
       List of filenames of the configuration files to read. Files
       that do not exist are ignored. A single filename given as a
       string is accepted as well. The argument itself is never
       modified. Defaults to ``["pipeline.ini"]``.
    defaults : dict
       Dictionary with default values. These will overwrite
       any hard-coded parameters, but will be overwritten by user
       specified parameters in the configuration files.
    site_ini : bool
       If set, the configuration file
       :file:`/etc/cgat/pipeline.ini` will be read if it exists.
    user_ini : bool
       If set, configuration files will also be read from a
       file called :file:`.cgat` in the user's
       home directory.
    default_ini : bool
       If set, the default initialization file will be read from
       the directory ``configuration`` below
       ``CGATPIPELINES_PIPELINE_DIR``.
    only_import : bool
       If true, ``TriggeredDefaultFactory.with_default`` is switched
       on. This is useful for pipelines that are imported (for
       example for documentation generation) but not executed as
       there might not be an appropriate .ini file available.
       If `only_import` is None, it is true if ``isTest()`` is true
       or if the caller's ``__name__`` is not ``"__main__"``.

    Returns
    -------
    config : dict
       Dictionary with configuration values. This is the module-wide
       ``PARAMS`` object.

    Raises
    ------
    TypeError
       If one of the parameters listed in ``INTERPOLATE_PARAMS``
       can not be interpolated.
    '''

    global CONFIG
    global PARAMS
    old_id = id(PARAMS)

    caller_locals = getCallerLocals()

    # check if this is only for import
    if only_import is None:
        only_import = isTest() or \
            "__name__" not in caller_locals or \
            caller_locals["__name__"] != "__main__"

    # important: only update the PARAMS variable as
    # it is referenced in other modules. Thus the type
    # needs to be fixed at import. Raise error where this
    # is not the case.
    # Note: Parameter sharing in the Pipeline module needs
    # to be reorganized.
    if only_import:
        # turn on default dictionary
        TriggeredDefaultFactory.with_default = True

    # IMS: Several legacy scripts call this with a string as input
    # rather than a list. This has to be corrected before the list
    # operations below, which would otherwise iterate over the
    # characters of the string and then fail.
    if filenames is None:
        filenames = ["pipeline.ini"]
    elif isinstance(filenames, str):
        filenames = [filenames]

    # Clear up ini files on the list that do not exist. A new list is
    # built so that neither the caller's list nor a shared default is
    # modified by the insertions below.
    filenames = [fn for fn in filenames if os.path.exists(fn)]

    if site_ini:
        # read configuration from /etc/cgat/pipeline.ini
        fn = "/etc/cgat/pipeline.ini"
        if os.path.exists(fn):
            filenames.insert(0, fn)

    if user_ini:
        # read configuration from a users home directory
        fn = os.path.join(os.path.expanduser("~"),
                          ".cgat")
        if os.path.exists(fn):
            filenames.insert(0, fn)

    if default_ini:
        # The link between CGATPipelines and Pipeline.py
        # needs to severed at one point.
        # 1. config files into CGAT module directory?
        # 2. Pipeline.py into CGATPipelines module directory?
        filenames.insert(0,
                         os.path.join(CGATPIPELINES_PIPELINE_DIR,
                                      'configuration',
                                      'pipeline.ini'))

    PARAMS['pipeline_ini'] = filenames

    try:
        CONFIG.read(filenames)
        p = configToDictionary(CONFIG)
    except configparser.InterpolationSyntaxError:
        # Do not log, as called before logging module is initialized -
        # this will mess up logging configuration in Control.py and
        # Experiment.py.
        # The error is likely due to the use of '%' in a value; fall
        # back to a parser that does not interpolate.
        CONFIG = configparser.RawConfigParser()
        CONFIG.read(filenames)
        p = configToDictionary(CONFIG)

    # update with hard-coded PARAMS
    PARAMS.update(HARDCODED_PARAMS)

    if defaults:
        PARAMS.update(defaults)

    PARAMS.update(p)

    # interpolate some params with other parameters
    for param in INTERPOLATE_PARAMS:
        try:
            PARAMS[param] = PARAMS[param] % PARAMS
        except TypeError as msg:
            raise TypeError('could not interpolate %s: %s' %
                            (PARAMS[param], msg))

    # expand pathnames
    # (non-string values, for example numbers or None, are left alone)
    for param, value in list(PARAMS.items()):
        if param.endswith("dir") and \
                isinstance(value, str) and \
                value.startswith("."):
            PARAMS[param] = os.path.abspath(value)

    # make sure that the dictionary reference has not changed
    assert id(PARAMS) == old_id
    return PARAMS