def execute(statement, **kwargs):
    '''execute a statement locally.

    This method implements the same parameter interpolation
    as the function :func:`run`.

    Arguments
    ---------
    statement : string
        Command line statement to run.

    Returns
    -------
    stdout : string
        Data sent to standard output by command
    stderr : string
        Data sent to standard error by command
    '''
    if not kwargs:
        kwargs = get_caller_locals()

    kwargs = dict(list(get_params().items()) + list(kwargs.items()))

    logger = get_logger()
    logger.info("running %s" % (statement % kwargs))

    if "cwd" not in kwargs:
        cwd = get_params()["work_dir"]
    else:
        cwd = kwargs["cwd"]

    # cleaning up of statement
    # remove new lines and superfluous spaces and tabs
    statement = " ".join(re.sub("\t+", " ", statement).split("\n")).strip()
    if statement.endswith(";"):
        statement = statement[:-1]

    # always use bash
    os.environ.update(
        {'BASH_ENV': os.path.join(os.environ['HOME'], '.bashrc')})
    process = subprocess.Popen(statement % kwargs,
                               cwd=cwd,
                               shell=True,
                               stdin=sys.stdin,
                               stdout=sys.stdout,
                               stderr=sys.stderr,
                               env=os.environ.copy(),
                               executable="/bin/bash")

    # process.stdin.close()
    stdout, stderr = process.communicate()

    if process.returncode != 0:
        raise OSError(
            "Child was terminated by signal %i: \n"
            "The stderr was: \n%s\n%s\n" %
            (-process.returncode, stderr, statement))

    return stdout, stderr
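# Usage sketch (illustrative only, not part of the library API): run a short
# shell command locally with parameter interpolation. The "sample" key is a
# hypothetical example value; anything not supplied here would be resolved
# from the global configuration dictionary.
def _example_execute():
    return execute("echo processing %(sample)s", sample="sample1")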
def __init__(self, **kwargs):

    self.logger = get_logger()

    self.queue_manager = None
    self.run_on_cluster = will_run_on_cluster(kwargs)
    self.job_threads = kwargs.get("job_threads", 1)

    if "job_memory" in kwargs and "job_total_memory" in kwargs:
        raise ValueError(
            "both job_memory and job_total_memory have been given")

    self.job_total_memory = kwargs.get('job_total_memory', None)
    self.job_memory = kwargs.get('job_memory', None)

    if self.job_total_memory == "unlimited" or self.job_memory == "unlimited":
        self.job_total_memory = self.job_memory = "unlimited"
    else:
        if self.job_total_memory:
            self.job_memory = iotools.bytes2human(
                iotools.human2bytes(self.job_total_memory) / self.job_threads)
        elif self.job_memory:
            self.job_total_memory = self.job_memory * self.job_threads
        else:
            self.job_memory = get_params()["cluster"].get(
                "memory_default", "4G")
            if self.job_memory == "unlimited":
                self.job_total_memory = "unlimited"
            else:
                self.job_total_memory = self.job_memory * self.job_threads

    self.ignore_pipe_errors = kwargs.get('ignore_pipe_errors', False)
    self.ignore_errors = kwargs.get('ignore_errors', False)

    self.job_name = kwargs.get("job_name", "unknown_job_name")
    self.task_name = kwargs.get("task_name", "unknown_task_name")

    # deduce output directory/directories, requires somewhat
    # consistent naming in the calling function.
    outfiles = []
    if "outfile" in kwargs:
        outfiles.append(kwargs["outfile"])
    if "outfiles" in kwargs:
        outfiles.extend(kwargs["outfiles"])

    self.output_directories = set(
        sorted([os.path.dirname(x) for x in outfiles]))

    self.options = kwargs

    self.work_dir = get_params()["work_dir"]

    self.shellfile = kwargs.get("shell_logfile", None)
    if self.shellfile:
        if not self.shellfile.startswith(os.sep):
            self.shellfile = os.path.join(self.work_dir,
                                          os.path.basename(self.shellfile))
def file_is_mounted(filename):
    """return True if filename is mounted.

    A file is considered mounted if its absolute path is located
    underneath the configured ``mount_point``.
    """
    if get_params()["mount_point"]:
        return os.path.abspath(filename).startswith(
            get_params()["mount_point"])
    else:
        return False
def get_temp_file(dir=None, shared=False, suffix="", mode="w+",
                  encoding="utf-8"):
    '''get a temporary file.

    The file is created and the caller needs to close and delete the
    temporary file once it is not used any more. By default, the file
    is opened as a text file (mode ``w+``) with encoding ``utf-8``
    instead of the default mode ``w+b`` used in
    :class:`tempfile.NamedTemporaryFile`.

    If dir does not exist, it will be created.

    Arguments
    ---------
    dir : string
        Directory of the temporary file. If not given, it is set to the
        default temporary location in the global configuration
        dictionary.
    shared : bool
        If set, the temporary file will be in a shared temporary
        location (given by the global configuration dictionary).
    suffix : string
        Filename suffix

    Returns
    -------
    file : File
        A file object of the temporary file.
    '''
    if dir is None:
        if shared:
            dir = get_params()['shared_tmpdir']
        else:
            dir = get_params()['tmpdir']

    if not os.path.exists(dir):
        try:
            os.makedirs(dir)
        except OSError:
            # avoid race condition when several processes try to create
            # the temporary directory.
            pass
        if not os.path.exists(dir):
            raise OSError(
                "temporary directory {} could not be created".format(dir))

    return tempfile.NamedTemporaryFile(dir=dir, delete=False, prefix="ctmp",
                                       mode=mode, encoding=encoding,
                                       suffix=suffix)
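# Usage sketch (illustrative only): as described in the docstring, the caller
# is responsible for closing and removing the temporary file.
def _example_get_temp_file():
    tmpfile = get_temp_file(suffix=".tsv")
    try:
        tmpfile.write("gene_id\tcount\n")
    finally:
        tmpfile.close()
        os.unlink(tmpfile.name)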
def interpolate_statement(statement, kwargs):
    '''interpolate command line statement with parameters.

    The skeleton of the command is given in `statement`. The method
    applies string interpolation using a dictionary built from the
    global configuration dictionary PARAMS, augmented by `kwargs`.
    The latter takes precedence.

    Arguments
    ---------
    statement: string
        Command line statement to be interpolated.
    kwargs : dict
        Keyword arguments that are used for parameter interpolation.

    Returns
    -------
    statement : string
        The command line statement with interpolated parameters.

    Raises
    ------
    KeyError
        If ``statement`` contains unresolved references.
    '''
    local_params = substitute_parameters(**kwargs)

    # build the statement
    try:
        statement = statement % local_params
    except KeyError as msg:
        raise KeyError(
            "Error when creating command: could not "
            "find %s in dictionaries" % msg)
    except ValueError as msg:
        raise ValueError(
            "Error when creating command: %s, statement = %s" % (
                msg, statement))

    # cleaning up of statement
    # remove new lines and superfluous spaces and tabs
    statement = " ".join(re.sub("\t+", " ", statement).split("\n")).strip()
    if statement.endswith(";"):
        statement = statement[:-1]

    # mark arvados mount points in statement
    if get_params().get("mount_point", None):
        statement = re.sub(get_params()["mount_point"], "arv=", statement)

    return statement
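# Usage sketch (illustrative only): the "infile" and "outfile" keys below are
# hypothetical task-local values; any other %()s placeholders would be
# resolved from the global PARAMS dictionary.
def _example_interpolate_statement():
    return interpolate_statement(
        "sort -S 1G %(infile)s > %(outfile)s",
        dict(infile="data.tsv", outfile="data.sorted.tsv"))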
def get_database_name():
    '''Return the database name associated with the pipeline.

    This method looks in different sections in the ini file to permit
    both old style ``database`` and new style ``database_name``. It has
    been implemented for backwards compatibility.

    Returns
    -------
    databasename : string
        database name

    Raises
    ------
    KeyError
        If no database name is found
    '''
    locations = ["database_name", "database"]
    params = get_params()
    for location in locations:
        database = params.get(location, None)
        if database is not None:
            return database

    raise KeyError("database name not found")
def setup_logging(options, pipeline=None):
    logger = logging.getLogger("cgatcore.pipeline")

    if options.log_config_filename is None:
        # set up default file logger
        handler = logging.FileHandler(
            filename=options.pipeline_logfile,
            mode="a")

        if pipeline is not None:
            pipeline_name = pipeline.name
        else:
            pipeline_name = get_params().get("pipeline_name", "main")

        handler.setFormatter(
            E.MultiLineFormatter("%(asctime)s %(levelname)s "
                                 "%(app_name)s %(module)s "
                                 "- %(message)s"))
        logger.addFilter(LoggingFilterpipelineName(name=pipeline_name))
        logger.addHandler(handler)

        logger.info("pipeline log is {}".format(options.pipeline_logfile))

    return logger
def connect():
    """connect to SQLite database used in this pipeline.

    .. note::
       This method is currently only implemented for sqlite
       databases. It needs refactoring for generic access.
       Alternatively, use a full or partial ORM.

    If ``annotations_database`` is in params, this method
    will attach the named database as ``annotations``.

    Returns
    -------
    dbh
       a database handle
    """
    # Note that in the future this might return an sqlalchemy or
    # db.py handle.
    url = get_params()["database"]["url"]
    is_sqlite3 = url.startswith("sqlite")

    if is_sqlite3:
        connect_args = {'check_same_thread': False}
    else:
        connect_args = {}

    creator = None
    if is_sqlite3 and "annotations_dir" in get_params():
        # not sure what the correct way is for url
        # sqlite:///./csvdb -> ./csvdb
        # sqlite:////path/to/csvdb -> /path/to/csvdb
        filename = os.path.abspath(url[len("sqlite:///"):])

        def creator():
            conn = sqlite3.connect(filename)
            conn.execute("ATTACH DATABASE '{}' as annotations".format(
                os.path.join(get_params()["annotations_dir"], "csvdb")))
            return conn

    engine = sqlalchemy.create_engine(
        url,
        connect_args=connect_args,
        creator=creator)
    return engine
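# Usage sketch (illustrative only): obtain an engine from connect() and run a
# trivial query through a short-lived connection.
def _example_connect():
    dbh = connect()
    with dbh.connect() as conn:
        result = conn.execute(sqlalchemy.text("SELECT 1")).fetchall()
    return result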
def get_temp_dir(dir=None, shared=False, clear=False):
    '''get a temporary directory.

    The directory is created and the caller needs to delete the
    temporary directory once it is not used any more.

    If dir does not exist, it will be created.

    Arguments
    ---------
    dir : string
        Directory of the temporary directory. If not given, it is set
        to the default temporary location in the global configuration
        dictionary.
    shared : bool
        If set, the temporary directory will be in a shared temporary
        location.
    clear : bool
        If set, the directory is removed again immediately, so that
        only its unique pathname is returned.

    Returns
    -------
    filename : string
        Absolute pathname of the temporary directory.
    '''
    if dir is None:
        if shared:
            dir = get_params()['shared_tmpdir']
        else:
            dir = get_params()['tmpdir']

    if not os.path.exists(dir):
        os.makedirs(dir)

    tmpdir = tempfile.mkdtemp(dir=dir, prefix="ctmp")
    if clear:
        os.rmdir(tmpdir)
    return tmpdir
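# Usage sketch (illustrative only): create a scratch directory, write an
# intermediate file into it, then remove it again, since cleanup is the
# caller's responsibility.
def _example_get_temp_dir():
    tmpdir = get_temp_dir()
    scratch = os.path.join(tmpdir, "intermediate.txt")
    try:
        with open(scratch, "w") as outf:
            outf.write("done\n")
    finally:
        if os.path.exists(scratch):
            os.unlink(scratch)
        os.rmdir(tmpdir)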
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.
    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks',
              'st_ctime', 'st_dev', 'st_gid', 'st_ino',
              'st_mode', 'st_mtime', 'st_nlink',
              'st_rdev', 'st_size', 'st_uid')

    dry_run = get_params().get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = iotools.open_file(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = iotools.open_file(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = iotools.zap_file(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write("%s\t%s\t%s\t%s\n" % (
                    fn,
                    time.asctime(time.localtime(time.time())),
                    linkdest,
                    "\t".join([str(getattr(stat, x)) for x in fields])))

    get_logger().info("zapped: %s" % (c))

    # the logfile is only opened when this is not a dry-run
    if not dry_run:
        outfile.close()

    return c
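# Usage sketch (illustrative only): zap intermediate BAM files once the final
# results exist. The glob pattern and logfile name are hypothetical.
def _example_clean():
    import glob
    return clean(glob.glob("mapping.dir/*.bam"), "clean_mapping.log")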
def build_load_statement(tablename, retry=True, options=""): """build a command line statement to upload data. Upload is performed via the :doc:`csv2db` script. The returned statement is suitable to use in pipe expression. This method is aware of the configuration values for database access and the chosen database backend. For example:: load_statement = P.build_load_statement("data") statement = "cat data.txt | %(load_statement)s" P.run(statement) Arguments --------- tablename : string Tablename for upload retry : bool Add the ``--retry`` option to `csv2db.py` options : string Command line options to be passed on to `csv2db.py` Returns ------- string """ opts = [] if retry: opts.append(" --retry ") params = get_params() opts.append("--database-url={}".format(params["database"]["url"])) db_options = " ".join(opts) load_statement = ( "python -m cgatcore.csv2db {db_options} {options} --table={tablename}". format(**locals())) return load_statement
def print_config_files():
    '''
    Print the list of .yml files used to configure the pipeline
    along with their associated priorities.
    Priority 1 is the highest.
    '''
    filenames = get_params()['pipeline_yml']
    print("\n List of .yml files used to configure the pipeline")
    s = len(filenames)
    if s == 0:
        print(" No yml files passed!")
    elif s >= 1:
        print(" %-11s: %s " % ("Priority", "File"))
        for f in filenames:
            if s == 1:
                print(" (highest) %s: %s\n" % (s, f))
            else:
                print(" %-11s: %s " % (s, f))
            s -= 1
def get_mounted_location(filename):
    """return location of filename within mounted directory
    """
    return os.path.abspath(filename)[len(get_params()["mount_point"]):]
def peek_parameters(workingdir,
                    pipeline,
                    on_error_raise=None,
                    prefix=None,
                    update_interface=False,
                    restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in workingdir, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
        Working directory. This is the directory that the pipeline
        was executed in.
    pipeline : string
        Name of the pipeline script. The pipeline is assumed to live
        in the same directory as the current pipeline.
    on_error_raise : Bool
        If set to a boolean, an error will be raised (or not) if there
        is an error during parameter peeking, for example if
        `workingdir` can not be found. If `on_error_raise` is None, it
        will be set to the default, which is to raise an exception
        unless the calling script is imported or the option
        ``--is-test`` has been passed at the command line.
    prefix : string
        Add a prefix to all parameters. This is useful if the
        parameters are added to the configuration dictionary of the
        calling pipeline.
    update_interface : bool
        If True, this method will prefix any options in the
        ``[interface]`` section with `workingdir`. This allows
        transparent access to files in the external pipeline.
    restrict_interface : bool
        If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.
    '''
    caller_locals = get_caller_locals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not is_test() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified
    # working dir is set to "?!"
    if (("config" in sys.argv or "check" in sys.argv or "clone" in sys.argv)
            and workingdir == "?!"):
        workingdir = os.path.join(get_params()["pipelinedir"],
                                  "pipeline_" + pipeline)

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError("can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "cgatflow {} -v 0 dump".format(pipeline)

    os.environ.update(
        {'BASH_ENV': os.path.join(os.environ['HOME'], '.bashrc')})
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               env=os.environ.copy())

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") %
            (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()
    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]
    if len(stdout) > 1:
        raise ValueError("received multiple configurations")
    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                if isinstance(value, str):
                    dump[key] = os.path.join(workingdir, value)
                elif isinstance(value, collections.abc.Mapping):
                    for kkey, vvalue in list(value.items()):
                        value[kkey] = os.path.join(workingdir, vvalue)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y)
                     for x, y in list(dump.items())])

    return dump
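# Usage sketch (illustrative only): import the interface parameters of a
# hypothetical upstream pipeline into the calling pipeline's configuration.
# The directory, pipeline name and prefix are made-up example values.
def _example_peek_parameters():
    return peek_parameters(
        "/path/to/annotations_dir", "genesets",
        prefix="annotations_",
        update_interface=True,
        restrict_interface=True)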
def run_workflow(options, args, pipeline=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    pipeline: object
        pipeline to run. If not given, all ruffus pipelines are run.
    """
    logger = logging.getLogger("cgatcore.pipeline")

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    if options.force_run:
        if options.force_run == "all":
            forcedtorun_tasks = ruffus.pipeline_get_task_names()
        else:
            forcedtorun_tasks = options.pipeline_targets
    else:
        forcedtorun_tasks = []

    # create local scratch if it does not already exist. Note that the
    # directory itself will not be deleted while its contents should be
    # cleaned up.
    if not os.path.exists(get_params()["tmpdir"]):
        logger.warn(
            "local temporary directory {} did not exist - created".format(
                get_params()["tmpdir"]))
        try:
            os.makedirs(get_params()["tmpdir"])
        except OSError:
            # file exists
            pass

    logger.debug("temporary directory is {}".format(get_params()["tmpdir"]))

    # set multiprocess to a sensible setting if there is no cluster
    run_on_cluster = HAS_DRMAA is True and not options.without_cluster
    if options.multiprocess is None:
        if not run_on_cluster:
            options.multiprocess = int(
                math.ceil(multiprocessing.cpu_count() / 2.0))
        else:
            options.multiprocess = 40

    # see inputValidation function in Parameters.py
    if options.input_validation:
        input_validation(get_params(), sys.argv[0])

    elif options.pipeline_action == "debug":
        # create the session proxy
        start_session()

        method_name = options.pipeline_targets[0]
        caller = get_caller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "state", "svg", "plot",
                                     "dot", "touch", "regenerate"):

        messenger = None
        try:
            with cache_os_functions():
                if options.pipeline_action == "make":

                    if not options.without_cluster and not HAS_DRMAA and \
                            not get_params()['testing']:
                        E.critical(
                            "DRMAA API not found so cannot talk to a "
                            "cluster.")
                        E.critical("Please use --local to run the pipeline"
                                   " on this host: {}".format(os.uname()[1]))
                        sys.exit(-1)

                    # get tasks to be done. This essentially replicates
                    # the state information within ruffus.
                    stream = StringIO()
                    ruffus.pipeline_printout(
                        stream,
                        options.pipeline_targets,
                        verbose=5,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                    messenger = LoggingFilterProgress(stream.getvalue())
                    logger.addFilter(messenger)

                    global task

                    if options.without_cluster:
                        # use ThreadPool to avoid taking multiple CPU for
                        # the pipeline controller.
                        opts = {"multithread": options.multiprocess}
                    else:
                        # use cooperative multitasking instead of
                        # multiprocessing.
                        opts = {"multiprocess": options.multiprocess,
                                "pool_manager": "gevent"}

                    # create the session proxy
                    start_session()

                    logger.info("current directory is {}".format(
                        os.getcwd()))

                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        logger=logger,
                        verbose=options.loglevel,
                        log_exceptions=options.log_exceptions,
                        exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                        checksum_level=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        one_second_per_job=False,
                        **opts)

                    close_session()

                elif options.pipeline_action == "show":
                    ruffus.pipeline_printout(
                        options.stdout,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "touch":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=True,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "regenerate":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        verbose=options.loglevel)

                elif options.pipeline_action == "svg":
                    ruffus.pipeline_printout_graph(
                        options.stdout.buffer,
                        options.pipeline_format,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "state":
                    ruffus.ruffus_return_dag(
                        options.stdout,
                        target_tasks=options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "plot":
                    outf, filename = tempfile.mkstemp()
                    ruffus.pipeline_printout_graph(
                        os.fdopen(outf, "wb"),
                        options.pipeline_format,
                        options.pipeline_targets,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)
                    execute("inkscape %s" % filename)
                    os.unlink(filename)

        except ruffus.ruffus_exceptions.RethrownJobError as ex:
            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(ex.args))
                for idx, e in enumerate(ex.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency.
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.pipeline_logfile)

                logger.error("start of all error messages")
                logger.error(ex)
                logger.error("end of all error messages")

                raise ValueError("pipeline failed with %i errors" %
                                 len(ex.args)) from ex
            else:
                raise

    elif options.pipeline_action == "dump":
        options.stdout.write((json.dumps(get_params())) + "\n")

    elif options.pipeline_action == "printconfig":
        E.info("printing out pipeline parameters: ")
        p = get_params()
        for k in sorted(get_params()):
            print(k, "=", p[k])
        print_config_files()

    elif options.pipeline_action == "config":
        # Level needs to be 2:
        # 0th level -> cgatflow.py
        # 1st level -> Control.py
        # 2nd level -> pipeline_xyz.py
        f = sys._getframe(2)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        write_config_files(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clone_pipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.stop(logger=get_logger())
def initialize(argv=None, caller=None, defaults=None, **kwargs):
    """setup the pipeline framework.

    Arguments
    ---------
    argv : list
        List of command line arguments. If not given, ``sys.argv`` is
        used.
    caller : string
        Path of the calling script. Used to locate the default
        ``pipeline.yml`` configuration file.
    defaults : dictionary
        Dictionary with default values to be added to global
        parameters dictionary.

    Additional keyword arguments will be passed to the
    :func:`~.parse_commandline` function to set command-line defaults.
    """
    if argv is None:
        argv = sys.argv

    # load default options from config files
    if caller:
        path = os.path.splitext(caller)[0]
    else:
        try:
            path = os.path.splitext(get_caller().__file__)[0]
        except AttributeError:
            path = "unknown"

    options, args = parse_commandline(argv, **kwargs)

    get_parameters(
        [os.path.join(path, "pipeline.yml"),
         "../pipeline.yml",
         options.config_file],
        defaults=defaults)

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS
    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    logger = logging.getLogger("cgatcore.pipeline")

    logger.info("started in directory: {}".format(
        get_params().get("start_dir")))

    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.
    update_params_with_commandline_options(get_params(), options)

    code_location, version = get_version()
    logger.info("code location: {}".format(code_location))
    logger.info("code version: {}".format(version))

    logger.info("working directory is: {}".format(
        get_params().get("work_dir")))
    work_dir = get_params().get("work_dir")
    if not os.path.exists(work_dir):
        E.info("working directory {} does not exist - creating".format(
            work_dir))
        os.makedirs(work_dir)
    logger.info("changing directory to {}".format(work_dir))
    os.chdir(work_dir)

    logger.info("pipeline has been initialized")

    return options, args
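# Minimal sketch (illustrative only) of how a pipeline script might wire the
# framework together: parse options and configuration with initialize(), then
# hand control to the ruffus workflow runner.
def _example_main(argv=None):
    options, args = initialize(argv, caller=__file__)
    return run_workflow(options, args)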
def concatenate_and_load(infiles,
                         outfile,
                         regex_filename=None,
                         header=None,
                         cat="track",
                         has_titles=True,
                         missing_value="na",
                         retry=True,
                         tablename=None,
                         options="",
                         job_memory=None):
    """concatenate multiple tab-separated files and upload into database.

    The table name is given by outfile without the ".load" suffix.

    A typical concatenate and load task in ruffus would look like this::

        @merge("*.tsv.gz", ".load")
        def loadData(infiles, outfile):
            P.concatenate_and_load(infiles, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile`.
    regex_filename : string
        If given, *regex_filename* is applied to the filename to extract
        the track name. If the pattern contains multiple groups, they
        are added as additional columns. For example, if `cat` is set to
        ``track,method`` and `regex_filename` is ``(.*)_(.*).tsv.gz``
        it will add the columns ``track`` and ``method`` to the table.
    header : string
        Comma-separated list of values for header.
    cat : string
        Column title for column containing the track name. The track
        name is derived from the filename, see `regex_filename`.
    has_titles : bool
        If True, files are expected to have column titles in their
        first row.
    missing_value : string
        String to use for missing values.
    retry : bool
        If True, multiple attempts will be made if the data can
        not be loaded at the first try, for example if a table is
        locked.
    tablename: string
        Name to use for table. If unset derive from outfile.
    options : string
        Command line options for the `csv2db.py` script.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.
    """
    if job_memory is None:
        job_memory = get_params()["cluster_memory_default"]

    if tablename is None:
        tablename = to_table(outfile)

    infiles = " ".join(infiles)

    passed_options = options
    load_options, cat_options = ["--add-index=track"], []

    if regex_filename:
        cat_options.append("--regex-filename='%s'" % regex_filename)

    if header:
        load_options.append("--header-names=%s" % header)

    if not has_titles:
        cat_options.append("--no-titles")

    cat_options = " ".join(cat_options)
    load_options = " ".join(load_options) + " " + passed_options

    load_statement = build_load_statement(tablename,
                                          options=load_options,
                                          retry=retry)

    statement = '''python -m cgatcore.tables
    --cat=%(cat)s
    --missing-value=%(missing_value)s
    %(cat_options)s
    %(infiles)s
    | %(load_statement)s
    > %(outfile)s'''

    to_cluster = False

    run(statement)
def build_job_script(self, statement):
    '''build job script from statement.

    returns (statement, path_to_script).
    '''
    tmpfilename = get_temp_filename(dir=self.work_dir, clear=True)
    tmpfilename = tmpfilename + ".sh"

    expanded_statement, cleanup_funcs = self.expand_statement(statement)

    with open(tmpfilename, "w") as tmpfile:
        # disabled: -l -O expand_aliases\n" )
        tmpfile.write("#!/bin/bash -eu\n")
        if not self.ignore_pipe_errors:
            tmpfile.write("set -o pipefail\n")

        # make executable
        os.chmod(tmpfilename, stat.S_IRWXG | stat.S_IRWXU)

        tmpfile.write("\ncd {}\n".format(self.work_dir))
        if self.output_directories is not None:
            for outdir in self.output_directories:
                if outdir:
                    tmpfile.write("\nmkdir -p {}\n".format(outdir))

        # create and set system scratch dir for temporary files
        tmpfile.write("umask 002\n")

        cluster_tmpdir = get_params()["cluster_tmpdir"]
        if self.run_on_cluster and cluster_tmpdir:
            tmpdir = cluster_tmpdir
            tmpfile.write("TMPDIR=`mktemp -d -p {}`\n".format(tmpdir))
            tmpfile.write("export TMPDIR\n")
        else:
            tmpdir = get_temp_dir(dir=get_params()["tmpdir"],
                                  clear=True)
            tmpfile.write("mkdir -p {}\n".format(tmpdir))
            tmpfile.write("export TMPDIR={}\n".format(tmpdir))

        cleanup_funcs.append(
            ("clean_temp", "{{ rm -rf {}; }}".format(tmpdir)))

        # output times whenever script exits, preserving
        # return status
        cleanup_funcs.append(
            ("info", "{ echo 'benchmark'; hostname; times; }"))

        for cleanup_func, cleanup_code in cleanup_funcs:
            tmpfile.write("\n{}() {}\n".format(cleanup_func, cleanup_code))

        tmpfile.write("\nclean_all() {{ {}; }}\n".format(
            "; ".join([x[0] for x in cleanup_funcs])))

        tmpfile.write("\ntrap clean_all EXIT\n\n")

        if self.job_memory not in ("unlimited", "etc") and \
                self.options.get("cluster_memory_ulimit", False):
            # restrict virtual memory
            # Note that there are resources in SGE which could do this
            # directly such as v_hmem.
            # Note that limiting resident set sizes (RSS) with ulimit is
            # not possible in newer kernels.
            # -v and -m accept memory in kb
            requested_memory_kb = max(
                1000,
                int(math.ceil(
                    iotools.human2bytes(self.job_memory) / 1024 *
                    self.job_threads)))
            # unsetting error exit as often not permissions
            tmpfile.write("set +e\n")
            tmpfile.write("ulimit -v {} > /dev/null \n".format(
                requested_memory_kb))
            tmpfile.write("ulimit -m {} > /dev/null \n".format(
                requested_memory_kb))
            # set as hard limit
            tmpfile.write("ulimit -H -v > /dev/null \n")
            tmpfile.write("set -e\n")

        if self.shellfile:
            # make sure path exists that we want to write to
            tmpfile.write("mkdir -p $(dirname \"{}\")\n".format(
                self.shellfile))

            # output low-level debugging information to a shell log file
            tmpfile.write('echo "%s : START -> %s" >> %s\n' %
                          (self.job_name, tmpfilename, self.shellfile))
            # disabled - problems with quoting
            # tmpfile.write( '''echo 'statement=%s' >> %s\n''' %
            #                (shellquote(statement), self.shellfile) )
            tmpfile.write("set | sed 's/^/%s : /' >> %s\n" %
                          (self.job_name, self.shellfile))
            tmpfile.write("pwd | sed 's/^/%s : /' >> %s\n" %
                          (self.job_name, self.shellfile))
            tmpfile.write("hostname | sed 's/^/%s: /' >> %s\n" %
                          (self.job_name, self.shellfile))
            # cat /proc/meminfo is Linux specific
            if get_params()['os'] == 'Linux':
                tmpfile.write("cat /proc/meminfo | sed 's/^/%s: /' >> %s\n" %
                              (self.job_name, self.shellfile))
            elif get_params()['os'] == 'Darwin':
                tmpfile.write("vm_stat | sed 's/^/%s: /' >> %s\n" %
                              (self.job_name, self.shellfile))
            tmpfile.write('echo "%s : END -> %s" >> %s\n' %
                          (self.job_name, tmpfilename, self.shellfile))
            tmpfile.write("ulimit | sed 's/^/%s: /' >> %s\n" %
                          (self.job_name, self.shellfile))

        job_path = os.path.abspath(tmpfilename)

        tmpfile.write(expanded_statement)
        tmpfile.write("\n\n")

    return statement, job_path
def submit(module,
           function,
           args=None,
           infiles=None,
           outfiles=None,
           to_cluster=True,
           logfile=None,
           job_options="",
           job_threads=1,
           job_memory=False):
    '''submit a python *function* as a job to the cluster.

    This method runs the script :file:`run_function` using the
    :func:`run` method in this module thus providing the same
    control options as for command line tools.

    Arguments
    ---------
    module : string
        Module name that contains the function. If `module` is
        not part of the PYTHONPATH, an absolute path can be given.
    function : string
        Name of function to execute
    args : list
        Arguments passed on via the ``--args`` option.
    infiles : string or list
        Filenames of input data
    outfiles : string or list
        Filenames of output data
    to_cluster : bool
        If True, submit the job to the cluster.
    logfile : filename
        Logfile to provide to the ``--log`` option
    job_options : string
        String for generic job options for the queuing system
    job_threads : int
        Number of slots (threads/cores/CPU) to use for the task
    job_memory : string
        Amount of memory to reserve for the job.
    '''
    if not job_memory:
        job_memory = get_params().get("cluster_memory_default", "2G")

    if type(infiles) in (list, tuple):
        infiles = " ".join(["--input=%s" % x for x in infiles])
    else:
        infiles = "--input=%s" % infiles

    if type(outfiles) in (list, tuple):
        outfiles = " ".join(["--output-section=%s" % x for x in outfiles])
    else:
        outfiles = "--output-section=%s" % outfiles

    if logfile:
        logfile = "--log=%s" % logfile
    else:
        logfile = ""

    if args:
        args = "--args=%s" % ",".join(args)
    else:
        args = ""

    statement = (
        "python -m cgatcore.pipeline.run_function "
        "--module=%(module)s "
        "--function=%(function)s "
        "%(logfile)s "
        "%(infiles)s "
        "%(outfiles)s "
        "%(args)s")

    run(statement)
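# Usage sketch (illustrative only): submit a hypothetical function
# "buildSummary" defined in a hypothetical module "mytasks" as a cluster job.
def _example_submit(infile="counts.tsv", outfile="summary.tsv"):
    submit("mytasks", "buildSummary",
           infiles=infile,
           outfiles=outfile,
           job_memory="4G")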
def run(statement, **kwargs):
    """run a command line statement.

    This function runs a single or multiple statements either locally
    or on the cluster using drmaa. How a statement is executed or how
    it is modified depends on the context.

    The context is provided by keyword arguments provided as named
    function arguments ('kwargs') but also from defaults (see
    below). The following keyword arguments are recognized:

    job_memory
        memory to use for the job per thread. Memory specification
        should be in a format that is accepted by the job scheduler.
        Note that memory is per thread. If you have 6 threads and the
        total memory is 6Gb, use 1G as job_memory.
    job_total_memory
        total memory to use for a job. This will be divided by the
        number of threads.
    job_threads
        number of threads to request for the job.
    job_options
        options to the job scheduler.
    job_condaenv
        conda environment to use for the job.
    job_array
        if set, run the statement as an array job. job_array should be
        a tuple with start, end, and increment.

    In addition, any additional variables will be used to interpolate
    the command line string using python's '%' string interpolation
    operator.

    The context is built in a hierarchical manner with successive
    operations overwriting previous values.

    1. Global variables
       The context is initialized with system-wide defaults stored in
       the global PARAMS singleton.
    2. Context of caller
       The context of the calling function is examined and any local
       variables defined in this context are added.
    3. kwargs
       Any options given explicitly as options to the run() method are
       added.
    4. params
       If the context of the calling function contains a params
       variable, its contents are added to the context. This permits
       setting variables in configuration files in TaskLibrary
       functions.

    By default, a job is sent to the cluster, unless:

    * ``to_cluster`` is present and set to None.
    * ``without_cluster`` is True.
    * ``--local`` has been specified on the command line and the
      option ``without_cluster`` has been set as a result.
    * no libdrmaa is present
    * the global session is not initialized (GLOBAL_SESSION is None)

    Troubleshooting:

    1. DRMAA creates sessions and there is a limited number of
       sessions available. If there are too many sessions, or sessions
       become unavailable after failed jobs, use ``qconf -secl`` to
       list sessions and ``qconf -kec #`` to delete sessions.
    2. Memory: 1G of free memory can be requested using the job_memory
       variable: ``job_memory = "1G"``. If there are error messages
       like "no available queue", then the problem could be that a
       particular complex attribute has not been defined (the code
       should be ``hc`` for ``host:complex`` and not ``hl`` for
       ``host:local``). Note that qrsh/qsub directly still works.

    The job will be executed within PARAMS["work_dir"], unless
    PARAMS["work_dir"] is not local. In that case, the job will be
    executed in a shared temporary directory.

    Arguments
    ---------
    statement : string or list of strings
        A command line statement or a list of command line statements
        to be executed.
    kwargs : dictionary
        Context for job. The context is used to interpolate the command
        line statement.
    """
    logger = get_logger()

    # combine options using priority
    options = dict(list(get_params().items()))
    caller_options = get_caller_locals()

    options.update(list(caller_options.items()))

    if "self" in options:
        del options["self"]
    options.update(list(kwargs.items()))

    # inject params named tuple from TaskLibrary functions into option
    # dict. This allows overriding options set in the code with options
    # set in a .yml file
    if "params" in options:
        try:
            options.update(options["params"]._asdict())
        except AttributeError:
            pass

    # insert parameters supplied through simplified interface such
    # as job_memory, job_options, job_queue
    options['cluster']['options'] = options.get(
        'job_options', options['cluster']['options'])
    options['cluster']['queue'] = options.get(
        'job_queue', options['cluster']['queue'])
    options['without_cluster'] = options.get('without_cluster')

    # SGE compatible job_name
    name_substrate = str(options.get("outfile", "cgatcore"))
    if os.path.basename(name_substrate).startswith("result"):
        name_substrate = os.path.basename(os.path.dirname(name_substrate))
    else:
        name_substrate = os.path.basename(name_substrate)

    options["job_name"] = re.sub("[:]", "_", name_substrate)
    try:
        calling_module = get_caller().__name__
    except AttributeError:
        calling_module = "unknown"

    options["task_name"] = calling_module + "." + get_calling_function()

    # build statements using parameter interpolation
    if isinstance(statement, list):
        statement_list = []
        for stmt in statement:
            statement_list.append(interpolate_statement(stmt, options))
    else:
        statement_list = [interpolate_statement(statement, options)]

    if len(statement_list) == 0:
        logger.warn("no statements found - no execution")
        return []

    if options.get("dryrun", False):
        for statement in statement_list:
            logger.info("dry-run: {}".format(statement))
        return []

    # execute statement list
    runner = make_runner(**options)
    with runner as r:
        benchmark_data = r.run(statement_list)

    # log benchmark_data
    for data in benchmark_data:
        logger.info(json.dumps(data))

    BenchmarkData = collections.namedtuple(
        'BenchmarkData', sorted(benchmark_data[0]))
    return [BenchmarkData(**d) for d in benchmark_data]
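# Usage sketch (illustrative only) of a typical task body: local names such
# as "job_threads" and "job_memory" are picked up from the caller's context,
# while "infile" and "outfile" are interpolated into the statement before it
# is submitted. The filenames are hypothetical example values.
def _example_run(infile="reads.fastq.gz", outfile="reads.subset.fastq.gz"):
    job_threads = 2
    job_memory = "4G"
    statement = "zcat %(infile)s | head -n 4000 | gzip > %(outfile)s"
    run(statement)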
def load(infile,
         outfile=None,
         options="",
         collapse=False,
         transpose=False,
         tablename=None,
         retry=True,
         limit=0,
         shuffle=False,
         job_memory=None):
    """import data from a tab-separated file into database.

    The table name is given by outfile without the ".load" suffix.

    A typical load task in ruffus would look like this::

        @transform("*.tsv.gz", suffix(".tsv.gz"), ".load")
        def loadData(infile, outfile):
            P.load(infile, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infile : string
        Filename of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile` if `tablename` is not set.
    options : string
        Command line options for the `csv2db.py` script.
    collapse : string
        If set, the table will be collapsed before loading. This
        transforms a data set with two columns where the first column
        is the row name into a multi-column table. The value of
        collapse is the value used for missing values.
    transpose : string
        If set, the table will be transposed before loading. The first
        column in the first row will be set to the string within
        transpose.
    retry : bool
        If True, multiple attempts will be made if the data can
        not be loaded at the first try, for example if a table is
        locked.
    limit : int
        If set, only load the first n lines.
    shuffle : bool
        If set, randomize lines before loading. Together with `limit`
        this permits loading a sample of rows.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.
    """
    if job_memory is None:
        job_memory = get_params()["cluster_memory_default"]

    if not tablename:
        tablename = to_table(outfile)

    statement = []

    if infile.endswith(".gz"):
        statement.append("zcat %(infile)s")
    else:
        statement.append("cat %(infile)s")

    if collapse:
        statement.append(
            "python -m cgatcore.table "
            "--log=%(outfile)s.collapse.log "
            "--collapse=%(collapse)s")

    if transpose:
        statement.append(
            "python -m cgatcore.table "
            "--log=%(outfile)s.transpose.log "
            "--transpose "
            "--set-transpose-field=%(transpose)s")

    if shuffle:
        statement.append(
            "python -m cgatcore.table "
            "--log=%(outfile)s.shuffle.log "
            "--method=randomize-rows")

    if limit > 0:
        # use awk to filter in order to avoid a pipeline broken error
        # from head
        statement.append("awk 'NR > %i {exit(0)} {print}'" % (limit + 1))
        # ignore errors from cat or zcat due to broken pipe
        ignore_pipe_errors = True

    statement.append(
        build_load_statement(tablename, options=options, retry=retry))

    statement = " | ".join(statement) + " > %(outfile)s"

    to_cluster = False

    run(statement)