def _process_job_logs_retrieval(self, schd_ctx, ctx, id_keys):
    """Queue a command to retrieve task job logs from a remote user@host.

    Build the configured "retrieve job logs command" for the host named by
    "ctx.user_at_host", limited to the job log directories identified by
    "id_keys", and put it on the process pool with
    "_job_logs_retrieval_callback" as the callback.

    Args:
        schd_ctx: object whose "suite" attribute names the suite.
        ctx: object providing "user_at_host" and "max_size".
        id_keys: iterable of 4-item keys; the last three items are used as
            (point, name, submit_num).
    """
    remote = ctx.user_at_host
    if remote and "@" in remote:
        s_user, s_host = remote.split("@", 1)
    else:
        s_user, s_host = None, remote
    ssh_str = str(GLOBAL_CFG.get_host_item("ssh command", s_host, s_user))
    rsync_str = str(GLOBAL_CFG.get_host_item(
        "retrieve job logs command", s_host, s_user))

    cmd = shlex.split(rsync_str)
    cmd.append("--rsh=" + ssh_str)
    if cylc.flags.debug:
        cmd.append("-v")
    if ctx.max_size:
        cmd.append("--max-size=%s" % (ctx.max_size,))

    # Include every directory level leading to each job log directory, then
    # everything below it. A set removes levels shared between jobs.
    includes = set()
    for _, point, name, submit_num in id_keys:
        point_dir = "/%s" % (point)
        name_dir = "/%s/%s" % (point, name)
        submit_dir = "/%s/%s/%02d" % (point, name, submit_num)
        includes.update([point_dir, name_dir, submit_dir, submit_dir + "/**"])
    for include in sorted(includes):
        cmd.append("--include=%s" % (include))
    # Exclude everything that was not explicitly included
    cmd.append("--exclude=/**")

    # Remote source, then local target
    remote_log_dir = GLOBAL_CFG.get_derived_host_item(
        schd_ctx.suite, "suite job log directory", s_host, s_user)
    cmd.append(remote + ":" + remote_log_dir + "/")
    local_log_dir = GLOBAL_CFG.get_derived_host_item(
        schd_ctx.suite, "suite job log directory")
    cmd.append(local_log_dir + "/")

    proc_ctx = SuiteProcContext(
        ctx, cmd, env=dict(os.environ), id_keys=id_keys)
    self.proc_pool.put_command(
        proc_ctx, self._job_logs_retrieval_callback, [schd_ctx])
def get_scan_items_from_fs(owner_pattern=None, updater=None):
    """Collect (host, port) pairs of suites found on the file system.

    Walk each selected run directory and read the suite contact file of
    every registration that has one.

    Args:
        owner_pattern: compiled regular expression matched against user
            names; if None, only the current user's run directory is walked.
        updater: optional object with a "quit" attribute; the walk is
            abandoned as soon as it is true.

    Return (list): List of (host, port) available for scan, or None if the
    walk was abandoned because "updater.quit" was set.
    """
    srv_files_mgr = SuiteSrvFilesManager()
    if owner_pattern is None:
        # Run directory of current user only
        run_dirs = [(GLOBAL_CFG.get_host_item('run directory'), None)]
    else:
        # Run directory of all users matching "owner_pattern",
        # skipping accounts with /nologin or /false shells
        run_dirs = []
        for pwent in getpwall():
            if pwent.pw_shell.endswith(('/false', '/nologin')):
                continue
            if not owner_pattern.match(pwent.pw_name):
                continue
            user_run_d = GLOBAL_CFG.get_host_item(
                'run directory',
                owner=pwent.pw_name,
                owner_home=pwent.pw_dir)
            run_dirs.append((user_run_d, pwent.pw_name))
    if cylc.flags.debug:
        names = [entry[1] for entry in run_dirs if entry[1] is not None]
        sys.stderr.write(
            'Listing suites:%s%s\n' % (DEBUG_DELIM, DEBUG_DELIM.join(names)))

    items = []
    for run_d, owner in run_dirs:
        for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
            if updater and updater.quit:
                return
            # Always descend for top directory, but
            # don't descend further if it has a:
            # * .service/ or log/
            # * cylc-suite.db: (pre-cylc-7 suites don't have ".service/").
            if dirpath != run_d and (
                    srv_files_mgr.DIR_BASE_SRV in dnames or
                    'log' in dnames or
                    'cylc-suite.db' in fnames):
                del dnames[:]
            # Only registrations with a loadable contact file are reported
            reg = os.path.relpath(dirpath, run_d)
            try:
                contact_data = srv_files_mgr.load_contact_file(reg, owner)
            except (SuiteServiceFileError, IOError, TypeError, ValueError):
                continue
            host = contact_data[srv_files_mgr.KEY_HOST]
            port = contact_data[srv_files_mgr.KEY_PORT]
            items.append((host, port))
    return items
def init_suite_run_dir(self, suite_name, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Nothing is done in single task mode, for a "user_at_host" that has
    already been initialised, or for the current user on "localhost".

    Args:
        suite_name: name of the suite.
        user_at_host: "host" or "owner@host" string of the job host.

    Raise RemoteJobHostInitError if initialisation cannot complete.

    """
    if '@' in user_at_host:
        owner, host = user_at_host.split('@', 1)
    else:
        owner, host = None, user_at_host
    # Initialised hosts are recorded below as "user_at_host", so they must
    # be looked up by that same key: testing the bare host never matches an
    # "owner@host" entry and can wrongly match another account on the host.
    if (self.single_task_mode or
            user_at_host in self.initialised_hosts or
            (owner, host) in [(None, 'localhost'), (USER, 'localhost')]):
        return

    # Local items to install on the remote host
    suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory')
    sources = [os.path.join(suite_run_dir, CylcSuiteEnv.BASE_NAME)]
    if 'CYLC_SUITE_DEF_PATH' in os.environ:
        sources.append(
            os.path.join(os.getenv('CYLC_SUITE_DEF_PATH'), 'passphrase'))
    suite_run_py = os.path.join(suite_run_dir, 'python')
    if os.path.isdir(suite_run_py):
        sources.append(suite_run_py)

    # Remote locations
    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory', host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite job log directory', host, owner)
    getLogger('main').log(INFO, 'Initialising %s:%s' % (
        user_at_host, r_suite_run_dir))

    # The " %s" placeholder of older shell templates is dropped; the
    # destination is appended as a separate argument instead.
    ssh_tmpl = GLOBAL_CFG.get_host_item(
        'remote shell template', host, owner).replace(' %s', '')
    scp_tmpl = GLOBAL_CFG.get_host_item(
        'remote copy template', host, owner)
    cmd1 = shlex.split(ssh_tmpl) + [
        "-n", user_at_host,
        'mkdir', '-p', r_suite_run_dir, r_log_job_dir]
    cmd2 = shlex.split(scp_tmpl) + ['-pr'] + sources + [
        user_at_host + ":" + r_suite_run_dir + '/']
    # Create the directories first, then copy the items into them
    for cmd in [cmd1, cmd2]:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                user_at_host,
                " ".join([quote(item) for item in cmd]),
                proc.returncode, out, err)
    self.initialised_hosts.append(user_at_host)
def init_suite_run_dir(self, suite_name, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Nothing is done in single task mode, for a "user_at_host" that has
    already been initialised, or for the current user on "localhost".

    Args:
        suite_name: name of the suite.
        user_at_host: "host" or "owner@host" string of the job host.

    Raise RemoteJobHostInitError if initialisation cannot complete.

    """
    if '@' in user_at_host:
        owner, host = user_at_host.split('@', 1)
    else:
        owner, host = None, user_at_host
    # Initialised hosts are recorded below as "user_at_host", so they must
    # be looked up by that same key: testing the bare host never matches an
    # "owner@host" entry and can wrongly match another account on the host.
    if (self.single_task_mode or
            user_at_host in self.initialised_hosts or
            (owner, host) in [(None, 'localhost'), (USER, 'localhost')]):
        return

    # Local items to install on the remote host
    suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory')
    sources = [os.path.join(suite_run_dir, CylcSuiteEnv.BASE_NAME)]
    if 'CYLC_SUITE_DEF_PATH' in os.environ:
        sources.append(
            os.path.join(os.getenv('CYLC_SUITE_DEF_PATH'), 'passphrase'))
    suite_run_py = os.path.join(suite_run_dir, 'python')
    if os.path.isdir(suite_run_py):
        sources.append(suite_run_py)

    # Remote locations
    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory', host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite job log directory', host, owner)
    getLogger('main').log(INFO, 'Initialising %s:%s' % (
        user_at_host, r_suite_run_dir))

    # The " %s" placeholder of older shell templates is dropped; the
    # destination is appended as a separate argument instead.
    ssh_tmpl = GLOBAL_CFG.get_host_item(
        'remote shell template', host, owner).replace(' %s', '')
    scp_tmpl = GLOBAL_CFG.get_host_item(
        'remote copy template', host, owner)
    cmd1 = shlex.split(ssh_tmpl) + [
        "-n", user_at_host,
        'mkdir', '-p', r_suite_run_dir, r_log_job_dir]
    cmd2 = shlex.split(scp_tmpl) + ['-pr'] + sources + [
        user_at_host + ":" + r_suite_run_dir + '/']
    # Create the directories first, then copy the items into them
    for cmd in [cmd1, cmd2]:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                user_at_host,
                " ".join([quote(item) for item in cmd]),
                proc.returncode, out, err)
    self.initialised_hosts.append(user_at_host)
def write_environment_1(self, BUFFER=None):
    """Write the suite and task environment sections of a job script.

    Three groups of "export NAME=VALUE" lines are written, in this order:
    the static suite variables, the suite variables overridden for the
    task host, and the task variables.

    Args:
        BUFFER: object with a "write" method to write to; "self.FILE" is
            used if it is not given (or is otherwise false).
    """
    if not BUFFER:
        BUFFER = self.FILE

    def export(name, value):
        """Write one export line, preceded by a newline."""
        BUFFER.write("\nexport " + name + "=" + value)

    BUFFER.write("\n\n# CYLC SUITE ENVIRONMENT:")

    # write the static suite variables
    for var, val in sorted(self.__class__.suite_env.items()):
        export(var, str(val))

    if str(self.__class__.suite_env.get('CYLC_UTC')) == 'True':
        export("TZ", "UTC")

    BUFFER.write("\n")

    # override and write task-host-specific suite variables
    suite_work_dir = GLOBAL_CFG.get_derived_host_item(
        self.suite, 'suite work directory', self.host, self.owner)
    # Work on a copy so that the class-level dict is left untouched
    st_env = deepcopy(self.__class__.suite_task_env)
    st_env['CYLC_SUITE_RUN_DIR'] = GLOBAL_CFG.get_derived_host_item(
        self.suite, 'suite run directory', self.host, self.owner)
    st_env['CYLC_SUITE_WORK_DIR'] = suite_work_dir
    st_env['CYLC_SUITE_SHARE_DIR'] = GLOBAL_CFG.get_derived_host_item(
        self.suite, 'suite share directory', self.host, self.owner)
    st_env['CYLC_SUITE_SHARE_PATH'] = '$CYLC_SUITE_SHARE_DIR'  # DEPRECATED
    rsp = self.jobconfig['remote suite path']
    if rsp:
        st_env['CYLC_SUITE_DEF_PATH'] = rsp
    else:
        # replace home dir with '$HOME' for evaluation on the task host.
        # This is a literal replacement: the home directory is a path, not
        # a regular expression, and may contain characters such as "." or
        # "+" that a pattern would misinterpret.
        st_env['CYLC_SUITE_DEF_PATH'] = st_env['CYLC_SUITE_DEF_PATH'].replace(
            os.environ['HOME'], '$HOME')
    for var, val in sorted(st_env.items()):
        export(var, str(val))

    task_work_dir = os.path.join(
        suite_work_dir, self.jobconfig['work sub-directory'])

    use_login_shell = GLOBAL_CFG.get_host_item(
        'use login shell', self.host, self.owner)
    comms = GLOBAL_CFG.get_host_item(
        'task communication method', self.host, self.owner)

    BUFFER.write("\n\n# CYLC TASK ENVIRONMENT:")
    export("CYLC_TASK_COMMS_METHOD", comms)
    export("CYLC_TASK_CYCLE_POINT", self.point_string)
    export("CYLC_TASK_CYCLE_TIME", self.point_string)
    export("CYLC_TASK_ID", self.task_id)
    export("CYLC_TASK_IS_COLDSTART", str(self.jobconfig['is cold-start']))
    export("CYLC_TASK_LOG_ROOT", self.log_root)
    export(
        "CYLC_TASK_MSG_MAX_TRIES",
        str(GLOBAL_CFG.get(['task messaging', 'maximum number of tries'])))
    export(
        "CYLC_TASK_MSG_RETRY_INTVL",
        str(GLOBAL_CFG.get(['task messaging', 'retry interval in seconds'])))
    export(
        "CYLC_TASK_MSG_TIMEOUT",
        str(GLOBAL_CFG.get(
            ['task messaging', 'connection timeout in seconds'])))
    export("CYLC_TASK_NAME", self.task_name)
    # The hierarchy is space-separated, so the value is double-quoted
    export(
        "CYLC_TASK_NAMESPACE_HIERARCHY",
        '"' + ' '.join(self.jobconfig['namespace hierarchy']) + '"')
    export("CYLC_TASK_SSH_LOGIN_SHELL", str(use_login_shell))
    export(
        "CYLC_TASK_SUBMIT_NUMBER",
        str(self.jobconfig['absolute submit number']))
    export("CYLC_TASK_TRY_NUMBER", str(self.jobconfig['try number']))
    export("CYLC_TASK_WORK_DIR", task_work_dir)
    export("CYLC_TASK_WORK_PATH", "$CYLC_TASK_WORK_DIR")  # DEPRECATED
def init_suite_run_dir(self, suite_name, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Nothing is done in single task mode, for a "user_at_host" that has
    already been initialised, or for the current user on "localhost".

    Args:
        suite_name: name of the suite.
        user_at_host: "host" or "owner@host" string of the job host.

    Raise RemoteJobHostInitError if initialisation cannot complete.

    """
    if '@' in user_at_host:
        owner, host = user_at_host.split('@', 1)
    else:
        owner, host = None, user_at_host
    # Initialised hosts are recorded below as "user_at_host", so they must
    # be looked up by that same key: testing the bare host never matches an
    # "owner@host" entry and can wrongly match another account on the host.
    if (self.single_task_mode or
            user_at_host in self.initialised_hosts or
            (owner, host) in [(None, 'localhost'), (user, 'localhost')]):
        return

    # Local items to install on the remote host
    suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory')
    sources = [os.path.join(suite_run_dir, "cylc-suite-env")]
    suite_run_py = os.path.join(suite_run_dir, "python")
    if os.path.isdir(suite_run_py):
        sources.append(suite_run_py)
    try:
        r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
            suite_name, 'suite run directory', host, owner)
        r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
            suite_name, 'suite job log directory', host, owner)
        getLogger('main').log(INFO, 'Initialising %s:%s' % (
            user_at_host, r_suite_run_dir))

        # The " %s" placeholder of older shell templates is dropped; the
        # destination is appended as a separate argument instead.
        ssh_tmpl = GLOBAL_CFG.get_host_item(
            'remote shell template', host, owner).replace(" %s", "")
        scp_tmpl = GLOBAL_CFG.get_host_item(
            'remote copy template', host, owner)
        cmd1 = shlex.split(ssh_tmpl) + [
            user_at_host,
            'mkdir -p "%s" "%s"' % (r_suite_run_dir, r_log_job_dir)]
        cmd2 = shlex.split(scp_tmpl) + ["-r"] + sources + [
            user_at_host + ":" + r_suite_run_dir + "/"]
        # Create the directories first, then copy the items into them
        for cmd in [cmd1, cmd2]:
            check_call(cmd)
    except Exception:
        # Any failure above is reported to callers as an initialisation error
        raise RemoteJobHostInitError(user_at_host)
    self.initialised_hosts.append(user_at_host)
def list_suites(self, regfilter=None):
    """Return a filtered list of valid suite registrations.

    Args:
        regfilter: optional regular expression (string); only registrations
            in which it is found are returned.

    Return (list): one [reg, source_dir, title] list per registration.

    Raise ValueError if "regfilter" is not a valid regular expression.
    """
    rec_regfilter = None
    if regfilter:
        try:
            rec_regfilter = re.compile(regfilter)
        except re.error as exc:
            raise ValueError("%s: %s" % (regfilter, exc))
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    run_d = GLOBAL_CFG.get_host_item('run directory')
    results = []
    for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
        # Always descend for top directory, but
        # don't descend further if it has a:
        # * .service/
        # * cylc-suite.db: (pre-cylc-7 suites don't have ".service/").
        if dirpath != run_d and (
                self.DIR_BASE_SRV in dnames or "cylc-suite.db" in fnames):
            dnames[:] = []
        # Choose only suites with .service and matching filter
        reg = os.path.relpath(dirpath, run_d)
        path = os.path.join(dirpath, self.DIR_BASE_SRV)
        if (not self._locate_item(self.FILE_BASE_SOURCE, path) or
                rec_regfilter and not rec_regfilter.search(reg)):
            continue
        try:
            results.append([
                reg,
                self.get_suite_source_dir(reg),
                self.get_suite_title(reg)])
        except (IOError, SuiteServiceFileError) as exc:
            # Report the problem and carry on with the other registrations.
            # A plain write behaves the same on Python 2 and Python 3.
            sys.stderr.write(str(exc) + "\n")
    return results
def unlink_suite_contact_files(self, reg):
    """Remove suite contact files from initialised hosts.

    This is called on shutdown, so we don't want anything to hang.
    Terminate any incomplete SSH commands after 10 seconds.

    Args:
        reg: suite name, used to derive the remote suite run directory.

    A warning is logged for every command that exits with a non-zero code.
    """
    # Local import: only the "time" function is known to be in scope here
    from time import sleep

    def warn_failure(user_at_host, cmd, proc, out, err):
        """Log a tidy-up warning for a command that failed."""
        ERR.warning(RemoteJobHostInitError(
            RemoteJobHostInitError.MSG_TIDY,
            user_at_host, " ".join([quote(item) for item in cmd]),
            proc.returncode, out, err))

    # Issue all SSH commands in parallel
    procs = {}
    for user_at_host, should_unlink in self.initialised_hosts.items():
        if not should_unlink:
            continue
        if "@" in user_at_host:
            owner, host = user_at_host.split("@", 1)
        else:
            owner, host = None, user_at_host
        ssh_tmpl = GLOBAL_CFG.get_host_item(
            "remote shell template", host, owner)
        r_suite_contact_file = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                reg, "suite run directory", host, owner),
            SuiteSrvFilesManager.DIR_BASE_SRV,
            SuiteSrvFilesManager.FILE_BASE_CONTACT)
        cmd = shlex.split(ssh_tmpl) + [
            "-n", user_at_host, "rm", "-f", r_suite_contact_file]
        procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        # Iterate over a snapshot, because finished entries are deleted
        for user_at_host, (cmd, proc) in list(procs.items()):
            # poll() returns None while the command is still running; an
            # exit code of 0 means it has finished and must not be skipped
            if proc.poll() is None:
                continue
            del procs[user_at_host]
            out, err = proc.communicate()
            if proc.wait():
                warn_failure(user_at_host, cmd, proc, out, err)
        if procs:
            # Pause between polls instead of spinning on a CPU
            sleep(0.1)
    # Terminate any remaining commands
    for user_at_host, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            # The command may have exited in the meantime
            pass
        out, err = proc.communicate()
        if proc.wait():
            warn_failure(user_at_host, cmd, proc, out, err)
def _remote_init_items(self, host, owner):
    """List the service files to install on a task remote.

    Which files are needed depends on the "task communication method"
    configured for the host and owner: the contact file for "ssh", "http"
    and "https"; the passphrase file for "http" and "https"; the SSL
    certificate file for "https".

    Return (list): (path, name) tuples, where name is the path relative to
    the suite run directory.
    """
    comm_meth = GLOBAL_CFG.get_host_item(
        'task communication method', host, owner)
    LOG.debug('comm_meth=%s' % comm_meth)
    srv_mgr = self.suite_srv_files_mgr
    found = []
    if comm_meth in ['ssh', 'http', 'https']:
        # Contact file
        contact_path = srv_mgr.get_contact_file(self.suite)
        contact_name = os.path.join(
            srv_mgr.DIR_BASE_SRV, srv_mgr.FILE_BASE_CONTACT)
        found.append((contact_path, contact_name))
    if comm_meth in ['http', 'https']:
        # Passphrase file
        passphrase_path = srv_mgr.get_auth_item(
            srv_mgr.FILE_BASE_PASSPHRASE, self.suite)
        passphrase_name = os.path.join(
            srv_mgr.DIR_BASE_SRV, srv_mgr.FILE_BASE_PASSPHRASE)
        found.append((passphrase_path, passphrase_name))
    if comm_meth in ['https']:
        # SSL cert file
        cert_path = srv_mgr.get_auth_item(
            srv_mgr.FILE_BASE_SSL_CERT, self.suite)
        cert_name = os.path.join(
            srv_mgr.DIR_BASE_SRV, srv_mgr.FILE_BASE_SSL_CERT)
        found.append((cert_path, cert_name))
    return found
def _write_prelude(cls, handle, job_conf):
    """Write the job script prelude to "handle".

    In debug mode, command tracing is turned on first (with a time-stamped
    PS4 prompt if the job script shell is bash). A "prelude" shell function
    is then written and called: it exports the copyable environment
    variables that are set in the current environment, and sources the
    first job-init-env file that exists.

    Args:
        handle: object with a "write" method.
        job_conf: dict providing "job script shell", "host" and "owner".
    """
    if cylc.flags.debug:
        if 'bash' in job_conf['job script shell']:
            # The backslashes are bash prompt escapes and must reach the
            # job script unchanged, so they are written as "\\": "\u" in
            # particular is not a valid escape in a unicode literal.
            handle.write("\n\nPS4='+[\\D{%Y%m%dT%H%M%S%z}]\\u@\\h '")
        handle.write('\n\nset -x')
    handle.write('\n\necho "JOB SCRIPT STARTING"')
    # set cylc version and source profile scripts before turning on
    # error trapping so that profile errors do not abort the job
    handle.write('\n\nprelude() {')
    keys = GLOBAL_CFG.get_host_item(
        'copyable environment variables',
        job_conf['host'], job_conf['owner'])
    for key in keys + ['CYLC_DIR', 'CYLC_VERSION']:
        if key in os.environ:
            handle.write("\n    export %s='%s'" % (key, os.environ[key]))
    handle.write(
        r'''
    for FILE_NAME in \
        "${HOME}/.cylc/job-init-env.sh" \
        "${CYLC_DIR}/conf/job-init-env.sh" \
        "${CYLC_DIR}/conf/job-init-env-default.sh"
    do
        if [[ -f "${FILE_NAME}" ]]; then
            . "${FILE_NAME}" 1>/dev/null 2>&1
            break
        fi
    done
}
prelude''')
def list_suites(self, regfilter=None):
    """Return a filtered list of valid suite registrations.

    Args:
        regfilter: optional regular expression (string); only registrations
            in which it is found are returned.

    Return (list): one [reg, source_dir, title] list per registration.

    Raise ValueError if "regfilter" is not a valid regular expression.
    """
    rec_regfilter = None
    if regfilter:
        try:
            rec_regfilter = re.compile(regfilter)
        except re.error as exc:
            raise ValueError("%s: %s" % (regfilter, exc))
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    run_d = GLOBAL_CFG.get_host_item('run directory')
    results = []
    for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
        # Always descend for top directory, but
        # don't descend further if it has a:
        # * .service/
        # * cylc-suite.db: (pre-cylc-7 suites don't have ".service/").
        if dirpath != run_d and (
                self.DIR_BASE_SRV in dnames or "cylc-suite.db" in fnames):
            dnames[:] = []
        # Choose only suites with .service and matching filter
        reg = os.path.relpath(dirpath, run_d)
        path = os.path.join(dirpath, self.DIR_BASE_SRV)
        if (not self._locate_item(self.FILE_BASE_SOURCE, path) or
                rec_regfilter and not rec_regfilter.search(reg)):
            continue
        try:
            results.append([
                reg,
                self.get_suite_source_dir(reg),
                self._get_suite_title(reg)])
        except (IOError, SuiteServiceFileError) as exc:
            # Report the problem and carry on with the other registrations.
            # A plain write behaves the same on Python 2 and Python 3.
            sys.stderr.write(str(exc) + "\n")
    return results
def list_suites(self, regfilter=None):
    """Return a filtered list of valid suite registrations.

    Args:
        regfilter: optional regular expression (string); only registrations
            in which it is found are returned.

    Return (list): one [reg, source_dir, title] list per registration.

    Raise ValueError if "regfilter" is not a valid regular expression.
    """
    rec_regfilter = None
    if regfilter:
        try:
            rec_regfilter = re.compile(regfilter)
        except re.error as exc:
            raise ValueError("%s: %s" % (regfilter, exc))
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    run_d = GLOBAL_CFG.get_host_item('run directory')
    results = []
    skip_names = [
        "log", "share", "work", self.DIR_BASE_SRV, self.FILE_BASE_SUITE_RC]
    for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
        # Don't descend further if it looks like a suite directory.
        # Always descend for the top directory, though: an entry there that
        # happens to be called e.g. "log" or "work" must not hide every
        # registration below the run directory.
        if dirpath != run_d and any(
                name in dnames or name in fnames for name in skip_names):
            dnames[:] = []
        # Choose only suites with info file and matching filter
        reg = os.path.relpath(dirpath, run_d)
        path = os.path.join(dirpath, self.DIR_BASE_SRV)
        if (not self._locate_item(self.FILE_BASE_SOURCE, path) or
                rec_regfilter and not rec_regfilter.search(reg)):
            continue
        try:
            results.append([
                reg,
                self.get_suite_source_dir(reg),
                self._get_suite_title(reg)])
        except (IOError, SuiteServiceFileError) as exc:
            # Report the problem and carry on with the other registrations.
            # A plain write behaves the same on Python 2 and Python 3.
            sys.stderr.write(str(exc) + "\n")
    return results
def _write_prelude(cls, handle, job_conf):
    """Write the job script prelude to "handle".

    A "prelude" shell function is written and called: it exports the
    copyable environment variables that are set in the current environment,
    and sources the first job-init-env file that exists.

    Args:
        handle: object with a "write" method.
        job_conf: dict providing "host" and "owner".
    """
    handle.write('\n\necho "JOB SCRIPT STARTING"')
    # The cylc version is set and the profile scripts are sourced before
    # error trapping is turned on, so profile errors do not abort the job
    handle.write("\n\nprelude() {")
    copyable = GLOBAL_CFG.get_host_item(
        "copyable environment variables",
        job_conf["host"], job_conf["owner"])
    for name in copyable + ["CYLC_DIR", "CYLC_VERSION"]:
        if name not in os.environ:
            continue
        handle.write("\n    export %s='%s'" % (name, os.environ[name]))
    source_profile = r"""
    for FILE_NAME in \
        "${HOME}/.cylc/job-init-env.sh" \
        "${CYLC_DIR}/conf/job-init-env.sh" \
        "${CYLC_DIR}/conf/job-init-env-default.sh"
    do
        if [[ -f "${FILE_NAME}" ]]; then
            . "${FILE_NAME}" 1>/dev/null 2>&1
            break
        fi
    done
}
prelude"""
    handle.write(source_profile)
def _load_port_file(self):
    """Load port, host, etc from port file.

    The port file is read via SSH if the suite host or owner is remote, or
    directly from the file system otherwise. "self.port" is set from its
    first line and "self.host" from its second line (or the local host
    name), in each case only if the attribute is still None.

    Raise PortFileError if the port is still unknown and the port file
    cannot be read or has bad content.
    """
    # GLOBAL_CFG is expensive to import, so only load on demand
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    port_file_path = os.path.join(
        GLOBAL_CFG.get(['communication', 'ports directory']), self.suite)
    out = ""
    if is_remote_host(self.host) or is_remote_user(self.owner):
        # Only load these modules on demand, as they may be expensive
        import shlex
        from subprocess import Popen, PIPE
        ssh_tmpl = str(GLOBAL_CFG.get_host_item(
            'remote shell template', self.host, self.owner))
        ssh_tmpl = ssh_tmpl.replace(' %s', '')
        user_at_host = ''
        if self.owner:
            user_at_host = self.owner + '@'
        if self.host:
            user_at_host += self.host
        else:
            user_at_host += 'localhost'
        # "$HOME" is written in place of the local home directory, so the
        # path can be evaluated on the remote side
        r_port_file_path = port_file_path.replace(
            os.environ['HOME'], '$HOME')
        command = shlex.split(ssh_tmpl) + [
            user_at_host, 'cat', r_port_file_path]
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        ret_code = proc.wait()
        if ret_code:
            if cylc.flags.debug:
                sys.stderr.write(str({
                    "code": ret_code, "command": command,
                    "stdout": out, "stderr": err}) + "\n")
            # A failure only matters if the port is not already known
            if self.port is None:
                raise PortFileError(
                    "Port file '%s:%s' not found - suite not running?." %
                    (user_at_host, r_port_file_path))
    else:
        try:
            # The context manager closes the file promptly after reading
            with open(port_file_path) as handle:
                out = handle.read()
        except IOError:
            # A failure only matters if the port is not already known
            if self.port is None:
                raise PortFileError(
                    "Port file '%s' not found - suite not running?." %
                    (port_file_path))
    lines = out.splitlines()
    if self.port is None:
        try:
            self.port = int(lines[0])
        except (IndexError, ValueError):
            raise PortFileError(
                "ERROR, bad content in port file: %s" % port_file_path)
    if self.host is None:
        if len(lines) >= 2:
            self.host = lines[1].strip()
        else:
            self.host = get_hostname()
def init_suite_run_dir(self, suite_name, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Nothing is done in single task mode, for a "user_at_host" that has
    already been initialised, or for the current user on "localhost".

    Args:
        suite_name: name of the suite.
        user_at_host: "host" or "owner@host" string of the job host.

    Raise RemoteJobHostInitError if initialisation cannot complete.

    """
    if "@" in user_at_host:
        owner, host = user_at_host.split("@", 1)
    else:
        owner, host = None, user_at_host
    # Initialised hosts are recorded below as "user_at_host", so they must
    # be looked up by that same key: testing the bare host never matches an
    # "owner@host" entry and can wrongly match another account on the host.
    if (self.single_task_mode or
            user_at_host in self.initialised_hosts or
            (owner, host) in [(None, "localhost"), (user, "localhost")]):
        return

    # Local items to install on the remote host
    suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, "suite run directory")
    sources = [os.path.join(suite_run_dir, "cylc-suite-env")]
    if "CYLC_SUITE_DEF_PATH" in os.environ:
        sources.append(
            os.path.join(os.getenv("CYLC_SUITE_DEF_PATH"), "passphrase"))
    suite_run_py = os.path.join(suite_run_dir, "python")
    if os.path.isdir(suite_run_py):
        sources.append(suite_run_py)

    # Remote locations
    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, "suite run directory", host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, "suite job log directory", host, owner)
    getLogger("main").log(INFO, "Initialising %s:%s" % (
        user_at_host, r_suite_run_dir))

    # The " %s" placeholder of older shell templates is dropped; the
    # destination is appended as a separate argument instead.
    ssh_tmpl = GLOBAL_CFG.get_host_item(
        "remote shell template", host, owner).replace(" %s", "")
    scp_tmpl = GLOBAL_CFG.get_host_item(
        "remote copy template", host, owner)
    cmd1 = shlex.split(ssh_tmpl) + [
        "-n", user_at_host,
        "mkdir", "-p", r_suite_run_dir, r_log_job_dir]
    cmd2 = shlex.split(scp_tmpl) + ["-pr"] + sources + [
        user_at_host + ":" + r_suite_run_dir + "/"]
    # Create the directories first, then copy the items into them
    for cmd in [cmd1, cmd2]:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                user_at_host,
                " ".join([quote(item) for item in cmd]),
                proc.returncode, out, err)
    self.initialised_hosts.append(user_at_host)
def _write_init_script(cls, handle, job_conf):
    """Write the init-script sections of the job script.

    The "global init-script" configured for the job host and owner is
    written first, if set; the job's own "init-script" follows, if set.
    Each is preceded by a comment header.
    """
    host_script = GLOBAL_CFG.get_host_item(
        "global init-script", job_conf["host"], job_conf["owner"])
    if host_script:
        handle.write("\n\n# GLOBAL INIT-SCRIPT:\n")
        handle.write(host_script)
    job_script = job_conf["init-script"]
    if job_script:
        handle.write("\n\n# INIT-SCRIPT:\n")
        handle.write(job_script)
def _write_initial_scripting(cls, handle, job_conf):
    """Write the initial scripting sections of the job script.

    The "global initial scripting" configured for the job host and owner
    is written first, if set; the job's own "initial scripting" follows,
    if set. Each is preceded by a comment header.
    """
    host_scripting = GLOBAL_CFG.get_host_item(
        'global initial scripting', job_conf["host"], job_conf["owner"])
    if host_scripting:
        handle.write("\n\n# GLOBAL INITIAL SCRIPTING:\n")
        handle.write(host_scripting)
    job_scripting = job_conf['initial scripting']
    if job_scripting:
        handle.write("\n\n# INITIAL SCRIPTING:\n")
        handle.write(job_scripting)
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH.

    Args:
        item: base name of the file under the suite service directory.
        reg: suite name.
        owner: suite owner, or None.
        host: suite host, or None.

    Return the content of the item (str), or None if the suite is neither
    on a remote host nor owned by a remote user, or if the content cannot
    be retrieved.
    """
    if not is_remote_host(host) and not is_remote_user(owner):
        return
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat passphrase file under suite service directory
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory', host, owner),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    # Either "owner" or "host" may be unset (only one of them needs to be
    # remote to get here), so the destination is built piece by piece
    # rather than by concatenating both.
    user_at_host = host or 'localhost'
    if owner:
        user_at_host = owner + '@' + user_at_host
    import shlex
    command = shlex.split(
        GLOBAL_CFG.get_host_item('ssh command', host, owner))
    command += ['-n', user_at_host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = proc.communicate()
    ret_code = proc.wait()
    # Extract the content from STDOUT: everything after the first line
    # that consists of the prefix only
    content = ""
    lines = out.splitlines(True)
    for index, line in enumerate(lines):
        if line.strip() == prefix:
            content = "".join(lines[index + 1:])
            break
    if not content or ret_code:
        if cylc.flags.debug:
            # STDOUT may contain passphrase, so not safe to print
            sys.stderr.write((
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                'err': err,
                'ret_code': ret_code,
            } + '\n')
        return
    return content
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH.

    Args:
        item: base name of the file under the suite service directory.
        reg: suite name.
        owner: suite owner, or None.
        host: suite host, or None.

    Return the content of the item (str), or None if the suite is neither
    on a remote host nor owned by a remote user, or if the content cannot
    be retrieved.
    """
    if not is_remote_host(host) and not is_remote_user(owner):
        return
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat passphrase file under suite service directory
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory', host, owner),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    # Either "owner" or "host" may be unset (only one of them needs to be
    # remote to get here), so the destination is built piece by piece
    # rather than by concatenating both.
    user_at_host = host or 'localhost'
    if owner:
        user_at_host = owner + '@' + user_at_host
    import shlex
    ssh_tmpl = GLOBAL_CFG.get_host_item('remote shell template', host, owner)
    # back compat: drop the " %s" placeholder of older templates, otherwise
    # a literal "%s" would be passed to the remote shell command
    command = shlex.split(ssh_tmpl.replace(' %s', ''))
    command += ['-n', user_at_host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = proc.communicate()
    ret_code = proc.wait()
    # Extract the content from STDOUT: everything after the first line
    # that consists of the prefix only
    content = ""
    lines = out.splitlines(True)
    for index, line in enumerate(lines):
        if line.strip() == prefix:
            content = "".join(lines[index + 1:])
            break
    if not content or ret_code:
        if cylc.flags.debug:
            # STDOUT may contain passphrase, so not safe to print
            sys.stderr.write((
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                'err': err,
                'ret_code': ret_code,
            } + '\n')
        return
    return content
def _write_init_script(cls, handle, job_conf):
    """Write the init-script sections of the job script.

    The "global init-script" configured for the job host and owner is
    written first, if set; the job's own "init-script" follows, if set.
    Each is preceded by a comment header.
    """
    host_script = GLOBAL_CFG.get_host_item(
        'global init-script', job_conf["host"], job_conf["owner"])
    if host_script:
        handle.write("\n\n# GLOBAL INIT-SCRIPT:\n")
        handle.write(host_script)
    job_script = job_conf['init-script']
    if job_script:
        handle.write("\n\n# INIT-SCRIPT:\n")
        handle.write(job_script)
def unlink_suite_contact_files(self, reg):
    """Remove suite contact files from initialised hosts.

    This is called on shutdown, so we don't want anything to hang.
    Terminate any incomplete SSH commands after 10 seconds.

    Args:
        reg: suite name, used to derive the remote suite run directory.

    A warning is logged for every command that exits with a non-zero code.
    """
    # Local import: only the "time" function is known to be in scope here
    from time import sleep

    def warn_failure(user_at_host, cmd, proc, out, err):
        """Log a tidy-up warning for a command that failed."""
        ERR.warning(RemoteJobHostInitError(
            RemoteJobHostInitError.MSG_TIDY,
            user_at_host, ' '.join([quote(item) for item in cmd]),
            proc.returncode, out, err))

    # Issue all SSH commands in parallel
    procs = {}
    for (host, owner), should_unlink in self.initialised.items():
        if not should_unlink:
            continue
        user_at_host = host
        if owner:
            user_at_host = owner + '@' + host
        ssh_tmpl = GLOBAL_CFG.get_host_item(
            'remote shell template', host, owner)
        r_suite_contact_file = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                reg, 'suite run directory', host, owner),
            SuiteSrvFilesManager.DIR_BASE_SRV,
            SuiteSrvFilesManager.FILE_BASE_CONTACT)
        cmd = shlex.split(ssh_tmpl) + [
            '-n', user_at_host, 'rm', '-f', r_suite_contact_file]
        procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        # Iterate over a copy, because finished entries are deleted
        for user_at_host, (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue  # still running
            del procs[user_at_host]
            out, err = proc.communicate()
            if proc.wait():
                warn_failure(user_at_host, cmd, proc, out, err)
        if procs:
            # Pause between polls instead of spinning on a CPU
            sleep(0.1)
    # Terminate any remaining commands
    for user_at_host, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            # The command may have exited in the meantime
            pass
        out, err = proc.communicate()
        if proc.wait():
            warn_failure(user_at_host, cmd, proc, out, err)
def get_host_conf(self, itask, key, default=None, skey="remote"):
    """Look up a host setting for a task.

    The first of these that is not None is returned:
    1. the broadcast override of "key" in section "skey" for the task;
    2. "key" in section "skey" of the task's runtime configuration;
    3. the global configuration host item "key" for the task's host and
       owner.
    "default" is returned if the global configuration look-up raises
    KeyError or ItemNotFoundError.
    """
    overrides = self.broadcast_mgr.get_broadcast(itask.identity)
    if skey in overrides:
        broadcast_value = overrides[skey].get(key)
        if broadcast_value is not None:
            return broadcast_value
    runtime_value = itask.tdef.rtconfig[skey].get(key)
    if runtime_value is not None:
        return runtime_value
    try:
        return GLOBAL_CFG.get_host_item(
            key, itask.task_host, itask.task_owner)
    except (KeyError, ItemNotFoundError):
        return default
def unlink_hosts_contacts(self, reg):
    """Remove suite contact files from initialised hosts.

    This is called on shutdown, so we don't want anything to hang.
    Terminate any incomplete SSH commands after 10 seconds.

    Args:
        reg: suite name, used to derive the remote suite run directory.

    A warning is logged for every command that exits with a non-zero code.
    """
    # Local import: only the "time" function is known to be in scope here
    from time import sleep

    def warn_failure(user_at_host, cmd, proc, out, err):
        """Log a tidy-up warning for a command that failed."""
        ERR.warning(RemoteJobHostInitError(
            RemoteJobHostInitError.MSG_TIDY,
            user_at_host, ' '.join([quote(item) for item in cmd]),
            proc.returncode, out, err))

    # Issue all SSH commands in parallel
    procs = {}
    for (host, owner), should_unlink in self.init_host_map.items():
        if not should_unlink:
            continue
        user_at_host = host
        if owner:
            user_at_host = owner + '@' + host
        ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
        r_suite_contact_file = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                reg, 'suite run directory', host, owner),
            self.suite_srv_files_mgr.DIR_BASE_SRV,
            self.suite_srv_files_mgr.FILE_BASE_CONTACT)
        cmd = shlex.split(ssh_tmpl) + [
            '-n', user_at_host, 'rm', '-f', r_suite_contact_file]
        procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        # Iterate over a copy, because finished entries are deleted
        for user_at_host, (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue  # still running
            del procs[user_at_host]
            out, err = proc.communicate()
            if proc.wait():
                warn_failure(user_at_host, cmd, proc, out, err)
        if procs:
            # Pause between polls instead of spinning on a CPU
            sleep(0.1)
    # Terminate any remaining commands
    for user_at_host, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            # The command may have exited in the meantime
            pass
        out, err = proc.communicate()
        if proc.wait():
            warn_failure(user_at_host, cmd, proc, out, err)
def set_host(self, host, set_timer=False):
    """Set the task host.

    The polling comms method is host-specific: nothing is done unless the
    "task communication method" of "host" is "poll". In that case the
    default polling intervals are adopted (with a warning) if none are
    set, and the timer is set if "set_timer" is true.

    """
    comms_method = GLOBAL_CFG.get_host_item(
        'task communication method', host)
    if comms_method != "poll":
        return
    if not self.intervals:
        # Copy, so the default intervals themselves are never modified
        self.intervals = copy(self.default_intervals)
        message = (
            '(polling comms) using default %s polling intervals' % self.name)
        self.log(WARNING, message)
    if set_timer:
        self.set_timer()
def set_host(self, host, set_timer=False):
    """Set the task host.

    The polling comms method is host-specific: nothing is done unless the
    "task communication method" of "host" is "poll". In that case the
    default polling intervals are adopted (with a warning) if none are
    set, and the timer is set if "set_timer" is true.

    """
    comms_method = GLOBAL_CFG.get_host_item(
        'task communication method', host)
    if comms_method != "poll":
        return
    if not self.intervals:
        # Copy, so the default intervals themselves are never modified
        self.intervals = copy(self.default_intervals)
        message = (
            '(polling comms) using default %s polling intervals' % self.name)
        self.log(WARNING, message)
    if set_timer:
        self.set_timer()
def _load_passphrase_via_ssh(self, suite, owner, host):
    """Load passphrase from remote [owner@]host via SSH.

    Args:
        suite: suite name.
        owner: suite owner, or None.
        host: suite host, or None.

    Return the passphrase (str), or None if the suite is neither on a
    remote host nor owned by a remote user, or if the passphrase cannot be
    retrieved.
    """
    if not is_remote_host(host) and not is_remote_user(owner):
        return
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-PASSPHRASE] %(suite)s ' % {'suite': suite}
    # Extract suite definition directory from remote ~/.cylc/REGDB/SUITE
    # Attempt to cat passphrase file under suite definition directory
    script = (
        r'''echo -n '%(prefix)s'; '''
        r'''sed -n 's/^path=//p' '.cylc/REGDB/%(suite)s' | '''
        r'''xargs -I '{}' cat '{}/passphrase'; '''
        r'''echo'''
    ) % {'prefix': prefix, 'suite': suite}
    ssh_tmpl = str(GLOBAL_CFG.get_host_item(
        'remote shell template', host, owner))
    ssh_tmpl = ssh_tmpl.replace(' %s', '')  # back compat
    # Either "owner" or "host" may be unset (only one of them needs to be
    # remote to get here), so the destination is built piece by piece
    # rather than by concatenating both.
    user_at_host = host or 'localhost'
    if owner:
        user_at_host = owner + '@' + user_at_host
    command = shlex.split(ssh_tmpl) + ['-n', user_at_host, script]
    try:
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            traceback.print_exc()
        return
    out, err = proc.communicate()
    ret_code = proc.wait()
    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    passphrase = None
    for line in out.splitlines():
        if line.startswith(prefix):
            # Remove the leading prefix only, so a passphrase that happens
            # to contain the prefix text is left intact
            passphrase = line[len(prefix):].strip()
    if not passphrase or ret_code:
        if cylc.flags.debug:
            # STDOUT may contain passphrase, so not safe to print
            sys.stderr.write((
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                'err': err,
                'ret_code': ret_code,
            } + '\n')
        return
    return passphrase
def execute(self, force_required=False, env=None, path=None, dry_run=False):
    """Execute command on remote host.

    Args:
        force_required: accepted for interface compatibility; not used by
            this method.
        env: optional mapping of variables passed to the remote command as
            ``--env=VAR=VAL`` options.
        path: optional list of path components; when given, the remote
            "cylc" executable is ``os.sep.join(path + ["cylc"])`` instead
            of the configured "cylc executable".
        dry_run: if true, return the command list instead of running it.

    Returns:
        False if remote re-invocation is not needed, the command (list of
        str) if ``dry_run`` is set, True if re-invocation is needed and
        executes successfully. Otherwise aborts via ``sys.exit``.
    """
    if not self.is_remote:
        return False

    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    from cylc.version import CYLC_VERSION

    name = os.path.basename(self.argv[0])[5:]  # /path/to/cylc-foo => foo

    user_at_host = ''
    if self.owner:
        user_at_host = self.owner + '@'
    if self.host:
        user_at_host += self.host
    else:
        user_at_host += 'localhost'

    # Build the remote command

    # ssh command and options (X forwarding)
    ssh_tmpl = str(GLOBAL_CFG.get_host_item(
        "remote shell template", self.host, self.owner)).replace(" %s", "")
    command = shlex.split(ssh_tmpl) + ["-Y", user_at_host]

    # Use bash -l?
    ssh_login_shell = self.ssh_login_shell
    if ssh_login_shell is None:
        ssh_login_shell = GLOBAL_CFG.get_host_item(
            "use login shell", self.host, self.owner)

    # Pass cylc version through.
    command += ["env", "CYLC_VERSION=%s" % CYLC_VERSION]

    if ssh_login_shell:
        # A login shell will always source /etc/profile and the user's bash
        # profile file. To avoid having to quote the entire remote command
        # it is passed as arguments to the bash script.
        command += ["bash", "--login", "-c", "'exec $0 \"$@\"'"]

    # "cylc" on the remote host
    if path:
        command.append(os.sep.join(path + ["cylc"]))
    else:
        command.append(GLOBAL_CFG.get_host_item(
            "cylc executable", self.host, self.owner))

    command.append(name)

    if env is None:
        env = {}
    for var, val in env.items():
        command.append("--env=%s=%s" % (var, val))

    for arg in self.args:
        # args quoted to avoid interpretation by the shell,
        # e.g. for match patterns such as '.*' on the command line.
        command.append("'" + arg + "'")

    if cylc.flags.verbose:
        # Wordwrap the command, quoting arguments so they can be run
        # properly from the command line
        command_str = ' '.join([quote(arg) for arg in command])
        sys.stdout.write('\n'.join(
            TextWrapper(subsequent_indent='\t').wrap(command_str)) + '\n')

    if dry_run:
        return command

    try:
        popen = subprocess.Popen(command)
    except OSError as exc:
        sys.exit("ERROR: remote command invocation failed %s" % str(exc))

    # Popen.wait() returns the exit code, or -N if the child was killed by
    # signal N; it is not a raw wait status, so os.WIF* must not be used.
    res = popen.wait()
    if res < 0:
        sys.exit("ERROR: remote command terminated by signal %d" % -res)
    elif res:
        sys.exit("ERROR: remote command failed %d" % res)
    else:
        return True
def run(self):
    """Invoke the tailer.

    Build a tail command for ``self.filename`` (run through the configured
    remote shell when the name has the form ``[user@]host:path``), start
    it, and feed complete output lines that match every pattern in
    ``self.filters`` to ``self.update_gui`` via ``gobject.idle_add``.
    Loops until ``self.quit`` is set or the process exits, then calls
    ``self.stop()``. A warning dialog is scheduled if the command cannot
    be started or its output cannot be read.
    """
    command = []
    if ":" in self.filename:  # remote
        # Split on the first colon only, so that a remote path which
        # itself contains ":" does not break the unpacking.
        user_at_host, filename = self.filename.split(':', 1)
        if "@" in user_at_host:
            owner, host = user_at_host.split("@", 1)
        else:
            owner, host = (None, user_at_host)
        ssh = str(GLOBAL_CFG.get_host_item(
            "remote shell template", host, owner)).replace(" %s", "")
        command = shlex.split(ssh) + ["-n", user_at_host]
        cmd_tmpl = str(GLOBAL_CFG.get_host_item(
            "remote tail command template", host, owner))
    else:
        filename = self.filename
        cmd_tmpl = str(GLOBAL_CFG.get_host_item(
            "local tail command template"))

    # An explicitly supplied template overrides the configured one.
    if self.cmd_tmpl:
        cmd_tmpl = self.cmd_tmpl
    command += shlex.split(cmd_tmpl % {"filename": filename})
    try:
        self.proc = Popen(
            command, stdout=PIPE, stderr=STDOUT, preexec_fn=os.setpgrp)
    except OSError as exc:
        # E.g. ssh command not found
        dialog = warning_dialog("%s: %s" % (
            exc, " ".join([quote(item) for item in command])))
        gobject.idle_add(dialog.warn)
        return
    poller = select.poll()
    poller.register(self.proc.stdout.fileno())

    buf = ""  # holds a trailing partial line until its newline arrives
    while not self.quit and self.proc.poll() is None:
        try:
            self.pollable.poll()
        except (TypeError, AttributeError):
            pass
        if self.freeze or not poller.poll(100):  # 100 ms timeout
            sleep(1)
            continue
        # Both self.proc.stdout.read(SIZE) and self.proc.stdout.readline()
        # can block. However os.read(FILENO, SIZE) should be fine after a
        # poller.poll().
        try:
            data = os.read(self.proc.stdout.fileno(), self.READ_SIZE)
        except (IOError, OSError) as exc:
            dialog = warning_dialog("%s: %s" % (
                exc, " ".join([quote(item) for item in command])))
            gobject.idle_add(dialog.warn)
            break
        if data:
            # Manage buffer, only add full lines to display to ensure
            # filtering and tagging work
            for line in data.splitlines(True):
                if not line.endswith("\n"):
                    buf += line
                    continue
                elif buf:
                    line = buf + line
                    buf = ""
                if (not self.filters or
                        all([re.search(f, line) for f in self.filters])):
                    gobject.idle_add(self.update_gui, line)
        sleep(0.01)
    self.stop()
def __init__(self, task_id, suite, jobconfig, submit_num):
    """Record job file and log file locations for a task job.

    Reads its settings from ``jobconfig``, decides whether the job is
    local or remote, registers the job file and stdout/stderr paths with
    ``self.logfiles``, sets overridable defaults in ``jobconfig`` and then
    calls the overridable ``set_*`` hooks.

    Args:
        task_id: task identifier, stored as ``self.task_id``.
        suite: suite name, stored as ``self.suite``.
        jobconfig: mapping of job settings; read with ``.get`` and updated
            in place with the directive defaults.
        submit_num: not used by this method.
    """
    self.jobconfig = jobconfig
    self.task_id = task_id
    self.suite = suite
    self.logfiles = jobconfig.get('log files')
    self.command = None
    self.job_submit_command_template = jobconfig.get('command template')

    common_job_log_path = jobconfig.get('common job log path')
    self.local_jobfile_path = jobconfig.get('local job file path')
    self.logfiles.add_path(self.local_jobfile_path)

    task_host = jobconfig.get('task host')
    task_owner = jobconfig.get('task owner')
    self.remote_shell_template = GLOBAL_CFG.get_host_item(
        'remote shell template', task_host, task_owner)

    self.local = not (
        is_remote_host(task_host) or is_remote_user(task_owner))
    if self.local:
        # LOCAL TASKS
        self.task_owner = None
        # Used in command construction:
        self.jobfile_path = self.local_jobfile_path

        # Local stdout and stderr log file paths:
        self.stdout_file = self.local_jobfile_path + ".out"
        self.stderr_file = self.local_jobfile_path + ".err"

        # interpolate environment variables in extra logs
        for idx, log_path in enumerate(self.logfiles.paths):
            self.logfiles.paths[idx] = expandvars(log_path)

        # Record paths of local log files for access by gui
        for log_path in (self.stdout_file, self.stderr_file):
            self.logfiles.add_path(log_path)
    else:
        self.task_owner = task_owner or None
        self.task_host = task_host or socket.gethostname()

        remote_jobfile_path = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                self.suite, 'suite job log directory',
                self.task_host, self.task_owner),
            common_job_log_path)

        # Remote log files
        self.stdout_file = remote_jobfile_path + ".out"
        self.stderr_file = remote_jobfile_path + ".err"

        # Used in command construction:
        self.jobfile_path = remote_jobfile_path

        # Record paths of remote log files for access by gui, by ssh URL.
        #
        # CURRENTLY DISABLED alternative: if the remote and suite hosts
        # see a common filesystem, or if the remote task is really just a
        # local task with a different owner, we could just use local
        # filesystem access (adding the bare stdout/stderr paths). But to
        # use this: (a) special namespace config would be required to
        # indicate we have a common filesystem, and (b) we'd need to
        # consider how the log directory can be specified (for example
        # use of '$HOME' as for remote task use would not work here as
        # log file access is by gui under the suite owner account).
        url_prefix = self.task_host
        if self.task_owner:
            url_prefix = self.task_owner + "@" + url_prefix
        for log_path in (self.stdout_file, self.stderr_file):
            self.logfiles.add_path(url_prefix + ':' + log_path)

    # set some defaults that can be overridden by derived classes
    self.jobconfig['directive prefix'] = None
    self.jobconfig['directive final'] = "# FINAL DIRECTIVE"
    self.jobconfig['directive connector'] = " "
    self.jobconfig['job vacation signal'] = None

    # overrideable methods
    self.set_directives()
    self.set_job_vacation_signal()
    self.set_scripting()
    self.set_environment()
def init_host(self, reg, host, owner):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Does nothing in single task mode, for a (host, owner) pair already in
    ``self.init_host_map``, or for a target that is not remote. On success
    the pair is recorded in ``self.init_host_map``: False when the remote
    host was found to share the suite host's file system, otherwise
    whether contact/authentication files were copied over.

    Args:
        reg: suite name.
        host: remote host name; None means "localhost".
        owner: remote user name, or None/empty for the default account.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if host is None:
        host = 'localhost'
    if (self.single_task_mode or
            (host, owner) in self.init_host_map or
            not is_remote(host, owner)):
        return
    user_at_host = host
    if owner:
        user_at_host = owner + '@' + host

    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite run directory', host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite job log directory', host, owner)
    r_suite_srv_dir = os.path.join(
        r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

    # Create a UUID file in the service directory.
    # If remote host has the file in its service directory, we can assume
    # that the remote host has a shared file system with the suite host.
    ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
    uuid_str = str(uuid4())
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
    try:
        open(uuid_fname, 'wb').close()
        proc = Popen(
            shlex.split(ssh_tmpl) + [
                '-n', user_at_host,
                'test', '-e', os.path.join(r_suite_srv_dir, uuid_str)],
            stdout=PIPE, stderr=PIPE)
        if proc.wait() == 0:
            # Initialised, but no need to tidy up
            self.init_host_map[(host, owner)] = False
            return
    finally:
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass

    cmds = []
    # Command to create suite directory structure on remote host.
    cmds.append(shlex.split(ssh_tmpl) + [
        '-n', user_at_host,
        'mkdir', '-p', r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
    # The copy command is needed by the python library step below even
    # when the contact files are not copied, so look it up unconditionally.
    scp_tmpl = GLOBAL_CFG.get_host_item('scp command', host, owner)
    # Command to copy contact and authentication files to remote host.
    # Note: no need to do this if task communication method is "poll".
    should_unlink = GLOBAL_CFG.get_host_item(
        'task communication method', host, owner) != "poll"
    if should_unlink:
        # Handle not having SSL certs installed.
        try:
            ssl_cert = self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg)
        except (SuiteServiceFileError, ValueError):
            ssl_cert = None
        cmds.append(shlex.split(scp_tmpl) + [
            '-p',
            self.suite_srv_files_mgr.get_contact_file(reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
            user_at_host + ':' + r_suite_srv_dir + '/'])
        if ssl_cert is not None:
            # Insert before the final (destination) argument.
            cmds[-1].insert(-1, ssl_cert)
    # Command to copy python library to remote host.
    suite_run_py = os.path.join(
        GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory'),
        'python')
    if os.path.isdir(suite_run_py):
        cmds.append(shlex.split(scp_tmpl) + [
            '-pr',
            suite_run_py, user_at_host + ':' + r_suite_run_dir + '/'])
    # Run commands in sequence.
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_INIT,
                user_at_host, ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err)
    self.init_host_map[(host, owner)] = should_unlink
    LOG.info('Initialised %s:%s' % (user_at_host, r_suite_run_dir))
def init_suite_run_dir(self, reg, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Does nothing for the local host, for a ``user_at_host`` already
    recorded in ``self.initialised_hosts``, or in single task mode. On
    success ``self.initialised_hosts[user_at_host]`` is set: False when
    the remote host was found to share the suite host's file system,
    otherwise whether contact/authentication files were copied over.

    Args:
        reg: suite name.
        user_at_host: target in ``[user@]host`` form.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if "@" in user_at_host:
        owner, host = user_at_host.split("@", 1)
    else:
        owner, host = None, user_at_host
    # The cache is keyed by the full "user@host" string (see the
    # assignments below), so the lookup must use the same key.
    if (
        (owner, host) in [(None, "localhost"), (USER, "localhost")]
        or user_at_host in self.initialised_hosts
        or self.single_task_mode
    ):
        return

    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        reg, "suite run directory", host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        reg, "suite job log directory", host, owner)
    r_suite_srv_dir = os.path.join(
        r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

    # Create a UUID file in the service directory.
    # If remote host has the file in its service directory, we can assume
    # that the remote host has a shared file system with the suite host.
    ssh_tmpl = GLOBAL_CFG.get_host_item(
        "remote shell template", host, owner)
    uuid_str = str(uuid4())
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
    try:
        open(uuid_fname, "wb").close()
        proc = Popen(
            shlex.split(ssh_tmpl) + [
                "-n", user_at_host,
                "test", "-e", os.path.join(r_suite_srv_dir, uuid_str)],
            stdout=PIPE,
            stderr=PIPE,
        )
        if proc.wait() == 0:
            # Initialised, but no need to tidy up
            self.initialised_hosts[user_at_host] = False
            return
    finally:
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass

    cmds = []
    # Command to create suite directory structure on remote host.
    cmds.append(
        shlex.split(ssh_tmpl) + [
            "-n", user_at_host,
            "mkdir", "-p", r_suite_run_dir, r_log_job_dir, r_suite_srv_dir]
    )
    # The copy template is needed by the python library step below even
    # when the contact files are not copied, so look it up unconditionally.
    scp_tmpl = GLOBAL_CFG.get_host_item(
        "remote copy template", host, owner)
    # Command to copy contact and authentication files to remote host.
    # Note: no need to do this if task communication method is "poll".
    should_unlink = GLOBAL_CFG.get_host_item(
        "task communication method", host, owner) != "poll"
    if should_unlink:
        cmds.append(
            shlex.split(scp_tmpl) + [
                "-p",
                self.suite_srv_files_mgr.get_contact_file(reg),
                self.suite_srv_files_mgr.get_auth_item(
                    self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
                self.suite_srv_files_mgr.get_auth_item(
                    self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg),
                user_at_host + ":" + r_suite_srv_dir + "/",
            ]
        )
    # Command to copy python library to remote host.
    suite_run_py = os.path.join(
        GLOBAL_CFG.get_derived_host_item(reg, "suite run directory"),
        "python")
    if os.path.isdir(suite_run_py):
        cmds.append(
            shlex.split(scp_tmpl) + [
                "-pr", suite_run_py,
                user_at_host + ":" + r_suite_run_dir + "/"])
    # Run commands in sequence.
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_INIT,
                user_at_host,
                " ".join([quote(item) for item in cmd]),
                proc.returncode,
                out,
                err,
            )
    self.initialised_hosts[user_at_host] = should_unlink
    LOG.info("Initialised %s:%s" % (user_at_host, r_suite_run_dir))
def _write_environment_1(self, handle, job_conf):
    """Suite and task environment.

    Write ``export VAR=VALUE`` lines to ``handle``: first the static suite
    variables from ``self.suite_env``, then the suite directories derived
    for the job's host and owner, then the ``CYLC_TASK_*`` variables.

    Args:
        handle: object with a ``write`` method (the job script being
            written).
        job_conf: mapping of job settings; the keys read here are
            "suite name", "host", "owner", "remote suite path",
            "work sub-directory", "task id", "is cold-start",
            "job file path", "namespace hierarchy",
            "absolute submit number" and "try number".
    """
    handle.write("\n\n# CYLC SUITE ENVIRONMENT:")

    # write the static suite variables
    for var, val in sorted(self.suite_env.items()):
        handle.write("\nexport " + var + "=" + str(val))

    if str(self.suite_env.get("CYLC_UTC")) == "True":
        handle.write("\nexport TZ=UTC")

    handle.write("\n")
    # override and write task-host-specific suite variables
    suite_work_dir = GLOBAL_CFG.get_derived_host_item(
        job_conf["suite name"], "suite work directory",
        job_conf["host"], job_conf["owner"])
    st_env = {}
    st_env["CYLC_SUITE_RUN_DIR"] = GLOBAL_CFG.get_derived_host_item(
        job_conf["suite name"], "suite run directory",
        job_conf["host"], job_conf["owner"])
    st_env["CYLC_SUITE_WORK_DIR"] = suite_work_dir
    st_env["CYLC_SUITE_SHARE_DIR"] = GLOBAL_CFG.get_derived_host_item(
        job_conf["suite name"], "suite share directory",
        job_conf["host"], job_conf["owner"])
    # DEPRECATED
    st_env["CYLC_SUITE_SHARE_PATH"] = "$CYLC_SUITE_SHARE_DIR"
    rsp = job_conf["remote suite path"]
    if rsp:
        st_env["CYLC_SUITE_DEF_PATH"] = rsp
    else:
        # replace home dir with '$HOME' for evaluation on the task host.
        # Literal replacement: the home directory is a path, not a regular
        # expression, so characters such as "." or "+" must match as-is.
        st_env["CYLC_SUITE_DEF_PATH"] = (
            self.suite_env["CYLC_SUITE_DEF_PATH_ON_SUITE_HOST"].replace(
                os.environ["HOME"], "$HOME"))
    for var, val in sorted(st_env.items()):
        handle.write("\nexport " + var + "=" + str(val))

    task_work_dir = os.path.join(
        suite_work_dir, job_conf["work sub-directory"])

    use_login_shell = GLOBAL_CFG.get_host_item(
        "use login shell", job_conf["host"], job_conf["owner"])
    comms = GLOBAL_CFG.get_host_item(
        "task communication method", job_conf["host"], job_conf["owner"])

    task_name, point_string = TaskID.split(job_conf["task id"])
    handle.write("\n\n# CYLC TASK ENVIRONMENT:")
    handle.write("\nexport CYLC_TASK_COMMS_METHOD=" + comms)
    handle.write("\nexport CYLC_TASK_CYCLE_POINT=" + point_string)
    handle.write("\nexport CYLC_TASK_CYCLE_TIME=" + point_string)
    handle.write("\nexport CYLC_TASK_ID=" + job_conf["task id"])
    handle.write(
        "\nexport CYLC_TASK_IS_COLDSTART=" +
        str(job_conf["is cold-start"]))
    handle.write(
        "\nexport CYLC_TASK_LOG_ROOT=" + job_conf["job file path"])
    handle.write(
        "\nexport CYLC_TASK_MSG_MAX_TRIES=" +
        str(GLOBAL_CFG.get(["task messaging", "maximum number of tries"])))
    handle.write(
        "\nexport CYLC_TASK_MSG_RETRY_INTVL=" +
        str(GLOBAL_CFG.get(["task messaging", "retry interval"])))
    handle.write(
        "\nexport CYLC_TASK_MSG_TIMEOUT=" +
        str(GLOBAL_CFG.get(["task messaging", "connection timeout"])))
    handle.write("\nexport CYLC_TASK_NAME=" + task_name)
    handle.write(
        '\nexport CYLC_TASK_NAMESPACE_HIERARCHY="' +
        " ".join(job_conf["namespace hierarchy"]) + '"')
    handle.write(
        "\nexport CYLC_TASK_SSH_LOGIN_SHELL=" + str(use_login_shell))
    handle.write(
        "\nexport CYLC_TASK_SUBMIT_NUMBER=" +
        str(job_conf["absolute submit number"]))
    handle.write(
        "\nexport CYLC_TASK_TRY_NUMBER=" + str(job_conf["try number"]))
    handle.write("\nexport CYLC_TASK_WORK_DIR=" + task_work_dir)
    # DEPRECATED
    handle.write("\nexport CYLC_TASK_WORK_PATH=$CYLC_TASK_WORK_DIR")
    handle.write("\nexport CYLC_JOB_PID=$$")
def _get_host_item(job_conf, key):
    """Return host item from GLOBAL_CFG.

    The item ``key`` is looked up for the host and owner stored under
    ``job_conf["host"]`` and ``job_conf["owner"]``.
    """
    host, owner = job_conf["host"], job_conf["owner"]
    return GLOBAL_CFG.get_host_item(key, host, owner)
def _set_uri(self):
    """Set Pyro URI.

    Determine host and port using content in port file, unless already
    specified. The port file is read over SSH when the suite host or owner
    is remote, and from the local ports directory otherwise. Its first
    line is the port; an optional second line is the host ("localhost" is
    used when absent). ``self.host`` and ``self.port`` are filled in from
    the file when they were None, and ``self.uri`` is set.

    Raises:
        PortFileError: if the port file cannot be read, or its first line
            is missing or not an integer.
    """
    uri_data = {
        "host": self.host,
        "port": self.port,
        "suite": self.suite,
        "owner": self.owner,
        "target": self.target_server_object,
    }
    port_file_path = os.path.join(
        GLOBAL_CFG.get(['pyro', 'ports directory']), self.suite)
    if self.host is None or self.port is None:
        if is_remote_host(self.host) or is_remote_user(self.owner):
            ssh_tmpl = str(GLOBAL_CFG.get_host_item(
                'remote shell template', self.host, self.owner))
            ssh_tmpl = ssh_tmpl.replace(' %s', '')
            user_at_host = ''
            if self.owner:
                user_at_host = self.owner + '@'
            if self.host:
                user_at_host += self.host
            else:
                user_at_host += 'localhost'
            # Leave $HOME unexpanded so that it is evaluated remotely.
            r_port_file_path = port_file_path.replace(
                os.environ['HOME'], '$HOME')
            command = shlex.split(ssh_tmpl) + [
                user_at_host, 'cat', r_port_file_path]
            proc = Popen(command, stdout=PIPE, stderr=PIPE)
            out = proc.communicate()[0]
            if proc.wait():
                raise PortFileError(
                    "Port file '%s:%s' not found - suite not running?." %
                    (user_at_host, r_port_file_path))
        else:
            try:
                # Close the handle deterministically once read.
                with open(port_file_path) as port_file:
                    out = port_file.read()
            except IOError:
                raise PortFileError(
                    "Port file '%s' not found - suite not running?." %
                    (port_file_path))
        lines = out.splitlines()
        try:
            if uri_data["port"] is None:
                uri_data["port"] = int(lines[0])
                self.port = uri_data["port"]
        except (IndexError, ValueError):
            raise PortFileError(
                "ERROR, bad content in port file: %s" % port_file_path)
        if uri_data["host"] is None:
            if len(lines) >= 2:
                uri_data["host"] = lines[1].strip()
            else:
                uri_data["host"] = "localhost"
            self.host = uri_data["host"]
    # Qualify the obj name with user and suite name (unnecessary but
    # can't change it until we break back-compat with older daemons).
    self.uri = (
        'PYROLOC://%(host)s:%(port)s/%(owner)s.%(suite)s.%(target)s' %
        uri_data)
def _load_item_via_ssh(self, item, suite, owner, host, dest_dir=None):
    """Load item (e.g. passphrase) from remote [owner@]host via SSH.

    Runs a small shell script on the remote account that reads the suite
    definition path from ``.cylc/REGDB/SUITE`` and cats the file ``item``
    found there.

    Args:
        item: base name of the file to fetch from the remote suite
            definition directory.
        suite: suite name.
        owner: remote user name; may be empty/None to use the default
            account on ``host``.
        host: remote host name; may be empty/None, in which case
            "localhost" is used (as in the other SSH helpers).
        dest_dir: if not None, the content is written to
            ``dest_dir/item`` (directory mode 0700, file mode 0600).

    Returns:
        None if the target is not remote, the SSH command cannot be
        started or fails, or nothing is retrieved. Otherwise the path of
        the written file when ``dest_dir`` is given, else the content.
    """
    if not is_remote_host(host) and not is_remote_user(owner):
        return None
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-PASSPHRASE] %(suite)s ' % {'suite': suite}
    # Extract suite definition directory from remote ~/.cylc/REGDB/SUITE
    # Attempt to cat passphrase file under suite definition directory
    script = (
        r'''echo -n '%(prefix)s'; '''
        r'''sed -n 's/^path=//p' '.cylc/REGDB/%(suite)s' | '''
        r'''xargs -I '{}' cat '{}/%(item)s'; '''
        r'''echo'''
    ) % {'prefix': prefix, 'suite': suite, 'item': item}
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    ssh_tmpl = str(GLOBAL_CFG.get_host_item(
        'remote shell template', host, owner))
    ssh_tmpl = ssh_tmpl.replace(' %s', '')  # back compat
    import shlex
    # The guard above lets through "remote host, default user" and
    # "remote user, default host", so either part may be unset here.
    user_at_host = host or 'localhost'
    if owner:
        user_at_host = owner + '@' + user_at_host
    command = shlex.split(ssh_tmpl) + ['-n', user_at_host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return None
    out, err = proc.communicate()
    ret_code = proc.wait()

    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    if item == self.PASSPHRASE_FILE_BASE:
        content = None
        for line in out.splitlines():
            if line.startswith(prefix):
                content = line.replace(prefix, '').strip()
    else:
        # Other items may span several lines: keep everything from the
        # prefixed line onwards.
        content = []
        content_has_started = False
        for line in out.splitlines():
            if line.startswith(prefix):
                line = line.replace(prefix, '')
                content_has_started = True
            if content_has_started:
                content.append(line)
        content = "\n".join(content)
    if not content or ret_code:
        if cylc.flags.debug:
            # STDOUT may contain passphrase, so not safe to print
            sys.stderr.write((
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                'err': err,
                'ret_code': ret_code,
            } + '\n')
        return None
    if dest_dir is not None:
        import stat
        owner_rw = stat.S_IRUSR | stat.S_IWUSR  # 0600
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        os.chmod(dest_dir, stat.S_IRWXU)  # 0700
        dest_item = os.path.join(dest_dir, item)
        # Create the file owner-only from the start, so that the secret is
        # never readable by others between creation and chmod.
        file_desc = os.open(
            dest_item, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, owner_rw)
        file_handle = os.fdopen(file_desc, "w")
        try:
            file_handle.write(content)
        finally:
            file_handle.close()
        # A pre-existing file keeps its old mode through os.open().
        os.chmod(dest_item, owner_rw)
        return dest_item
    return content
def detect_old_contact_file(self, reg, check_host_port=None):
    """Detect old suite contact file.

    If old contact file does not exist, do nothing. If old contact file
    exists, but suite process is definitely not alive, remove old contact
    file. If old contact file exists and suite process still alive, raise
    SuiteServiceFileError.

    If check_host_port is specified and does not match the (host, port)
    value in the old contact file, raise AssertionError.

    Args:
        reg (str): suite name
        check_host_port (tuple): (host, port) to check against

    Raise:
        AssertionError:
            If old contact file exists but does not have matching
            (host, port) with value of check_host_port.
        SuiteServiceFileError:
            If old contact file exists and the suite process still alive.
    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = self.load_contact_file(reg)
        old_host = data[self.KEY_HOST]
        old_port = data[self.KEY_PORT]
        old_proc_str = data[self.KEY_PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    if check_host_port and check_host_port != (old_host, int(old_port)):
        raise AssertionError("%s != (%s, %s)" % (
            check_host_port, old_host, old_port))
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    # Terminate command after 10 seconds to prevent hanging, etc.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["timeout", "10", "ps", self.PS_OPTS, str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        from cylc.cfgspec.globalcfg import GLOBAL_CFG
        ssh_str = str(GLOBAL_CFG.get_host_item("ssh command", old_host))
        cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
    from subprocess import Popen, PIPE
    from time import sleep, time
    # The child gets its own copy of the descriptor, so the parent's
    # handle can be closed as soon as the process has been started.
    devnull = open(os.devnull)
    try:
        proc = Popen(cmd, stdin=devnull, stdout=PIPE, stderr=PIPE)
    finally:
        devnull.close()
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    while proc.poll() is None:
        if time() > timeout:
            proc.terminate()
        sleep(0.1)
    fname = self.get_contact_file(reg)
    ret_code = proc.wait()
    out, err = proc.communicate()
    if cylc.flags.debug and ret_code:
        sys.stderr.write(
            "%s # return %d\n%s\n" % (' '.join(cmd), ret_code, err))
    for line in reversed(out.splitlines()):
        fields = line.split(None, 1)  # empty for a blank line
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        elif fields and fields[0] == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break

    raise SuiteServiceFileError(
        (
            r"""ERROR, suite contact file exists: %(fname)s

Suite "%(suite)s" is already running, and listening at "%(host)s:%(port)s".

To start a new run, stop the old one first with one or more of these:
* cylc stop %(suite)s # wait for active tasks/event handlers
* cylc stop --kill %(suite)s # kill active tasks and wait
* cylc stop --now %(suite)s # don't wait for active tasks
* cylc stop --now --now %(suite)s # don't wait
* ssh -n "%(host)s" kill %(pid)s # final brute force!
"""
        ) % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        }
    )
def _set_uri(self):
    """Set Pyro URI.

    Determine host and port using content in port file, unless already
    specified. The port file is read over SSH when the suite host or owner
    is remote, and from the local ports directory otherwise. Its first
    line is the port; an optional second line is the host ("localhost" is
    used when absent). ``self.host`` and ``self.port`` are filled in from
    the file when they were None, and ``self.uri`` is set.

    Raises:
        PortFileError: if the port file cannot be read, or its first line
            is missing or not an integer.
    """
    uri_data = {
        "host": self.host,
        "port": self.port,
        "suite": self.suite,
        "owner": self.owner,
        "target": self.target_server_object}
    port_file_path = os.path.join(
        GLOBAL_CFG.get(['pyro', 'ports directory']), self.suite)

    if self.host is None or self.port is None:
        if is_remote_host(self.host) or is_remote_user(self.owner):
            ssh_tmpl = str(GLOBAL_CFG.get_host_item(
                'remote shell template', self.host, self.owner))
            ssh_tmpl = ssh_tmpl.replace(' %s', '')
            user_at_host = ''
            if self.owner:
                user_at_host = self.owner + '@'
            if self.host:
                user_at_host += self.host
            else:
                user_at_host += 'localhost'
            # Leave $HOME unexpanded so that it is evaluated remotely.
            r_port_file_path = port_file_path.replace(
                os.environ['HOME'], '$HOME')
            command = shlex.split(ssh_tmpl) + [
                user_at_host, 'cat', r_port_file_path]
            proc = Popen(command, stdout=PIPE, stderr=PIPE)
            out = proc.communicate()[0]
            if proc.wait():
                raise PortFileError(
                    "Port file '%s:%s' not found - suite not running?." %
                    (user_at_host, r_port_file_path))
        else:
            try:
                # Close the handle deterministically once read.
                with open(port_file_path) as port_file:
                    out = port_file.read()
            except IOError:
                raise PortFileError(
                    "Port file '%s' not found - suite not running?." %
                    (port_file_path))

        lines = out.splitlines()
        try:
            if uri_data["port"] is None:
                uri_data["port"] = int(lines[0])
                self.port = uri_data["port"]
        except (IndexError, ValueError):
            raise PortFileError(
                "ERROR, bad content in port file: %s" % port_file_path)
        if uri_data["host"] is None:
            if len(lines) >= 2:
                uri_data["host"] = lines[1].strip()
            else:
                uri_data["host"] = "localhost"
            self.host = uri_data["host"]

    # Qualify the obj name with user and suite name (unnecessary but
    # can't change it until we break back-compat with older daemons).
    self.uri = (
        'PYROLOC://%(host)s:%(port)s/%(owner)s.%(suite)s.%(target)s' %
        uri_data)
def detect_old_contact_file(self, reg):
    """Detect old suite contact file.

    Raise SuiteServiceFileError if old contact file exists, and there is
    evidence that the old suite is still running.

    If the contact file cannot be loaded, do nothing. If "ps" (run over
    the configured "ssh command" when the old host is remote) reports only
    its header for the recorded process ID, the contact file is removed
    and the method returns. Otherwise advice is written to STDERR before
    the exception is raised.

    Args:
        reg (str): suite name
    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = self.load_contact_file(reg)
        old_host = data[self.KEY_HOST]
        old_port = data[self.KEY_PORT]
        old_proc_str = data[self.KEY_PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["ps", "-opid,args", str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        from cylc.cfgspec.globalcfg import GLOBAL_CFG
        ssh_str = str(GLOBAL_CFG.get_host_item("ssh command", old_host))
        cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
    from subprocess import Popen, PIPE
    from time import sleep, time
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    while proc.poll() is None:
        if time() > timeout:
            proc.terminate()
        sleep(0.1)
    fname = self.get_contact_file(reg)
    proc.wait()
    for line in reversed(proc.communicate()[0].splitlines()):
        fields = line.split(None, 1)  # empty for a blank line
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        elif fields and fields[0] == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break

    sys.stderr.write(
        (
            r"""ERROR, suite contact file exists: %(fname)s

If %(suite)s is not running, delete the suite contact file and try again.
If it is running but unresponsive, kill any left over suite processes too.

To see if %(suite)s is running on '%(host)s:%(port)s':
 * cylc scan -n '\b%(suite)s\b' '%(host)s'
 * cylc ping -v --host='%(host)s' '%(suite)s'
 * ssh -n '%(host)s' 'ps -o pid,args %(pid)s'

"""
        ) % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        }
    )
    raise SuiteServiceFileError(
        "ERROR, suite contact file exists: %s" % fname)
def run(self):
    """Invoke the tailer.

    Build a tail command for ``self.filename`` (run through the configured
    remote shell when the name has the form ``[user@]host:path``), start
    it, and feed complete output lines that match every pattern in
    ``self.filters`` to ``self.update_gui`` via ``gobject.idle_add``.
    Loops until ``self.quit`` is set or the process exits, then calls
    ``self.stop()``. A warning dialog is scheduled if the command cannot
    be started or its output cannot be read.
    """
    command = []
    if ":" in self.filename:  # remote
        # Split on the first colon only, so that a remote path which
        # itself contains ":" does not break the unpacking.
        user_at_host, filename = self.filename.split(':', 1)
        if "@" in user_at_host:
            owner, host = user_at_host.split("@", 1)
        else:
            owner, host = (None, user_at_host)
        ssh = str(
            GLOBAL_CFG.get_host_item("remote shell template", host, owner)
        ).replace(" %s", "")
        command = shlex.split(ssh) + ["-n", user_at_host]
        cmd_tmpl = str(
            GLOBAL_CFG.get_host_item(
                "remote tail command template", host, owner))
    else:
        filename = self.filename
        cmd_tmpl = str(
            GLOBAL_CFG.get_host_item("local tail command template"))

    # An explicitly supplied template overrides the configured one.
    if self.cmd_tmpl:
        cmd_tmpl = self.cmd_tmpl
    command += shlex.split(cmd_tmpl % {"filename": filename})
    try:
        self.proc = Popen(
            command, stdout=PIPE, stderr=STDOUT, preexec_fn=os.setpgrp)
    except OSError as exc:
        # E.g. ssh command not found
        dialog = warning_dialog(
            "%s: %s" % (exc, " ".join([quote(item) for item in command])))
        gobject.idle_add(dialog.warn)
        return
    poller = select.poll()
    poller.register(self.proc.stdout.fileno())

    buf = ""  # holds a trailing partial line until its newline arrives
    while not self.quit and self.proc.poll() is None:
        try:
            self.pollable.poll()
        except (TypeError, AttributeError):
            pass
        if self.freeze or not poller.poll(100):  # 100 ms timeout
            sleep(1)
            continue
        # Both self.proc.stdout.read(SIZE) and self.proc.stdout.readline()
        # can block. However os.read(FILENO, SIZE) should be fine after a
        # poller.poll().
        try:
            data = os.read(self.proc.stdout.fileno(), self.READ_SIZE)
        except (IOError, OSError) as exc:
            dialog = warning_dialog(
                "%s: %s" % (
                    exc, " ".join([quote(item) for item in command])))
            gobject.idle_add(dialog.warn)
            break
        if data:
            # Manage buffer, only add full lines to display to ensure
            # filtering and tagging work
            for line in data.splitlines(True):
                if not line.endswith("\n"):
                    buf += line
                    continue
                elif buf:
                    line = buf + line
                    buf = ""
                if (not self.filters or
                        all([re.search(f, line) for f in self.filters])):
                    gobject.idle_add(self.update_gui, line)
        sleep(0.01)
    self.stop()
def _set_uri(self): """Set Pyro URI. Determine host and port using content in port file, unless already specified. """ if ((self.host is None or self.port is None) and 'CYLC_SUITE_RUN_DIR' in os.environ): # Looks like we are in a running task job, so we should be able to # use "cylc-suite-env" file under the suite running directory try: suite_env = CylcSuiteEnv.load(self.suite, os.environ['CYLC_SUITE_RUN_DIR']) except CylcSuiteEnvLoadError: if cylc.flags.debug: traceback.print_exc() else: self.host = suite_env.suite_host self.port = suite_env.suite_port self.owner = suite_env.suite_owner if self.host is None or self.port is None: port_file_path = os.path.join( GLOBAL_CFG.get(['pyro', 'ports directory']), self.suite) if is_remote_host(self.host) or is_remote_user(self.owner): ssh_tmpl = str( GLOBAL_CFG.get_host_item('remote shell template', self.host, self.owner)) ssh_tmpl = ssh_tmpl.replace(' %s', '') user_at_host = '' if self.owner: user_at_host = self.owner + '@' if self.host: user_at_host += self.host else: user_at_host += 'localhost' r_port_file_path = port_file_path.replace( os.environ['HOME'], '$HOME') command = shlex.split(ssh_tmpl) + [ user_at_host, 'cat', r_port_file_path ] proc = Popen(command, stdout=PIPE, stderr=PIPE) out, err = proc.communicate() ret_code = proc.wait() if ret_code: if cylc.flags.debug: print >> sys.stderr, { "code": ret_code, "command": command, "stdout": out, "stderr": err } raise PortFileError( "Port file '%s:%s' not found - suite not running?." % (user_at_host, r_port_file_path)) else: try: out = open(port_file_path).read() except IOError: raise PortFileError( "Port file '%s' not found - suite not running?." 
% (port_file_path)) lines = out.splitlines() try: if self.port is None: self.port = int(lines[0]) except (IndexError, ValueError): raise PortFileError("ERROR, bad content in port file: %s" % port_file_path) if self.host is None: if len(lines) >= 2: self.host = lines[1].strip() else: self.host = get_hostname() # Qualify the obj name with user and suite name (unnecessary but # can't change it until we break back-compat with older daemons). self.uri = ( 'PYROLOC://%(host)s:%(port)s/%(owner)s.%(suite)s.%(target)s' % { "host": self.host, "port": self.port, "suite": self.suite, "owner": self.owner, "target": self.target_server_object })
def detect_old_contact_file(self, reg):
    """Detect old suite contact file.

    Raise SuiteServiceFileError if old contact file exists, and there is
    evidence that the old suite is still running.

    Return None, without raising, if:
    * The contact file cannot be loaded (IOError, ValueError or
      SuiteServiceFileError from "self.load_contact_file").
    * "ps" ran but listed only its header, and the stale contact file was
      removed successfully.

    In every other case (matching process found, "ps" produced no usable
    output, or the stale contact file could not be removed), write advice
    to STDERR and raise SuiteServiceFileError.

    Args:
        reg (str): suite name.
    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = self.load_contact_file(reg)
        old_host = data[self.KEY_HOST]
        old_port = data[self.KEY_PORT]
        old_proc_str = data[self.KEY_PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["ps", "-opid,args", str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        from cylc.cfgspec.globalcfg import GLOBAL_CFG
        ssh_tmpl = str(GLOBAL_CFG.get_host_item(
            "remote shell template", old_host))
        cmd = shlex.split(ssh_tmpl) + ["-n", old_host] + cmd
    from subprocess import Popen, PIPE
    from time import sleep, time
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    while proc.poll() is None:
        if time() > timeout:
            proc.terminate()
        sleep(0.1)
    fname = self.get_contact_file(reg)
    proc.wait()
    # Scan the output bottom-up: a matching process line, if any, comes
    # after the "PID ..." header line.
    for line in reversed(proc.communicate()[0].splitlines()):
        fields = line.split(None, 1)
        if not fields:
            # Blank or whitespace-only line (e.g. from an SSH banner):
            # nothing to inspect, and indexing "fields" would fail.
            continue
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        elif fields[0] == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break

    sys.stderr.write(
        (
            r"""ERROR, suite contact file exists: %(fname)s

If %(suite)s is not running, delete the suite contact file and try again.
If it is running but unresponsive, kill any left over suite processes too.

To see if %(suite)s is running on '%(host)s:%(port)s':
 * cylc scan -n '\b%(suite)s\b' '%(host)s'
 * cylc ping -v --host='%(host)s' '%(suite)s'
 * ssh -n '%(host)s' 'ps -o pid,args %(pid)s'

"""
        ) % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        }
    )
    raise SuiteServiceFileError(
        "ERROR, suite contact file exists: %s" % fname)
def update_suites_info(updater, full_mode=False):
    """Return mapping of suite info by host, owner and suite name.

    Args:
        updater (object): gscan or gpanel updater:
            Compulsory attributes from updater:
                hosts: hosts to scan
                owner_pattern: re to filter results by owners
                suite_info_map: previous results returned by this function
                quit: if true, stop and return None
            Optional attributes from updater:
                comms_timeout: communication timeout
                name_pattern: re to filter results by suite names
        full_mode (boolean): update in full mode?

    Return:
        dict: {(host, owner, name): suite_info, ...}
        where each "suite_info" is a dict with keys:
            KEY_GROUP: group name of suite
            KEY_OWNER: suite owner name
            KEY_PORT: suite port, for running suites only
            KEY_STATES: suite state
            KEY_TASKS_BY_STATE: tasks by state
            KEY_TITLE: suite title
            KEY_UPDATE_TIME: last update time of suite
        None is returned instead if "updater.quit" becomes true part way
        through.
    """
    # Compulsory attributes from updater
    # hosts - hosts to scan, or the default set in the site/user global.rc
    # owner_pattern - return only suites with owners matching this compiled re
    # suite_info_map - previous results returned by this function
    # Optional attributes from updater
    # comms_timeout - communication timeout
    owner_pattern = updater.owner_pattern
    timeout = getattr(updater, "comms_timeout", None)
    # name_pattern - return only suites with names matching this compiled re
    name_pattern = getattr(updater, "name_pattern", None)
    # Determine items to scan
    results = {}
    items = []
    if full_mode and not updater.hosts:
        # Scan users suites. Walk "~/cylc-run/" to get (host, port) from
        # ".service/contact" for active suites
        suite_srv_files_mgr = SuiteSrvFilesManager()
        if owner_pattern is None:
            # Run directory of current user only
            run_dirs = [GLOBAL_CFG.get_host_item('run directory')]
        else:
            # Run directory of all users matching "owner_pattern".
            # But skip those with /nologin or /false shells
            run_dirs = []
            skips = ('/false', '/nologin')
            for pwent in getpwall():
                if any(pwent.pw_shell.endswith(s) for s in (skips)):
                    continue
                if owner_pattern.match(pwent.pw_name):
                    run_dirs.append(
                        GLOBAL_CFG.get_host_item(
                            'run directory',
                            owner=pwent.pw_name,
                            owner_home=pwent.pw_dir))
        if cylc.flags.debug:
            sys.stderr.write('Listing suites:%s%s\n' % (
                _UPDATE_DEBUG_DELIM, _UPDATE_DEBUG_DELIM.join(run_dirs)))
        for run_d in run_dirs:
            for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
                if updater.quit:
                    return
                # Always descend for top directory, but
                # don't descend further if it has a:
                # * .service/
                # * cylc-suite.db: (pre-cylc-7 suites don't have ".service/").
                if dirpath != run_d and (
                        suite_srv_files_mgr.DIR_BASE_SRV in dnames or
                        'cylc-suite.db' in fnames):
                    dnames[:] = []
                # Choose only suites with .service and matching filter
                reg = os.path.relpath(dirpath, run_d)
                try:
                    contact_data = suite_srv_files_mgr.load_contact_file(reg)
                except (SuiteServiceFileError, IOError, TypeError,
                        ValueError):
                    continue
                else:
                    items.append((
                        contact_data[suite_srv_files_mgr.KEY_HOST],
                        contact_data[suite_srv_files_mgr.KEY_PORT]))
    elif full_mode:
        # Scan full port range on all hosts
        items.extend(updater.hosts)
    else:
        # Scan suites in previous results only
        for (host, owner, name), prev_result in (
                updater.suite_info_map.items()):
            port = prev_result.get(KEY_PORT)
            if port:
                items.append((host, port))
            else:
                results[(host, owner, name)] = prev_result
    if not items:
        return results
    if cylc.flags.debug:
        sys.stderr.write('Scan items:%s%s\n' % (
            _UPDATE_DEBUG_DELIM,
            _UPDATE_DEBUG_DELIM.join(str(item) for item in items)))
    # Scan
    for host, port, result in scan_many(
            items, timeout=timeout, updater=updater):
        if updater.quit:
            return
        try:
            # The filter reads the same keys as the statements below, so it
            # lives inside the "try" too: a result with a missing or
            # unusable name/owner is skipped rather than raising.
            if ((name_pattern and
                 not name_pattern.match(result[KEY_NAME])) or
                    (owner_pattern and
                     not owner_pattern.match(result[KEY_OWNER]))):
                continue
            result[KEY_PORT] = port
            results[(host, result[KEY_OWNER], result[KEY_NAME])] = result
            result[KEY_UPDATE_TIME] = int(float(result[KEY_UPDATE_TIME]))
        except (KeyError, TypeError, ValueError):
            pass
    expire_threshold = time() - DURATION_EXPIRE_STOPPED
    for (host, owner, name), prev_result in updater.suite_info_map.items():
        if updater.quit:
            return
        if ((host, owner, name) in results or
                owner_pattern and not owner_pattern.match(owner) or
                name_pattern and not name_pattern.match(name)):
            # OK if suite already in current results set.
            # Don't bother if:
            # * previous owner does not match current owner pattern
            # * previous suite name does not match current name pattern
            continue
        if prev_result.get(KEY_PORT):
            # A previously running suite is no longer running.
            # Get suite info with "cat-state", if possible, and include in the
            # results set.
            try:
                prev_result = _update_stopped_suite_info((host, owner, name))
            except (IndexError, TypeError, ValueError):
                continue
        # Keep stopped suites only until their last update time expires.
        if prev_result.get(KEY_UPDATE_TIME, 0) > expire_threshold:
            results[(host, owner, name)] = prev_result
    return results
def init_host(self, reg, host, owner):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Do nothing if the target is the local host with no owner or with the
    current user, if (host, owner) is already in "self.init_host_map", or
    in single task mode.

    On completion, "self.init_host_map[(host, owner)]" is set to:
    * False if the remote host can already see the suite service
      directory, so nothing was installed.
    * Otherwise, whether contact and authentication files were copied
      (i.e. the task communication method is not "poll").

    Args:
        reg (str): suite name.
        host (str): job host name, None means "localhost".
        owner (str): user name on the job host, may be None.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if host is None:
        host = 'localhost'
    if ((host, owner) in [('localhost', None), ('localhost', USER)] or
            (host, owner) in self.init_host_map or self.single_task_mode):
        return
    user_at_host = host
    if owner:
        user_at_host = owner + '@' + host

    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite run directory', host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite job log directory', host, owner)
    r_suite_srv_dir = os.path.join(
        r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

    # Create a UUID file in the service directory.
    # If remote host has the file in its service directory, we can assume
    # that the remote host has a shared file system with the suite host.
    ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
    uuid_str = str(uuid4())
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
    try:
        open(uuid_fname, 'wb').close()
        proc = Popen(
            shlex.split(ssh_tmpl) + [
                '-n', user_at_host,
                'test', '-e', os.path.join(r_suite_srv_dir, uuid_str)],
            stdout=PIPE, stderr=PIPE)
        # Use communicate() rather than wait(): it drains the pipes, so
        # the child cannot block on a full pipe buffer.
        proc.communicate()
        if proc.returncode == 0:
            # Initialised, but no need to tidy up
            self.init_host_map[(host, owner)] = False
            return
    finally:
        # Best effort removal of the local probe file.
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass

    cmds = []
    # Command to create suite directory structure on remote host.
    cmds.append(shlex.split(ssh_tmpl) + [
        '-n', user_at_host,
        'mkdir', '-p', r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
    # The copy command is needed by both copy steps below, so look it up
    # regardless of the task communication method.
    scp_tmpl = GLOBAL_CFG.get_host_item('scp command', host, owner)
    # Command to copy contact and authentication files to remote host.
    # Note: no need to do this if task communication method is "poll".
    should_unlink = GLOBAL_CFG.get_host_item(
        'task communication method', host, owner) != "poll"
    if should_unlink:
        cmds.append(shlex.split(scp_tmpl) + [
            '-p',
            self.suite_srv_files_mgr.get_contact_file(reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg),
            user_at_host + ':' + r_suite_srv_dir + '/'])
    # Command to copy python library to remote host.
    suite_run_py = os.path.join(
        GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory'),
        'python')
    if os.path.isdir(suite_run_py):
        cmds.append(shlex.split(scp_tmpl) + [
            '-pr',
            suite_run_py, user_at_host + ':' + r_suite_run_dir + '/'])
    # Run commands in sequence.
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_INIT,
                user_at_host, ' '.join([quote(item) for item in cmd]),
                proc.returncode, out, err)
    self.init_host_map[(host, owner)] = should_unlink
    LOG.info('Initialised %s:%s' % (user_at_host, r_suite_run_dir))
def _write_environment_1(self, handle, job_conf):
    """Suite and task environment.

    Write "export VAR=VALUE" lines to the job script in three groups:
    the static suite variables in "self.suite_env", the suite variables
    that depend on the task host, and the task variables.

    Args:
        handle: file-like object with a "write" method, receiving the job
            script text.
        job_conf (dict): job configuration. Keys read here: 'suite name',
            'host', 'owner', 'remote suite path', 'work sub-directory',
            'task id', 'is cold-start', 'job file path',
            'namespace hierarchy', 'submit num' and 'try number'.
    """
    handle.write("\n\n# CYLC SUITE ENVIRONMENT:")

    # write the static suite variables
    for var, val in sorted(self.suite_env.items()):
        handle.write("\nexport " + var + "=" + str(val))

    if str(self.suite_env.get('CYLC_UTC')) == 'True':
        handle.write("\nexport TZ=UTC")

    handle.write("\n")
    # override and write task-host-specific suite variables
    suite_work_dir = GLOBAL_CFG.get_derived_host_item(
        job_conf['suite name'], 'suite work directory',
        job_conf['host'], job_conf['owner'])
    st_env = {}
    st_env['CYLC_SUITE_RUN_DIR'] = GLOBAL_CFG.get_derived_host_item(
        job_conf['suite name'], 'suite run directory',
        job_conf['host'], job_conf['owner'])
    st_env['CYLC_SUITE_WORK_DIR'] = suite_work_dir
    st_env['CYLC_SUITE_SHARE_DIR'] = GLOBAL_CFG.get_derived_host_item(
        job_conf['suite name'], 'suite share directory',
        job_conf['host'], job_conf['owner'])
    # DEPRECATED
    st_env['CYLC_SUITE_SHARE_PATH'] = '$CYLC_SUITE_SHARE_DIR'
    rsp = job_conf['remote suite path']
    if rsp:
        st_env['CYLC_SUITE_DEF_PATH'] = rsp
    else:
        # replace home dir with '$HOME' for evaluation on the task host
        # (literal text replacement: the home directory is a path, not a
        # regular expression, so any metacharacters in it must not be
        # interpreted).
        st_env['CYLC_SUITE_DEF_PATH'] = (
            self.suite_env['CYLC_SUITE_DEF_PATH_ON_SUITE_HOST'].replace(
                os.environ['HOME'], '$HOME'))
    for var, val in sorted(st_env.items()):
        handle.write("\nexport " + var + "=" + str(val))

    task_work_dir = os.path.join(
        suite_work_dir, job_conf['work sub-directory'])

    use_login_shell = GLOBAL_CFG.get_host_item(
        'use login shell', job_conf['host'], job_conf['owner'])
    comms = GLOBAL_CFG.get_host_item(
        'task communication method', job_conf['host'], job_conf['owner'])

    task_name, point_string = TaskID.split(job_conf['task id'])
    handle.write("\n\n# CYLC TASK ENVIRONMENT:")
    handle.write("\nexport CYLC_TASK_COMMS_METHOD=" + comms)
    handle.write("\nexport CYLC_TASK_CYCLE_POINT=" + point_string)
    handle.write("\nexport CYLC_TASK_CYCLE_TIME=" + point_string)
    handle.write("\nexport CYLC_TASK_ID=" + job_conf['task id'])
    handle.write(
        "\nexport CYLC_TASK_IS_COLDSTART=" +
        str(job_conf['is cold-start']))
    handle.write(
        "\nexport CYLC_TASK_LOG_ROOT=" + job_conf['job file path'])
    handle.write(
        "\nexport CYLC_TASK_MSG_MAX_TRIES=" +
        str(GLOBAL_CFG.get(['task messaging', 'maximum number of tries'])))
    handle.write(
        "\nexport CYLC_TASK_MSG_RETRY_INTVL=%f" %
        GLOBAL_CFG.get(['task messaging', 'retry interval']))
    handle.write(
        "\nexport CYLC_TASK_MSG_TIMEOUT=%f" %
        GLOBAL_CFG.get(['task messaging', 'connection timeout']))
    handle.write("\nexport CYLC_TASK_NAME=" + task_name)
    handle.write(
        '\nexport CYLC_TASK_NAMESPACE_HIERARCHY="' +
        ' '.join(job_conf['namespace hierarchy']) + '"')
    handle.write(
        "\nexport CYLC_TASK_SSH_LOGIN_SHELL=" + str(use_login_shell))
    handle.write(
        "\nexport CYLC_TASK_SUBMIT_NUMBER=" + str(job_conf['submit num']))
    handle.write(
        "\nexport CYLC_TASK_TRY_NUMBER=" + str(job_conf['try number']))
    handle.write("\nexport CYLC_TASK_WORK_DIR=" + task_work_dir)
    # DEPRECATED
    handle.write("\nexport CYLC_TASK_WORK_PATH=$CYLC_TASK_WORK_DIR")
    # "$$" is left for the job script's shell to expand.
    handle.write("\nexport %s=$$" % (TaskMessage.CYLC_JOB_PID))
def _set_uri(self): """Set Pyro URI. Determine host and port using content in port file, unless already specified. """ if ((self.host is None or self.port is None) and 'CYLC_SUITE_RUN_DIR' in os.environ): # Looks like we are in a running task job, so we should be able to # use "cylc-suite-env" file under the suite running directory try: suite_env = CylcSuiteEnv.load( self.suite, os.environ['CYLC_SUITE_RUN_DIR']) except CylcSuiteEnvLoadError: if cylc.flags.debug: traceback.print_exc() else: self.host = suite_env.suite_host self.port = suite_env.suite_port self.owner = suite_env.suite_owner if self.host is None or self.port is None: port_file_path = os.path.join( GLOBAL_CFG.get(['pyro', 'ports directory']), self.suite) if is_remote_host(self.host) or is_remote_user(self.owner): ssh_tmpl = str(GLOBAL_CFG.get_host_item( 'remote shell template', self.host, self.owner)) ssh_tmpl = ssh_tmpl.replace(' %s', '') user_at_host = '' if self.owner: user_at_host = self.owner + '@' if self.host: user_at_host += self.host else: user_at_host += 'localhost' r_port_file_path = port_file_path.replace( os.environ['HOME'], '$HOME') command = shlex.split(ssh_tmpl) + [ user_at_host, 'cat', r_port_file_path] proc = Popen(command, stdout=PIPE, stderr=PIPE) out, err = proc.communicate() ret_code = proc.wait() if ret_code: if cylc.flags.debug: print >> sys.stderr, { "code": ret_code, "command": command, "stdout": out, "stderr": err} raise PortFileError( "Port file '%s:%s' not found - suite not running?." % (user_at_host, r_port_file_path)) else: try: out = open(port_file_path).read() except IOError: raise PortFileError( "Port file '%s' not found - suite not running?." 
% (port_file_path)) lines = out.splitlines() try: if self.port is None: self.port = int(lines[0]) except (IndexError, ValueError): raise PortFileError( "ERROR, bad content in port file: %s" % port_file_path) if self.host is None: if len(lines) >= 2: self.host = lines[1].strip() else: self.host = get_hostname() # Qualify the obj name with user and suite name (unnecessary but # can't change it until we break back-compat with older daemons). self.uri = ( 'PYROLOC://%(host)s:%(port)s/%(owner)s.%(suite)s.%(target)s' % { "host": self.host, "port": self.port, "suite": self.suite, "owner": self.owner, "target": self.target_server_object})