def test_get_host_from_platform_fails_bad_method():
    """An unsupported host selection method raises a CylcError.

    The platform is deep-copied before it is modified.  A plain
    ``TEST_PLATFORM.copy()`` is shallow, so assigning into
    ``platform['selection']`` would also rewrite the module-level
    ``TEST_PLATFORM`` and leak the bad method into every test that runs
    afterwards.
    """
    from copy import deepcopy

    platform = deepcopy(TEST_PLATFORM)
    platform['selection']['method'] = 'roulette'
    with pytest.raises(CylcError) as err:
        get_host_from_platform(platform, {'Elephant'})
    assert err.exconly() == (
        'cylc.flow.exceptions.CylcError: method "roulette" is not a '
        'supported host selection method.')
def get_task_auth(self, suite_name: str, task_name: str) -> Union[str, None]:
    """Look up the host of a remote task in a Cylc workflow definition.

    Returns:
        The host selected for the task's platform, or None when:
            - the task has not been defined.
            - the task does not run remotely.
    """
    # n.b. Imports are function-scoped to avoid a dependency on Cylc and
    # Cylc-Rose if Rose is being used with a different workflow engine.
    from cylc.flow.platforms import get_host_from_platform
    from cylc.flow.hostuserutil import is_remote_platform
    from cylc.rose.platform_utils import get_platform_from_task_def

    try:
        platform = get_platform_from_task_def(suite_name, task_name)
    except KeyError:
        # The task has not been defined.
        return None

    if not is_remote_platform(platform):
        return None
    return get_host_from_platform(platform)
def _setup_job_logs_retrieval(self, itask, event): """Set up remote job logs retrieval. For a task with a job completion event, i.e. succeeded, failed, (execution) retry. """ id_key = ((self.HANDLER_JOB_LOGS_RETRIEVE, event), str(itask.point), itask.tdef.name, itask.submit_num) events = (self.EVENT_FAILED, self.EVENT_RETRY, self.EVENT_SUCCEEDED) host = get_host_from_platform(itask.platform) if (event not in events or not is_remote_host(host) or not self.get_host_conf(itask, "retrieve job logs") or id_key in self.event_timers): return retry_delays = self.get_host_conf(itask, "retrieve job logs retry delays") if not retry_delays: retry_delays = [0] self.event_timers[id_key] = TaskActionTimer( TaskJobLogsRetrieveContext( self.HANDLER_JOB_LOGS_RETRIEVE, # key self.HANDLER_JOB_LOGS_RETRIEVE, # ctx_type itask.platform['name'], self.get_host_conf(itask, "retrieve job logs max size"), ), retry_delays)
def _process_job_logs_retrieval(self, schd_ctx, ctx, id_keys):
    """Queue the command that fetches task job logs from a remote host.

    The command is built from the platform's "retrieve job logs command"
    and "ssh command" settings, restricted to the job log directories of
    the task jobs named in ``id_keys``, and handed to the process pool
    with ``self._job_logs_retrieval_callback`` as its callback.
    """
    platform = get_platform(ctx.platform_n)
    remote_shell = str(platform["ssh command"])
    retrieve_cmd = shlex.split(str(platform["retrieve job logs command"]))
    retrieve_cmd.append("--rsh=" + remote_shell)
    if LOG.isEnabledFor(DEBUG):
        retrieve_cmd.append("-v")
    if ctx.max_size:
        retrieve_cmd.append("--max-size=%s" % (ctx.max_size, ))

    # Includes and excludes: every directory level down to each job's log
    # directory has to be included explicitly.
    include_paths = set()
    for _, point, name, submit_num in id_keys:
        include_paths.update((
            "/%s" % (point),
            "/%s/%s" % (point, name),
            "/%s/%s/%02d" % (point, name, submit_num),
            "/%s/%s/%02d/**" % (point, name, submit_num),
        ))
    retrieve_cmd.extend(
        "--include=%s" % (path) for path in sorted(include_paths))
    retrieve_cmd.append("--exclude=/**")  # exclude everything else

    # Remote source
    source_host = get_host_from_platform(platform)
    source_dir = get_remote_suite_run_job_dir(platform, schd_ctx.suite)
    retrieve_cmd.append("%s:%s/" % (source_host, source_dir))
    # Local target
    retrieve_cmd.append(get_suite_run_job_dir(schd_ctx.suite) + "/")

    sub_ctx = SubProcContext(
        ctx, retrieve_cmd, env=dict(os.environ), id_keys=id_keys)
    self.proc_pool.put_command(
        sub_ctx, self._job_logs_retrieval_callback, [schd_ctx])
def get_suite_jobs_auths(
        self, suite_name: str,
        cycle_name_tuples: Tuple[Any] = None) -> List[str]:
    """Get hosts of jobs from a Cylc workflow database.

    Args:
        suite_name: Name of the workflow whose task jobs are inspected.
        cycle_name_tuples: ``(cycle, name)`` pairs.  Only the cycle of
            each pair is used for the lookup.  ``None`` or an empty
            collection means there is nothing to look up.

    Returns:
        De-duplicated list of hostname strings, in no particular order.
        Empty when ``cycle_name_tuples`` is ``None`` or empty; in that
        case Cylc is not imported at all.
    """
    if not cycle_name_tuples:
        return []

    # n.b. Imports inside function to avoid dependency on Cylc and
    # Cylc-Rose if Rose is being used with a different workflow engine.
    from cylc.flow.platforms import get_host_from_platform
    from cylc.rose.platform_utils import get_platforms_from_task_jobs

    # The lookup depends only on the cycle, so do it once per distinct
    # cycle instead of once per (cycle, name) pair.
    task_platforms = {}
    for cycle, _ in cycle_name_tuples:
        if cycle not in task_platforms:
            task_platforms[cycle] = get_platforms_from_task_jobs(
                suite_name, cycle)

    # For each platform get a host.
    hosts = [
        get_host_from_platform(platform)
        for tasks in task_platforms.values()
        for platform in tasks.values()
    ]
    return list(set(hosts))
def main(_, options: 'Values', *ids) -> None:
    """Report the Cylc version found on each task platform of a workflow.

    Runs ``version`` over ssh on every platform referenced by the workflow
    (other than ``None`` and ``localhost``), prints a table of the results
    and exits.  The exit code is 1 only when some platform does not report
    ``CYLC_VERSION`` and ``options.error`` is set; otherwise it is 0.
    """
    workflow_id, _, flow_file = parse_id(
        *ids,
        src=True,
        constraint='workflows',
    )

    # extract task host platforms from the workflow_id
    template_vars = load_template_vars(
        options.templatevars, options.templatevars_file)
    config = WorkflowConfig(workflow_id, flow_file, options, template_vars)
    task_platforms = {
        config.get_config(['runtime', name, 'platform'])
        for name in config.get_namespace_list('all tasks')
    }
    # When "workflow run hosts" are formalised as "flow platforms"
    # we can substitute `localhost` for this, in the mean time
    # we will have to assume that flow hosts are configured correctly.
    platforms = task_platforms - {None, 'localhost'}
    if not platforms:
        sys.exit(0)

    verbose = cylc.flow.flags.verbosity > 0

    # get the cylc version on each platform
    versions = {}
    for platform_name in sorted(platforms):
        platform = get_platform(platform_name)
        host = get_host_from_platform(platform, bad_hosts=None)
        cmd = construct_ssh_cmd(['version'], platform, host)
        if verbose:
            print(cmd)
        proc = procopen(cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE)
        raw_out, raw_err = proc.communicate()
        out = raw_out.decode()
        err = raw_err.decode()
        if proc.wait() != 0:
            versions[platform_name] = f'ERROR: {err.strip()}'
            continue
        if verbose:
            print(" %s" % out)
        versions[platform_name] = out.strip()

    # report results
    max_len = max(map(len, platforms))
    header = "platform".rjust(max_len)
    print(f'{header}: cylc version')
    print('-' * (max_len + 14))
    for platform_name, result in versions.items():
        print(f'{platform_name.rjust(max_len)}: {result}')

    all_match = all(
        version == CYLC_VERSION for version in versions.values())
    sys.exit(1 if (not all_match and options.error) else 0)
def construct_remote_tidy_ssh_cmd(install_target, platform):
    """Build the ssh-wrapped ``remote-tidy`` command for a platform.

    NOTE(review): ``self`` is not a parameter; it is resolved from the
    scope this function is defined in (presumably an enclosing method).

    Returns:
        Tuple of (the ssh command, the host it was constructed for).
    """
    debug_opts = ['--debug'] if cylc.flow.flags.verbosity > 1 else []
    tidy_cmd = [
        'remote-tidy',
        *debug_opts,
        install_target,
        get_remote_workflow_run_dir(self.workflow),
    ]
    host = get_host_from_platform(platform, bad_hosts=self.bad_hosts)
    ssh_cmd = construct_ssh_cmd(tidy_cmd, platform, host, timeout='10s')
    return ssh_cmd, host
def insert_db_job(self, row_idx, row):
    """Rebuild a job element from a run database row after a restart.

    Rows whose status is not in ``JOB_STATUS_SET`` are ignored.  Any
    failure while building the element is logged and the row is dropped;
    on success the job is recorded in ``self.added`` and
    ``self.task_jobs`` and ``self.updates_pending`` is set.

    Args:
        row_idx: Index of the row; 0 triggers the "LOADING" log line.
        row: The ten job fields, unpacked in the order shown below.
    """
    if row_idx == 0:
        LOG.info("LOADING job data")
    (
        point_string, name, status, submit_num,
        time_submit, time_run, time_run_exit,
        batch_sys_name, batch_sys_job_id, platform_name,
    ) = row
    if status not in JOB_STATUS_SET:
        return

    task_id = f'{self.workflow_id}{ID_DELIM}{point_string}{ID_DELIM}{name}'
    job_id = f'{task_id}{ID_DELIM}{submit_num}'
    try:
        tdef = self.schd.config.get_taskdef(name)
        job_owner = self.schd.owner
        # Jobs without a recorded platform are attributed to the
        # scheduler's own host.
        job_host = (
            get_host_from_platform(get_platform(platform_name))
            if platform_name
            else self.schd.host
        )
        job_buf = PbJob(
            stamp=f'{job_id}@{time()}',
            id=job_id,
            submit_num=submit_num,
            state=status,
            task_proxy=task_id,
            submitted_time=time_submit,
            started_time=time_run,
            finished_time=time_run_exit,
            batch_sys_name=batch_sys_name,
            batch_sys_job_id=batch_sys_job_id,
            host=job_host,
            owner=job_owner,
            name=name,
            cycle_point=point_string,
        )

        # Add in log files.
        job_buf.job_log_dir = get_task_job_log(
            self.schd.suite, point_string, name, submit_num)
        broadcast_mgr = self.schd.task_events_mgr.broadcast_mgr
        overrides = broadcast_mgr.get_broadcast(
            TaskID.get(name, point_string))
        if overrides:
            # Apply broadcast overrides to a copy of the task's runtime
            # config rather than to the definition itself.
            rtconfig = pdeepcopy(tdef.rtconfig)
            poverride(rtconfig, overrides, prepend=True)
        else:
            rtconfig = tdef.rtconfig
        extra_logs = [
            os.path.expanduser(os.path.expandvars(log_file))
            for log_file in rtconfig['extra log files']
        ]
        job_buf.extra_logs.extend(extra_logs)
    except SuiteConfigError:
        LOG.exception(
            ('ignoring job %s from the suite run database\n'
             '(its task definition has probably been deleted).') % job_id)
    except Exception:
        LOG.exception('could not load job %s' % job_id)
    else:
        self.added[job_id] = job_buf
        self.task_jobs.setdefault(task_id, set()).add(job_id)
        self.updates_pending = True
def construct_remote_tidy_ssh_cmd(
        platform: Dict[str, Any]) -> Tuple[List[str], str]:
    """Build the ssh-wrapped ``remote-tidy`` command for a platform.

    NOTE(review): ``self`` is not a parameter; it is resolved from the
    scope this function is defined in (presumably an enclosing method).

    Returns:
        Tuple of (the ssh command, the host it was constructed for).
    """
    tidy_cmd = [
        'remote-tidy',
        *verbosity_to_opts(cylc.flow.flags.verbosity),
        get_install_target_from_platform(platform),
        get_remote_workflow_run_dir(self.workflow),
    ]
    host = get_host_from_platform(platform, bad_hosts=self.bad_hosts)
    ssh_cmd = construct_ssh_cmd(tidy_cmd, platform, host, timeout='10s')
    return ssh_cmd, host
def remote_tidy(self):
    """Remove suite contact files and keys from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    A warning is logged for every tidy command that exits non-zero.  The
    captured output is decoded to text both for commands that finish in
    time and for those that had to be terminated.
    """
    def _warn_on_failure(host, owner, cmd, proc, out, err):
        """Log a warning if a finished tidy command exited non-zero."""
        if proc.wait():
            LOG.warning(TaskRemoteMgmtError(
                TaskRemoteMgmtError.MSG_TIDY,
                (host, owner), ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err))

    # Issue all SSH commands in parallel
    procs = {}
    for platform, init_with_contact in self.remote_init_map.items():
        platform = get_platform(platform)
        host = get_host_from_platform(platform)
        owner = platform['owner']
        self.install_target = get_install_target_from_platform(platform)
        if init_with_contact != REMOTE_INIT_DONE:
            continue
        cmd = ['remote-tidy']
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        cmd.append(str(f'{self.install_target}'))
        cmd.append(get_remote_suite_run_dir(platform, self.suite))
        if is_remote_platform(platform):
            cmd = construct_platform_ssh_cmd(cmd, platform, timeout='10s')
        else:
            cmd = ['cylc'] + cmd
        procs[(host, owner)] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=DEVNULL))

    # Wait for commands to complete for a max of 10 seconds
    # NOTE(review): this loop polls continuously without sleeping.
    timeout = time() + 10.0
    while procs and time() < timeout:
        # Iterate over a copy: finished entries are deleted from procs.
        for (host, owner), (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[(host, owner)]
            out, err = (f.decode() for f in proc.communicate())
            _warn_on_failure(host, owner, cmd, proc, out, err)

    # Terminate any remaining commands
    for (host, owner), (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        # Decode here too, so warnings carry text rather than bytes.
        out, err = (f.decode() for f in proc.communicate())
        _warn_on_failure(host, owner, cmd, proc, out, err)
def remote_cylc_cmd(cmd, platform, **kwargs):
    """Run a Cylc command on a remote platform.

    The host, ssh command, Cylc path and login-shell setting are taken
    from the platform configuration; everything else is passed through.

    See _construct_ssh_cmd for argument documentation.
    """
    host = get_host_from_platform(platform)
    ssh_cmd = platform['ssh command']
    remote_cylc_path = platform['cylc path']
    ssh_login_shell = platform['use login shell']
    return _remote_cylc_cmd(
        cmd,
        host=host,
        ssh_cmd=ssh_cmd,
        remote_cylc_path=remote_cylc_path,
        ssh_login_shell=ssh_login_shell,
        **kwargs
    )
def construct_ssh_cmd(raw_cmd, platform, **kwargs):
    """Construct an SSH command to run ``raw_cmd`` on a remote platform.

    The host, ssh command, Cylc path and login-shell setting are taken
    from the platform configuration; everything else is passed through.

    See _construct_ssh_cmd for argument documentation.
    """
    host = get_host_from_platform(platform)
    ssh_cmd = platform['ssh command']
    remote_cylc_path = platform['cylc path']
    ssh_login_shell = platform['use login shell']
    return _construct_ssh_cmd(
        raw_cmd,
        host=host,
        ssh_cmd=ssh_cmd,
        remote_cylc_path=remote_cylc_path,
        ssh_login_shell=ssh_login_shell,
        **kwargs
    )
def construct_platform_ssh_cmd(raw_cmd, platform, **kwargs):
    """Call `construct_ssh_cmd` with settings read from a platform.

    Lets callers hand over a platform object instead of spelling out the
    host and ssh settings themselves.

    Args:
        All as `construct_ssh_cmd` except for user and host.
    """
    host = get_host_from_platform(platform)
    ssh_cmd = platform['ssh command']
    ssh_cylc = platform['cylc executable']
    ssh_login_shell = platform['use login shell']
    return construct_ssh_cmd(
        raw_cmd,
        host=host,
        ssh_cmd=ssh_cmd,
        ssh_cylc=ssh_cylc,
        ssh_login_shell=ssh_login_shell,
        **kwargs
    )
def construct_rsync_over_ssh_cmd(
    src_path: str,
    dst_path: str,
    platform: Dict[str, Any],
    rsync_includes=None,
    bad_hosts=None
) -> Tuple[List[str], str]:
    """Build the rsync command used for remote file installation.

    The directories app, bin, etc and lib are always included, as is the
    server key used for ZMQ authentication.

    Args:
        src_path: source path
        dst_path: path of target
        platform: contains info relating to platform
        rsync_includes: files and directories to be included in the rsync
        bad_hosts: passed on to the host selection for the platform

    Returns:
        Tuple of (the rsync command, the destination host).

    Developer Warning:
        The Cylc Subprocess Pool method ``rsync_255_fail`` relies on
        ``rsync_cmd[0] == 'rsync'``. Please check that changes to this
        function do not break ``rsync_255_fail``.
    """
    dst_host = get_host_from_platform(platform, bad_hosts=bad_hosts)
    ssh_cmd = platform['ssh command']
    rsync_cmd = shlex.split(platform['rsync command'])
    # Note to future devs - be wary of changing the order of the following
    # rsync options, rsync is very particular about order of in/ex-cludes.
    rsync_cmd.extend(
        [
            "--delete",
            "--rsh=" + ssh_cmd,
            "--include=/.service/",
            "--include=/.service/server.key",
        ] + DEFAULT_RSYNC_OPTS
    )
    rsync_cmd.extend(
        f"--exclude={name}" for name in ['log', 'share', 'work'])
    rsync_cmd.extend(
        f"--include={pattern}"
        for pattern in ['/app/***', '/bin/***', '/etc/***', '/lib/***'])
    rsync_cmd.extend(
        f"--include={pattern}"
        for pattern in get_includes_to_rsync(rsync_includes))
    rsync_cmd += [
        "--exclude=*",  # exclude everything else
        f"{src_path}/",
        f"{dst_host}:{dst_path}/",
    ]
    return rsync_cmd, dst_host
def construct_rsync_over_ssh_cmd(
        src_path, dst_path, platform, rsync_includes=None):
    """Build the rsync command used for remote file installation.

    The directories app, bin, etc and lib are always included, as is the
    server key used for ZMQ authentication.

    Args:
        src_path(string): source path
        dst_path(string): path of target
        platform(dict): contains info relating to platform
        rsync_includes(list): files and directories to be included in the
            rsync

    Returns:
        list: the rsync command.
    """
    dst_host = get_host_from_platform(platform)
    ssh_cmd = platform['ssh command']
    # Note to future devs - be wary of changing the order of the following
    # rsync options, rsync is very particular about order of in/ex-cludes.
    base_cmd = [
        "rsync",
        "--delete",
        "--rsh=" + ssh_cmd,
        "--include=/.service/",
        "--include=/.service/server.key",
    ]
    rsync_cmd = base_cmd + DEFAULT_RSYNC_OPTS
    rsync_cmd.extend(
        f"--exclude={name}" for name in ['log', 'share', 'work'])
    rsync_cmd.extend(
        f"--include={pattern}"
        for pattern in ['/app/***', '/bin/***', '/etc/***', '/lib/***'])
    rsync_cmd.extend(
        f"--include={pattern}"
        for pattern in get_includes_to_rsync(rsync_includes))
    rsync_cmd += [
        "--exclude=*",  # exclude everything else
        f"{src_path}/",
        f"{dst_host}:{dst_path}/",
    ]
    return rsync_cmd
def get_task_auth(self, suite_name: str, task_name: str) -> Union[str, None]:
    """Look up the host of a remote task in a Cylc workflow definition.

    Returns:
        Hostname, or None if:
            - task does not run remotely.
            - task has not been defined.
            - cylc-rose is not installed(*)
        ``'localhost'`` is returned when the platform lookup yields None.

        (*) This function is only used by the fcm_make built-in app.
            Returning None is equivalent to there being no fcm_make2
            task found or no workflow file found which is fine - 2 stage
            fcm_make is only supported on the localhost install target
            (the workflow files aren't mirrored).

    Raises:
        WorkflowFileNotFoundError: If the platform lookup raises
            WorkflowFilesError.
    """
    # n.b. Imports are function-scoped to avoid a dependency on Cylc and
    # Cylc-Rose if Rose is being used with a different workflow engine.
    from cylc.flow.exceptions import WorkflowFilesError
    from cylc.flow.hostuserutil import is_remote_platform
    from cylc.flow.platforms import get_host_from_platform
    try:
        from cylc.rose.platform_utils import get_platform_from_task_def
    except ModuleNotFoundError:
        # Allow single stage fcm_make app to work without requiring
        # cylc.rose
        return None

    try:
        platform = get_platform_from_task_def(suite_name, task_name)
    except KeyError:
        return None
    except (WorkflowFilesError):
        raise WorkflowFileNotFoundError

    if platform is None:
        return 'localhost'
    if not is_remote_platform(platform):
        return None
    # The task is defined and remote: return a host.
    return get_host_from_platform(platform)
def test_get_host_from_platform_fails_no_goodhosts():
    """Host selection raises NoHostsError when all given hosts are bad."""
    bad_hosts = {'nellie', 'dumbo', 'jumbo'}
    expected = (
        'cylc.flow.exceptions.NoHostsError: '
        'Unable to find valid host for Elephant')
    with pytest.raises(NoHostsError) as err:
        get_host_from_platform(TEST_PLATFORM, bad_hosts)
    assert err.exconly() == expected
def submit_task_jobs(self, suite, itasks, curve_auth,
                     client_pub_key_dir, is_simulation=False):
    """Prepare for job submission and submit task jobs.

    Preparation (host selection, remote host init, and remote install)
    is done asynchronously. Newly released tasks may be sent here several
    times until these init subprocesses have returned. Failure during
    preparation is considered to be job submission failure.

    Once preparation has completed or failed, reset .waiting_on_job_prep in
    task instances so the scheduler knows to stop sending them back here.

    This method uses prep_submit_task_job() as helper.

    Args:
        suite: Name of the suite the tasks belong to.
        itasks: Task proxies to submit.
        curve_auth: Passed on to the remote initialisation.
        client_pub_key_dir: Passed on to the remote initialisation.
        is_simulation: If true, hand the tasks straight to the simulation
            mode submission and return its result.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.subshell_eval_reset()

    if not prepared_tasks:
        return bad_tasks

    # Group the prepared tasks by platform name.
    auth_itasks = {}  # {platform: [itask, ...], ...}
    for itask in prepared_tasks:
        platform_name = itask.platform['name']
        auth_itasks.setdefault(platform_name, [])
        auth_itasks[platform_name].append(itask)

    # Submit task jobs for each platform
    done_tasks = bad_tasks
    for _, itasks in sorted(auth_itasks.items()):
        platform = itasks[0].platform
        install_target = get_install_target_from_platform(platform)
        ri_map = self.task_remote_mgr.remote_init_map

        if (ri_map.get(install_target) != REMOTE_FILE_INSTALL_DONE):
            if install_target == get_localhost_install_target():
                # Skip init and file install for localhost.
                LOG.debug(f"REMOTE INIT NOT REQUIRED for {install_target}")
                ri_map[install_target] = (REMOTE_FILE_INSTALL_DONE)

            elif install_target not in ri_map:
                # Remote init not in progress for target, so start it.
                self.task_remote_mgr.remote_init(
                    platform, curve_auth, client_pub_key_dir)
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point,
                            itask.tdef.name,
                            itask.submit_num),
                        self.REMOTE_INIT_MSG)
                continue

            elif (ri_map[install_target] == REMOTE_INIT_DONE):
                # Already done remote init so move on to file install
                self.task_remote_mgr.file_install(platform)
                continue

            elif (ri_map[install_target] in self.IN_PROGRESS.keys()):
                # Remote init or file install in progress.
                for itask in itasks:
                    msg = self.IN_PROGRESS[ri_map[install_target]]
                    itask.set_summary_message(msg)
                    self.data_store_mgr.delta_job_msg(
                        get_task_job_id(
                            itask.point,
                            itask.tdef.name,
                            itask.submit_num),
                        msg)
                continue

        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        host = get_host_from_platform(platform)
        # Read the job runner from a task of THIS platform group. A loop
        # variable left over from an earlier loop may refer to a task that
        # belongs to a different platform.
        job_runner_name = itasks[0].summary['job_runner_name']
        if (
            self.job_runner_mgr.is_job_local_to_host(job_runner_name)
            and not is_remote_platform(platform)
        ):
            host = get_host()

        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, host=%s',
                itask, itask.submit_num, host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'platform_name': itask.platform['name'],
                'job_runner_name': itask.summary['job_runner_name'],
            })
            itask.is_manual_submit = False

        if (ri_map[install_target] in [REMOTE_INIT_FAILED,
                                       REMOTE_FILE_INSTALL_FAILED]):
            # Remote init or install failed. Set submit-failed for all
            # affected tasks and remove target from remote init map
            # - this enables new tasks to re-initialise that target
            init_error = (ri_map[install_target])
            del ri_map[install_target]
            for itask in itasks:
                itask.waiting_on_job_prep = False
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % host,
                        err=init_error,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self._prep_submit_task_job_error(
                    suite, itask, '(remote init)', '')
            continue

        # Build the "cylc jobs-submit" command
        # (itask is the last task of this group, from the loop above.)
        cmd = [self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        if is_remote_platform(itask.platform):
            remote_mode = True
            cmd.append('--remote-mode')
        else:
            remote_mode = False
        if itask.platform['clean job submission environment']:
            cmd.append('--clean-env')
        for var in itask.platform[
                'job submission environment pass-through']:
            cmd.append(f"--env={var}")
        for path in itask.platform[
                'job submission executable paths'] + SYSPATH:
            cmd.append(f"--path={path}")
        cmd.append('--')
        cmd.append(get_remote_suite_run_job_dir(platform, suite))

        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = (
            len(itasks) // (
                (len(itasks) // platform['max batch submit size']) + 1
            ) + 1
        )
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)
        ]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])

        if remote_mode:
            cmd = construct_ssh_cmd(cmd, platform)
        else:
            cmd = ['cylc'] + cmd

        for itasks_batch in itasks_batches:
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        os.path.expandvars(
                            get_task_job_job_log(
                                suite,
                                itask.point,
                                itask.tdef.name,
                                itask.submit_num)))
                job_log_dirs.append(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
                itask.waiting_on_job_prep = False
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                ),
                self._submit_task_jobs_callback,
                [suite, itasks_batch])
    return done_tasks
def test_get_host_from_platform(badhosts, expect):
    """Host selection on TEST_PLATFORM yields ``expect`` given bad hosts."""
    selected = get_host_from_platform(TEST_PLATFORM, badhosts)
    assert selected == expect
def remote_init(
    self,
    platform: Dict[str, Any],
    curve_auth: 'ThreadAuthenticator',
    client_pub_key_dir: str
) -> None:
    """Initialise a remote host if necessary.

    Call "cylc remote-init" to install workflow items to remote:
        ".service/contact": For TCP task communication
        "python/": if source exists

    The outcome is recorded in ``self.remote_init_map`` under the
    platform's install target. If no host can be selected for the
    platform, the target is marked as failed, the platform's hosts are
    removed from ``self.bad_hosts`` and no command is queued.

    Args:
        platform: A dict containing settings relating to platform used in
            this remote installation.
        curve_auth: The ZMQ authenticator.
        client_pub_key_dir: Client public key directory, used by the
            ZMQ authenticator.
    """
    install_target = platform['install target']
    if install_target == get_localhost_install_target():
        self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_DONE
        return

    # Set status of install target to in progress while waiting for remote
    # initialisation to finish
    self.remote_init_map[install_target] = REMOTE_INIT_IN_PROGRESS

    # Determine what items to install
    comms_meth: CommsMeth = CommsMeth(platform['communication method'])
    items = self._remote_init_items(comms_meth)

    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = self.proc_pool.get_temporary_file()
    # The context manager closes the archive even if adding an item fails.
    with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
    tmphandle.seek(0)

    # Build the remote-init command to be run over ssh
    cmd = ['remote-init']
    cmd.extend(verbosity_to_opts(cylc.flow.flags.verbosity))
    cmd.append(str(install_target))
    cmd.append(get_remote_workflow_run_dir(self.workflow))
    dirs_to_symlink = get_dirs_to_symlink(install_target, self.workflow)
    for key, value in dirs_to_symlink.items():
        if value is not None:
            cmd.append(f"{key}={quote(value)} ")

    # Create the ssh command
    try:
        host = get_host_from_platform(platform, bad_hosts=self.bad_hosts)
    except NoHostsError as exc:
        # Nothing will consume the archive now, so release it here.
        tmphandle.close()
        LOG.error(
            PlatformError(
                f'{PlatformError.MSG_INIT}\n{exc}',
                platform['name'],
            ))
        self.remote_init_map[install_target] = REMOTE_INIT_FAILED
        self.bad_hosts -= set(platform['hosts'])
        self.ready = True
    else:
        log_platform_event('remote init', platform, host)
        cmd = construct_ssh_cmd(cmd, platform, host)
        self.proc_pool.put_command(
            SubProcContext(
                'remote-init',
                cmd,
                stdin_files=[tmphandle],
                host=host),
            bad_hosts=self.bad_hosts,
            callback=self._remote_init_callback,
            callback_args=[
                platform, tmphandle, curve_auth, client_pub_key_dir
            ],
            callback_255=self._remote_init_callback_255,
            callback_255_args=[platform])
def remote_init(self, platform, curve_auth, client_pub_key_dir):
    """Initialise a remote [owner@]host if necessary.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": For TCP task communication
        "python/": if source exists

    Args:
        curve_auth (ThreadAuthenticator):
            The ZMQ authenticator.
        client_pub_key_dir (str):
            Client public key directory, used by the ZMQ authenticator.
        platform (dict):
            A dictionary containing settings relating to platform used in
            this remote installation.

    Return:
        REMOTE_INIT_NOT_REQUIRED:
            If remote init is not required, e.g. not remote
        REMOTE_INIT_DONE:
            If remote init done.
        REMOTE_INIT_FAILED:
            If init of the remote failed.
            Note: this will reset to None to allow retry.
        None:
            If waiting for remote init command to complete

    """
    self.install_target = platform['install target']

    # If task is running locally or the install target is localhost
    # we can skip the rest of this function
    if (
        self.install_target == 'localhost'
        or self.single_task_mode
        or not is_remote_host(get_host_from_platform(platform))
    ):
        LOG.debug(f"REMOTE INIT NOT REQUIRED for {self.install_target}")
        return REMOTE_INIT_NOT_REQUIRED

    # See if a previous attempt to initialize this platform has occurred.
    try:
        status = self.remote_init_map[platform['install target']]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            # Forget the failure so that the next call tries again.
            del self.remote_init_map[platform['install target']]
        # Any recorded status (done, failed or still waiting) is reported
        # as-is; a new remote-init is only issued for unknown targets.
        return status

    # Determine what items to install
    comm_meth = platform['communication method']

    # Get a list of files and folders to install.
    items = self._remote_init_items(comm_meth)

    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = self.proc_pool.get_temporary_file()
    # The context manager closes the archive even if adding an item fails.
    with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
    tmphandle.seek(0)

    # Build the remote-init command to be run over ssh
    cmd = ['remote-init']
    if cylc.flow.flags.debug:
        cmd.append('--debug')
    if comm_meth in ['ssh']:
        cmd.append('--indirect-comm=%s' % comm_meth)
    cmd.append(str(self.install_target))
    cmd.append(get_remote_suite_run_dir(platform, self.suite))

    # Create the ssh command
    cmd = construct_platform_ssh_cmd(cmd, platform)

    self.proc_pool.put_command(
        SubProcContext(
            'remote-init', cmd, stdin_files=[tmphandle]),
        self._remote_init_callback,
        [platform, tmphandle, curve_auth, client_pub_key_dir])
    # None status: Waiting for command to finish
    self.remote_init_map[platform['install target']] = None
    return self.remote_init_map[platform['install target']]
def main(parser, options, *args, color=False):
    """Implement cylc cat-log CLI.

    Determine log path, user@host, batchview_cmd, and action (print, dir-list,
    cat, edit, or tail), and then if the log path is:
      a) local: perform action on log path, or
      b) remote: re-invoke cylc cat-log as a) on the remote account

    Args:
        parser: Option parser; only used to report illegal task IDs and
            submit numbers via ``parser.error``.
        options: Parsed options. Reads remote_args, mode, filename,
            rotation_num, submit_num, force_remote and geditor.
        *args: The suite name, optionally followed by a task ID.
        color: Passed through to ``view_log``.

    The process exits through ``sys.exit`` on several paths, e.g. when
    ``view_log`` returns 1, or after viewing a local job log in any mode
    other than edit.
    """
    if options.remote_args:
        # Invoked on job hosts for job logs only, as a wrapper to view_log().
        # Tail and batchview commands come from global config on suite host).
        logpath, mode, tail_tmpl = options.remote_args[0:3]
        logpath = os.path.expandvars(logpath)
        tail_tmpl = os.path.expandvars(tail_tmpl)
        try:
            batchview_cmd = options.remote_args[3]
        except IndexError:
            # The fourth remote argument is optional.
            batchview_cmd = None
        res = view_log(logpath, mode, tail_tmpl, batchview_cmd, remote=True,
                       color=color)
        if res == 1:
            sys.exit(res)
        return

    suite_name = args[0]
    # Get long-format mode.
    try:
        mode = MODES[options.mode]
    except KeyError:
        # Not a key of MODES: use the given mode unchanged.
        mode = options.mode

    if len(args) == 1:
        # Cat suite logs, local only.
        if options.filename is not None:
            raise UserInputError("The '-f' option is for job logs only.")

        logpath = get_suite_run_log_name(suite_name)
        if options.rotation_num:
            # Pick the Nth rotated log, counting from the most recently
            # modified one.
            logs = glob('%s.*' % logpath)
            logs.sort(key=os.path.getmtime, reverse=True)
            try:
                logpath = logs[int(options.rotation_num)]
            except IndexError:
                raise UserInputError("max rotation %d" % (len(logs) - 1))
        tail_tmpl = os.path.expandvars(get_platform()["tail command template"])
        out = view_log(logpath, mode, tail_tmpl, color=color)
        if out == 1:
            sys.exit(1)
        if mode == 'edit':
            tmpfile_edit(out, options.geditor)
        return

    if len(args) == 2:
        # Cat task job logs, may be on suite or job host.
        if options.rotation_num is not None:
            raise UserInputError("only suite (not job) logs get rotated")
        task_id = args[1]
        try:
            task, point = TaskID.split(task_id)
        except ValueError:
            parser.error("Illegal task ID: %s" % task_id)
        if options.submit_num != NN:
            # Normalise an explicit submit number to two digits.
            try:
                options.submit_num = "%02d" % int(options.submit_num)
            except ValueError:
                parser.error("Illegal submit number: %s" % options.submit_num)
        if options.filename is None:
            options.filename = JOB_LOG_OUT
        else:
            # Convert short filename args to long (e.g. 'o' to 'job.out').
            try:
                options.filename = JOB_LOG_OPTS[options.filename]
            except KeyError:
                # Is already long form (standard log, or custom).
                pass
        platform_name, batch_sys_name, live_job_id = get_task_job_attrs(
            suite_name, point, task, options.submit_num)
        platform = get_platform(platform_name)
        batchview_cmd = None
        if live_job_id is not None:
            # Job is currently running. Get special batch system log view
            # command (e.g. qcat) if one exists, and the log is out or err.
            conf_key = None
            if options.filename == JOB_LOG_OUT:
                if mode == 'cat':
                    conf_key = "out viewer"
                elif mode == 'tail':
                    conf_key = "out tailer"
            elif options.filename == JOB_LOG_ERR:
                if mode == 'cat':
                    conf_key = "err viewer"
                elif mode == 'tail':
                    conf_key = "err tailer"
            if conf_key is not None:
                batchview_cmd_tmpl = None
                try:
                    batchview_cmd_tmpl = platform[conf_key]
                except KeyError:
                    # The platform has no such viewer/tailer setting.
                    pass
                if batchview_cmd_tmpl is not None:
                    batchview_cmd = batchview_cmd_tmpl % {
                        "job_id": str(live_job_id)
                    }

        # The job activity log is never treated as remote.
        log_is_remote = (is_remote_platform(platform)
                         and (options.filename != JOB_LOG_ACTIVITY))
        # Logs count as retrieved only once the job is no longer live.
        log_is_retrieved = (platform['retrieve job logs']
                            and live_job_id is None)
        if log_is_remote and (not log_is_retrieved or options.force_remote):
            logpath = os.path.normpath(
                get_remote_suite_run_job_dir(
                    platform, suite_name, point, task, options.submit_num,
                    options.filename))
            tail_tmpl = platform["tail command template"]
            # Reinvoke the cat-log command on the remote account.
            cmd = ['cat-log']
            if cylc.flow.flags.debug:
                cmd.append('--debug')
            for item in [logpath, mode, tail_tmpl]:
                cmd.append('--remote-arg=%s' % quote(item))
            if batchview_cmd:
                cmd.append('--remote-arg=%s' % quote(batchview_cmd))
            cmd.append(suite_name)
            is_edit_mode = (mode == 'edit')
            try:
                host = get_host_from_platform(platform)
                proc = remote_cylc_cmd(
                    cmd, host, capture_process=is_edit_mode,
                    manage=(mode == 'tail'))
            except KeyboardInterrupt:
                # Ctrl-C while tailing.
                # NOTE(review): if this fires in edit mode, `out` is never
                # assigned before the final tmpfile_edit(out, ...) call.
                pass
            else:
                if is_edit_mode:
                    # Write remote stdout to a temp file for viewing in editor.
                    # Only BUFSIZE bytes at a time in case huge stdout volume.
                    out = NamedTemporaryFile()
                    data = proc.stdout.read(BUFSIZE)
                    while data:
                        out.write(data)
                        data = proc.stdout.read(BUFSIZE)
                    os.chmod(out.name, S_IRUSR)
                    out.seek(0, 0)
        else:
            # Local task job or local job log.
            logpath = os.path.normpath(
                get_suite_run_job_dir(
                    suite_name, point, task, options.submit_num,
                    options.filename))
            tail_tmpl = os.path.expandvars(platform["tail command template"])
            out = view_log(logpath, mode, tail_tmpl, batchview_cmd,
                           color=color)
            if mode != 'edit':
                # Exit with the result returned by view_log.
                sys.exit(out)
        if mode == 'edit':
            tmpfile_edit(out, options.geditor)
def submit_task_jobs(self, suite, itasks, curve_auth,
                     client_pub_key_dir, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file
    or bad remote initialisation will cause a bad task - leading to
    submission failure.

    This method uses prep_submit_task_jobs() as helper.

    Args:
        suite: Suite name; forwarded to the preparation helper, the job
            log/run directory helpers and the submission callback.
        itasks: Task proxies to submit.
        curve_auth: Passed unchanged to ``task_remote_mgr.remote_init``.
        client_pub_key_dir: Passed unchanged to
            ``task_remote_mgr.remote_init``.
        is_simulation: If true, delegate entirely to
            ``_simulation_submit_task_jobs`` and return its result.

    Return (list): list of tasks that attempted submission. This is the
        "bad tasks" list returned by prep_submit_task_jobs(), extended
        in place with every task whose remote was not still waiting to
        be initialised.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.subshell_eval_reset()

    if not prepared_tasks:
        return bad_tasks

    # Group task jobs by install target
    auth_itasks = {}  # {install target: [itask, ...], ...}
    for itask in prepared_tasks:
        install_target = get_install_target_from_platform(itask.platform)
        auth_itasks.setdefault(install_target, []).append(itask)

    # Submit task jobs for each install target.
    # n.b. done_tasks deliberately aliases bad_tasks (extended in place).
    done_tasks = bad_tasks
    for _install_target, group_itasks in sorted(auth_itasks.items()):
        # The first task of the group is the reference for the platform
        # (and batch system) used for the whole group. Read everything
        # group-wide from it rather than from a loop variable left over
        # from an earlier loop, which may belong to a different group.
        first_itask = group_itasks[0]
        platform = first_itask.platform
        is_init = self.task_remote_mgr.remote_init(
            platform, curve_auth, client_pub_key_dir)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in group_itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
                self.job_pool.add_job_msg(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num),
                    self.REMOTE_INIT_MSG)
            continue

        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        host = get_host_from_platform(platform)
        if (
            self.batch_sys_mgr.is_job_local_to_host(
                first_itask.summary['batch_sys_name'])
            and not is_remote_platform(platform)
        ):
            host = get_host()

        now_str = get_current_time_string()
        done_tasks.extend(group_itasks)
        for itask in group_itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, owner@host=%s',
                itask, itask.submit_num, host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'platform_name': platform['name'],
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False

        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in group_itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue

        # Build the "cylc jobs-submit" command
        cmd = [self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        # Decide remote mode from the same platform that the ssh command
        # and the job directory below are built from.
        remote_mode = bool(is_remote_platform(platform))
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(get_remote_suite_run_job_dir(platform, suite))

        # Chop the group into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes. The formula
        # yields near-equal batches of at most 100 tasks each.
        ordered_itasks = sorted(
            group_itasks, key=lambda itask: itask.identity)
        n_itasks = len(ordered_itasks)
        chunk_size = n_itasks // ((n_itasks // 100) + 1) + 1
        itasks_batches = [
            ordered_itasks[start:start + chunk_size]
            for start in range(0, n_itasks, chunk_size)
        ]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])

        if remote_mode:
            cmd = construct_platform_ssh_cmd(cmd, platform)
        else:
            cmd = ['cylc'] + cmd

        for itasks_batch in itasks_batches:
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    # Remote submission: the local job file is fed to the
                    # command via stdin.
                    stdin_files.append(
                        os.path.expandvars(
                            get_task_job_job_log(
                                suite,
                                itask.point,
                                itask.tdef.name,
                                itask.submit_num)))
                job_log_dirs.append(
                    get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks