def remote_tidy(self):
    """Remove suite contact files and keys from initialised remotes.

    Call "cylc remote-tidy" over SSH for every install target whose entry
    in ``self.remote_init_map`` is ``REMOTE_FILE_INSTALL_DONE``, skipping
    the localhost install target. All commands are started in parallel.

    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    A warning is logged for every command that exits with a non-zero
    return code; nothing is raised or returned.
    """
    # Imported locally so the block does not rely on the module importing
    # ``sleep`` alongside ``time``.
    from time import sleep

    def _warn_on_failure(platform_n, cmd, proc):
        """Collect a command's output and log a warning if it failed."""
        out, err = (f.decode() for f in proc.communicate())
        if proc.wait():
            LOG.warning(
                TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    platform_n,
                    ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))

    # Issue all SSH commands in parallel
    procs = {}
    for install_target, message in self.remote_init_map.items():
        if message != REMOTE_FILE_INSTALL_DONE:
            continue
        if install_target == get_localhost_install_target():
            continue
        platform = get_random_platform_for_install_target(install_target)
        platform_n = platform['name']
        cmd = ['remote-tidy']
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        cmd.append(install_target)
        cmd.append(get_remote_suite_run_dir(platform, self.suite))
        cmd = construct_ssh_cmd(cmd, platform, timeout='10s')
        LOG.debug(
            "Removing authentication keys and contact file "
            f"from remote: \"{install_target}\"")
        procs[platform_n] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=DEVNULL))

    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        # Iterate over a copy because finished entries are deleted
        for platform_n, (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[platform_n]
            _warn_on_failure(platform_n, cmd, proc)
        if procs:
            # Yield the CPU between polls instead of spinning flat out
            sleep(0.1)

    # Terminate any remaining commands
    for platform_n, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            # Best effort: the process may already have gone away
            pass
        _warn_on_failure(platform_n, cmd, proc)
def remote_tidy(self):
    """Remove suite contact files and keys from initialised remotes.

    Call "cylc remote-tidy" for every platform whose entry in
    ``self.remote_init_map`` is ``REMOTE_INIT_DONE``: wrapped in an SSH
    command when ``is_remote_platform`` reports the platform as remote,
    otherwise run as a local "cylc" command. All commands are started in
    parallel.

    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    A warning is logged for every command that exits with a non-zero
    return code; nothing is raised or returned.

    Side effect: ``self.install_target`` is overwritten for every platform
    visited, including those that are then skipped.
    """
    # Imported locally so the block does not rely on the module importing
    # ``sleep`` alongside ``time``.
    from time import sleep

    def _warn_on_failure(key, cmd, proc):
        """Collect a command's output and log a warning if it failed."""
        # Always decode, so the logged error carries text rather than bytes
        out, err = (f.decode() for f in proc.communicate())
        if proc.wait():
            LOG.warning(TaskRemoteMgmtError(
                TaskRemoteMgmtError.MSG_TIDY,
                key,
                ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err))

    # Issue all SSH commands in parallel
    procs = {}
    for platform_key, init_with_contact in self.remote_init_map.items():
        platform = get_platform(platform_key)
        host = get_host_from_platform(platform)
        owner = platform['owner']
        self.install_target = get_install_target_from_platform(platform)
        if init_with_contact != REMOTE_INIT_DONE:
            continue
        cmd = ['remote-tidy']
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        cmd.append(str(self.install_target))
        cmd.append(get_remote_suite_run_dir(platform, self.suite))
        if is_remote_platform(platform):
            cmd = construct_platform_ssh_cmd(cmd, platform, timeout='10s')
        else:
            cmd = ['cylc'] + cmd
        procs[(host, owner)] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=DEVNULL))

    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        # Iterate over a copy because finished entries are deleted
        for key, (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[key]
            _warn_on_failure(key, cmd, proc)
        if procs:
            # Yield the CPU between polls instead of spinning flat out
            sleep(0.1)

    # Terminate any remaining commands
    for key, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            # Best effort: the process may already have gone away
            pass
        _warn_on_failure(key, cmd, proc)
def remote_init(
    self,
    platform: Dict[str, Any],
    curve_auth: 'ThreadAuthenticator',
    client_pub_key_dir: str
) -> None:
    """Initialise a remote host if necessary.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": For TCP task communication
        "python/": if source exists

    If the platform's install target is the localhost install target, the
    target is recorded as ``REMOTE_FILE_INSTALL_DONE`` in
    ``self.remote_init_map`` and nothing is run. Otherwise the target is
    recorded as ``REMOTE_INIT_IN_PROGRESS`` and the command is queued on
    ``self.proc_pool`` with ``self._remote_init_callback`` as callback.

    Args:
        platform: A dict containing settings relating to platform used in
            this remote installation.
        curve_auth: The ZMQ authenticator.
        client_pub_key_dir: Client public key directory, used by the ZMQ
            authenticator.
    """
    install_target = platform['install target']
    if install_target == get_localhost_install_target():
        self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_DONE
        return

    # Set status of install target to in progress while waiting for remote
    # initialisation to finish
    self.remote_init_map[install_target] = REMOTE_INIT_IN_PROGRESS

    # Determine what items to install
    comm_meth = platform['communication method']
    items = self._remote_init_items(comm_meth)

    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = self.proc_pool.get_temporary_file()
    # The context manager closes the archive even if add() raises
    with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
    # Rewind so the archive is read from the start when used as stdin
    tmphandle.seek(0)

    # Build the remote-init command to be run over ssh
    cmd = ['remote-init']
    if cylc.flow.flags.debug:
        cmd.append('--debug')
    cmd.append(str(install_target))
    cmd.append(get_remote_suite_run_dir(platform, self.suite))
    dirs_to_symlink = get_dirs_to_symlink(install_target, self.suite)
    for key, value in dirs_to_symlink.items():
        if value is not None:
            # NOTE(review): the trailing space is kept exactly as the
            # original wrote it - confirm whether it is intentional.
            cmd.append(f"{key}={quote(value)} ")

    # Create the ssh command
    cmd = construct_ssh_cmd(cmd, platform)
    self.proc_pool.put_command(
        SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
        self._remote_init_callback,
        [platform, tmphandle, curve_auth, client_pub_key_dir])
def file_install(self, platform):
    """Install required files on the remote install target.

    Included by default in the file installation:
        Files:
            .service/server.key  (required for ZMQ authentication)
        Directories:
            app/
            bin/
            etc/
            lib/

    The platform's install target is recorded as
    ``REMOTE_FILE_INSTALL_IN_PROGRESS`` in ``self.remote_init_map`` and an
    rsync-over-ssh command is queued on ``self.proc_pool``, with
    ``self._file_install_callback`` as callback.

    Args:
        platform: Platform settings; the 'install target' key is read here.
    """
    install_target = platform['install target']
    self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_IN_PROGRESS
    src_path = get_workflow_run_dir(self.suite)
    dst_path = get_remote_suite_run_dir(platform, self.suite)
    ctx = SubProcContext(
        'file-install',
        construct_rsync_over_ssh_cmd(
            src_path, dst_path, platform, self.rsync_includes))
    LOG.debug(f"Begin file installation on {install_target}")
    self.proc_pool.put_command(
        ctx, self._file_install_callback, [install_target])
def _load_remote_item(self, item, reg, owner, host):
    """Fetch a service item's content from [owner@]host over SSH.

    Returns None when the target is not remote, when the SSH command
    cannot be started, when it exits non-zero, or when no content follows
    the marker line in its output. The contact file of a suite on the
    local host is first looked up through the local filesystem.
    """
    if not is_remote(host, owner):
        return
    host = 'localhost' if host is None else host
    owner = get_user() if owner is None else owner

    if item == self.FILE_BASE_CONTACT and not is_remote_host(host):
        # Same host, other account: try the filesystem before using ssh.
        local_srv_dir = r'%(run_d)s/%(srv_base)s' % {
            'run_d': get_remote_suite_run_dir('localhost', owner, reg),
            'srv_base': self.DIR_BASE_SRV,
        }
        local_content = self._load_local_item(item, local_srv_dir)
        if local_content is not None:
            return local_content
        # Otherwise fall through to ssh into the suite account.

    # Marker line echoed before the file so unrelated STDOUT is ignored
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Remote script: echo the marker, then cat the requested item
    script_template = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    )
    script = script_template % {
        'prefix': prefix,
        'run_d': get_remote_suite_run_dir(host, owner, reg),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }

    import shlex
    ssh_command = glbl_cfg().get_host_item('ssh command', host, owner)
    command = shlex.split(ssh_command)
    command.extend(['-n', '@'.join((owner, host)), script])

    from subprocess import Popen, PIPE, DEVNULL  # nosec
    try:
        proc = Popen(
            command, stdin=DEVNULL, stdout=PIPE, stderr=PIPE)  # nosec
    except OSError:
        if cylc.flow.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = (stream.decode() for stream in proc.communicate())
    ret_code = proc.wait()

    # The wanted content is everything after the first line that matches
    # the marker; skip lines up to and including it, then join the rest.
    remaining = iter(out.splitlines(True))
    for line in remaining:
        if line.strip() == prefix:
            break
    content = ''.join(remaining)

    if content and not ret_code:
        return content
    LOG.debug(
        '$ %(command)s  # code=%(ret_code)s\n%(err)s',
        {
            'command': command,
            # STDOUT may contain passphrase, so not safe to print
            # 'out': out,
            'err': err,
            'ret_code': ret_code,
        })
    return
def remote_tidy(self):
    """Remove suite contact files from initialised remotes.

    Call "cylc remote-tidy" (wrapped in "timeout 10") for every
    ``(host, owner)`` whose entry in ``self.remote_init_map`` is
    ``REMOTE_INIT_DONE``. All commands are started in parallel.

    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    Also remove UUID file on suite host ".service/uuid".

    A warning is logged for every command that exits with a non-zero
    return code; nothing is raised or returned.
    """
    # Imported locally so the block does not rely on the module importing
    # ``sleep`` alongside ``time``.
    from time import sleep

    def _warn_on_failure(key, cmd, proc):
        """Collect a command's output and log a warning if it failed."""
        # Always decode, so the logged error carries text rather than bytes
        out, err = (f.decode() for f in proc.communicate())
        if proc.wait():
            LOG.warning(
                TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    key,
                    ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))

    # Remove UUID file
    uuid_fname = os.path.join(
        get_suite_srv_dir(self.suite), FILE_BASE_UUID)
    try:
        os.unlink(uuid_fname)
    except OSError:
        # Best effort: the file may never have been created
        pass

    # Issue all SSH commands in parallel
    procs = {}
    for (host, owner), init_with_contact in self.remote_init_map.items():
        if init_with_contact != REMOTE_INIT_DONE:
            continue
        cmd = ['timeout', '10', 'cylc', 'remote-tidy']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        cmd.append(get_remote_suite_run_dir(host, owner, self.suite))
        procs[(host, owner)] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=DEVNULL))

    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        # Iterate over a copy because finished entries are deleted
        for key, (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[key]
            _warn_on_failure(key, cmd, proc)
        if procs:
            # Yield the CPU between polls instead of spinning flat out
            sleep(0.1)

    # Terminate any remaining commands
    for key, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            # Best effort: the process may already have gone away
            pass
        _warn_on_failure(key, cmd, proc)
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to identify
    shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": For TCP task communication
        ".service/passphrase": For TCP task communication
        "python/": if source exists

    Args:
        host: Remote host name, passed to ``is_remote``/``is_remote_host``.
        owner: Remote user name, passed to ``is_remote``/``is_remote_user``.

    Return:
        REMOTE_INIT_NOT_REQUIRED:
            If remote init is not required, e.g. not remote, or there is
            nothing to install.
        REMOTE_INIT_DONE:
            If remote init done.
        REMOTE_INIT_FAILED:
            If init of the remote failed.
            Note: the recorded status is deleted so the next call retries.
        None:
            If waiting for remote init command to complete
    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            del self.remote_init_map[(host, owner)]  # reset to allow retry
        return status

    # Determine what items to install
    comm_meth = glbl_cfg().get_host_item(
        'task communication method', host, owner)
    owner_at_host = 'localhost'
    if host:
        owner_at_host = host
    if owner:
        owner_at_host = owner + '@' + owner_at_host
    LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
    items = self._remote_init_items(comm_meth)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]

    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = self.proc_pool.get_temporary_file()
    # The context manager closes the archive even if add() raises
    with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
    # Rewind so the archive is read from the start when used as stdin
    tmphandle.seek(0)

    # UUID file - for remote to identify shared file system with suite host
    uuid_fname = os.path.join(
        get_suite_srv_dir(self.suite), FILE_BASE_UUID)
    if not os.path.exists(uuid_fname):
        # Close the handle explicitly so the content is flushed to disk
        # before the remote command can look for it
        with open(uuid_fname, 'wb') as uuid_handle:
            uuid_handle.write(str(self.uuid_str).encode())

    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flow.flags.debug:
        cmd.append('--debug')
    if comm_meth in ['ssh']:
        cmd.append('--indirect-comm=%s' % comm_meth)
    cmd.append(str(self.uuid_str))
    cmd.append(get_remote_suite_run_dir(host, owner, self.suite))
    self.proc_pool.put_command(
        SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
        self._remote_init_callback,
        [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]
def write(self, local_job_file_path, job_conf, check_syntax=True):
    """Write each job script section in turn.

    The script is written to ``local_job_file_path + '.tmp'``, optionally
    syntax-checked with "/bin/bash -n", made executable for user, group
    and others, then renamed to ``local_job_file_path``.

    Args:
        local_job_file_path: Final path of the job script.
        job_conf: Job configuration; the 'host', 'owner' and 'suite_name'
            keys are read here, and the whole mapping is passed on to the
            section writers.
        check_syntax: If true, run "/bin/bash -n" on the temporary file.

    Raises:
        IOError: If writing the temporary file fails (the temporary file
            is removed first).
        OSError: If "/bin/bash" cannot be started (the temporary file is
            removed first).
        RuntimeError: If the syntax check fails; the message is bash's
            STDERR and the temporary file is left behind for debugging.
    """
    # ########### !!!!!!!! WARNING !!!!!!!!!!! #####################
    # BE EXTREMELY WARY OF CHANGING THE ORDER OF JOB SCRIPT SECTIONS
    # Users may be relying on the existing order (see for example
    # the comment below on suite bin path being required before
    # task runtime environment setup).
    # ##############################################################

    # Access to cylc must be configured before user environment so
    # that cylc commands can be used in defining user environment
    # variables: NEXT_CYCLE=$( cylc cycle-point --offset-hours=6 )
    tmp_name = local_job_file_path + '.tmp'
    run_d = get_remote_suite_run_dir(
        job_conf['host'], job_conf['owner'], job_conf['suite_name'])
    try:
        with open(tmp_name, 'w') as handle:
            self._write_header(handle, job_conf)
            self._write_directives(handle, job_conf)
            self._write_prelude(handle, job_conf)
            self._write_environment_1(handle, job_conf, run_d)
            self._write_global_init_script(handle, job_conf)
            # suite bin access must be before runtime environment
            # because suite bin commands may be used in variable
            # assignment expressions: FOO=$(command args).
            self._write_environment_2(handle, job_conf)
            self._write_script(handle, job_conf)
            self._write_epilogue(handle, job_conf, run_d)
    except IOError as exc:
        # Remove temporary file
        try:
            os.unlink(tmp_name)
        except OSError:
            pass
        raise exc

    # check syntax
    if check_syntax:
        try:
            # The child keeps its own copy of the descriptor, so the
            # parent's handle can be closed as soon as Popen returns.
            with open(os.devnull) as devnull:
                proc = Popen(
                    ['/bin/bash', '-n', tmp_name],
                    stderr=PIPE, stdin=devnull)
        except OSError as exc:
            # Popen has a bad habit of not telling you anything if it fails
            # to run the executable.
            if exc.filename is None:
                exc.filename = '/bin/bash'
            # Remove temporary file
            try:
                os.unlink(tmp_name)
            except OSError:
                pass
            raise exc
        else:
            # Drain STDERR while waiting: calling wait() first can
            # deadlock if the child fills the pipe buffer.
            err = proc.communicate()[1]
            if proc.returncode:
                # This will leave behind the temporary file,
                # which is useful for debugging syntax errors, etc.
                raise RuntimeError(err.decode())

    # Make job file executable
    mode = (
        os.stat(tmp_name).st_mode |
        stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
    os.chmod(tmp_name, mode)
    os.rename(tmp_name, local_job_file_path)
def _remote_init_callback(
        self, proc_ctx, platform, tmphandle,
        curve_auth, client_pub_key_dir):
    """Callback when "cylc remote-init" exits.

    On a zero return code:
      * if ``REMOTE_INIT_DONE`` appears in the command output, run the
        rsync-over-ssh file installation (600 second timeout); any
        exception marks the install target ``REMOTE_INIT_FAILED``;
      * if "KEYSTART" appears in the output, write the text between
        "KEYSTART" and "KEYEND" to the client public key file and
        reconfigure ``curve_auth``;
      * record ``REMOTE_INIT_DONE`` or ``REMOTE_INIT_NOT_REQUIRED`` in
        ``self.remote_init_map`` if either appears in the output.
    Otherwise log the failure and record ``REMOTE_INIT_FAILED``.

    Args:
        proc_ctx: Finished command context; ``ret_code``, ``out``, ``err``
            and ``cmd`` are read.
        platform: Platform settings; the 'install target' key is read.
        tmphandle: Temporary file that was fed to the command; closed here.
        curve_auth: The ZMQ authenticator.
        client_pub_key_dir: Client public key directory, used by the ZMQ
            authenticator.
    """
    self.ready = True
    try:
        tmphandle.close()
    except OSError:  # E.g. ignore bad unlink, etc
        pass
    install_target = platform['install target']
    self.install_target = install_target
    if proc_ctx.ret_code == 0:
        if REMOTE_INIT_DONE in proc_ctx.out:
            src_path = get_suite_run_dir(self.suite)
            dst_path = get_remote_suite_run_dir(platform, self.suite)
            try:
                process = procopen(
                    construct_rsync_over_ssh_cmd(
                        src_path, dst_path, platform, self.rsync_includes),
                    stdoutpipe=True,
                    stderrpipe=True,
                    universal_newlines=True)
                out, err = process.communicate(timeout=600)
                if out:
                    RSYNC_LOG.info(
                        'File installation information for '
                        f'{install_target}:\n {out}')
                if err:
                    LOG.error(
                        'File installation error on '
                        f'{install_target}:\n {err}')
            except Exception as ex:
                LOG.error(f"Problem during rsync: {ex}")
                self.remote_init_map[install_target] = REMOTE_INIT_FAILED
                return
        if "KEYSTART" in proc_ctx.out:
            # NOTE(review): if "KEYEND" is missing the search returns None
            # and .group() raises AttributeError - confirm that the remote
            # command always emits both markers.
            regex_result = re.search(
                'KEYSTART((.|\n|\r)*)KEYEND', proc_ctx.out)
            key = regex_result.group(1)
            suite_srv_dir = get_suite_srv_dir(self.suite)
            public_key = KeyInfo(
                KeyType.PUBLIC,
                KeyOwner.CLIENT,
                suite_srv_dir=suite_srv_dir,
                install_target=install_target
            )
            # Create the key file readable/writable by the owner only, and
            # always restore the process umask, even if the write fails.
            old_umask = os.umask(0o177)
            try:
                with open(
                        public_key.full_key_path,
                        'w', encoding='utf8') as text_file:
                    text_file.write(key)
            finally:
                os.umask(old_umask)
            # configure_curve must be called every time certificates are
            # added or removed, in order to update the Authenticator's
            # state.
            curve_auth.configure_curve(
                domain='*', location=(client_pub_key_dir))
        for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
            if status in proc_ctx.out:
                # Good status
                LOG.debug(proc_ctx)
                self.remote_init_map[install_target] = status
                return
    # Bad status
    LOG.error(TaskRemoteMgmtError(
        TaskRemoteMgmtError.MSG_INIT,
        install_target,
        ' '.join(quote(item) for item in proc_ctx.cmd),
        proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
    LOG.error(proc_ctx)
    self.remote_init_map[install_target] = REMOTE_INIT_FAILED
def remote_init(self, platform, curve_auth, client_pub_key_dir):
    """Initialise a remote [owner@]host if necessary.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": For TCP task communication
        "python/": if source exists

    Side effect: ``self.install_target`` is set to the platform's install
    target.

    Args:
        platform (dict):
            A dictionary containing settings relating to platform used in
            this remote installation.
        curve_auth (ThreadAuthenticator):
            The ZMQ authenticator.
        client_pub_key_dir (str):
            Client public key directory, used by the ZMQ authenticator.

    Return:
        REMOTE_INIT_NOT_REQUIRED:
            If remote init is not required, e.g. not remote
        REMOTE_INIT_DONE:
            If remote init done.
        REMOTE_INIT_FAILED:
            If init of the remote failed.
            Note: the recorded status is deleted so the next call retries.
        None:
            If waiting for remote init command to complete
    """
    self.install_target = platform['install target']

    # If task is running locally or the install target is localhost
    # we can skip the rest of this function
    if (self.install_target == 'localhost' or
            self.single_task_mode or
            not is_remote_host(get_host_from_platform(platform))):
        LOG.debug(f"REMOTE INIT NOT REQUIRED for {self.install_target}")
        return REMOTE_INIT_NOT_REQUIRED

    # See if a previous attempt to initialise this install target has
    # been recorded.
    try:
        status = self.remote_init_map[platform['install target']]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            # Forget the failure so the next call retries
            del self.remote_init_map[platform['install target']]
        return status

    # Determine what items to install
    comm_meth = platform['communication method']
    # Get a list of files and folders to install.
    items = self._remote_init_items(comm_meth)

    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = self.proc_pool.get_temporary_file()
    # The context manager closes the archive even if add() raises
    with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
    # Rewind so the archive is read from the start when used as stdin
    tmphandle.seek(0)

    # Build the remote-init command to be run over ssh
    cmd = ['remote-init']
    if cylc.flow.flags.debug:
        cmd.append('--debug')
    if comm_meth in ['ssh']:
        cmd.append('--indirect-comm=%s' % comm_meth)
    cmd.append(str(self.install_target))
    cmd.append(get_remote_suite_run_dir(platform, self.suite))

    # Create the ssh command
    cmd = construct_platform_ssh_cmd(cmd, platform)
    self.proc_pool.put_command(
        SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
        self._remote_init_callback,
        [platform, tmphandle, curve_auth, client_pub_key_dir])
    # None status: Waiting for command to finish
    self.remote_init_map[platform['install target']] = None
    return self.remote_init_map[platform['install target']]