def _remote_init_items(self, comm_meth):
    """Work out which files to install for a communication method.

    Args:
        comm_meth (str): Task communication method; only 'ssh' and 'zmq'
            produce any items.

    Return (list):
        Each item is (source_path, dest_path) where:
        - source_path is the path to the source file to install.
        - dest_path is relative path under suite run directory
          at target remote.

    """
    if comm_meth not in ['ssh', 'zmq']:
        # Nothing needs installing for any other method.
        return []

    service_dirname = SuiteFiles.Service.DIRNAME
    # Contact file
    to_install = [(
        get_contact_file(self.suite),
        os.path.join(service_dirname, SuiteFiles.Service.CONTACT),
    )]

    if comm_meth == 'zmq':
        # ZMQ additionally needs the server public key and the client
        # private key, both placed in the remote service directory.
        srv_dir = get_suite_srv_dir(self.suite)
        key_specs = (
            (KeyType.PUBLIC, KeyOwner.SERVER),
            (KeyType.PRIVATE, KeyOwner.CLIENT),
        )
        for key_type, key_owner in key_specs:
            key_info = KeyInfo(key_type, key_owner, suite_srv_dir=srv_dir)
            to_install.append((
                key_info.full_key_path,
                os.path.join(service_dirname, key_info.file_name),
            ))
    return to_install
def _socket_bind(self, min_port, max_port, srv_prv_key_loc=None):
    """Bind socket.

    Will use a port range provided to select random ports.

    Args:
        min_port (int): Lower bound of the port range.
        max_port (int): Upper bound of the port range. If equal to
            min_port, exactly that port is bound.
        srv_prv_key_loc (str): Full path to the server private key file.
            If None, the key is located via the suite service directory.

    Raises:
        SuiteServiceFileError: If the server key pair cannot be loaded.
        CylcError: If the socket cannot be bound.

    """
    if srv_prv_key_loc is None:
        # Create new KeyInfo object for the server private key
        suite_srv_dir = get_suite_srv_dir(self.suite)
        srv_prv_key_info = KeyInfo(
            KeyType.PRIVATE,
            KeyOwner.SERVER,
            suite_srv_dir=suite_srv_dir)
    else:
        srv_prv_key_info = KeyInfo(
            KeyType.PRIVATE,
            KeyOwner.SERVER,
            full_key_path=srv_prv_key_loc)

    # create socket
    self.socket = self.context.socket(self.pattern)
    self._socket_options()

    key_path = srv_prv_key_info.full_key_path
    try:
        server_public_key, server_private_key = (
            zmq.auth.load_certificate(key_path))
    except ValueError as exc:
        # Keep the original exception as the cause for easier debugging.
        raise SuiteServiceFileError(
            f"Failed to find server's public key in {key_path}."
        ) from exc
    except OSError as exc:
        raise SuiteServiceFileError(
            f"IO error opening server's private key from {key_path}."
        ) from exc

    if server_private_key is None:  # this can't be caught by exception
        raise SuiteServiceFileError(
            f"Failed to find server's private key in {key_path}.")

    self.socket.curve_publickey = server_public_key
    self.socket.curve_secretkey = server_private_key
    self.socket.curve_server = True

    try:
        if min_port == max_port:
            self.port = min_port
            self.socket.bind(f'tcp://*:{min_port}')
        else:
            self.port = self.socket.bind_to_random_port(
                'tcp://*', min_port, max_port)
    except (zmq.error.ZMQError, zmq.error.ZMQBindError) as exc:
        raise CylcError(
            f'could not start Cylc ZMQ server: {exc}') from exc

    if self.barrier is not None:
        # Signal any waiting party that the socket is now bound.
        self.barrier.wait()
def scheduler_cli(parser, options, args, is_restart=False):
    """CLI main.

    Args:
        parser: Command line parser (not used in this function).
        options: Parsed command line options; ``host`` and ``owner``
            are read here.
        args (list): Positional arguments; the first is the suite name.
        is_restart (bool): True for "restart", False for "run".

    """
    reg = args[0]

    # Check suite is not already running before start of host selection.
    try:
        suite_files.detect_old_contact_file(reg)
    except SuiteServiceFileError as exc:
        sys.exit(exc)

    run_dir = get_suite_run_dir(reg)
    if not os.path.exists(run_dir):
        sys.stderr.write(
            f'suite service directory not found at: {run_dir}\n')
        sys.exit(1)

    # Create auth files if needed.
    suite_files.create_auth_files(reg)

    # Extract job.sh from library, for use in job scripts.
    extract_resources(suite_files.get_suite_srv_dir(reg), ['etc/job.sh'])

    # Check whether a run host is explicitly specified, else select one.
    if not options.host:
        try:
            host = HostAppointer().appoint_host()
        except EmptyHostList as exc:
            if cylc.flow.flags.debug:
                raise
            sys.exit(str(exc))
        if is_remote_host(host):
            sub_command = "restart" if is_restart else "run"
            # "--host=localhost" prevents recursive host selection
            base_cmd = [sub_command, *sys.argv[1:], "--host=localhost"]
            return remote_cylc_cmd(base_cmd, host=host)

    if remrun(set_rel_local=True):  # State localhost as above.
        sys.exit()

    try:
        suite_files.get_suite_source_dir(reg, options.owner)
    except SuiteServiceFileError:
        # Source path is assumed to be the run directory
        suite_files.register(reg, get_suite_run_dir(reg))

    try:
        scheduler = Scheduler(is_restart, options, args)
    except SuiteServiceFileError as exc:
        sys.exit(exc)
    scheduler.start()
def _socket_connect(self, host, port, srv_public_key_loc=None):
    """Connect socket to stub.

    Args:
        host (str): Host to connect to.
        port (int): Port to connect to.
        srv_public_key_loc (str): Full path to the server public key file.
            If None, the key is located via the suite service directory.

    Raises:
        ClientError: If the client private key or the server public key
            cannot be loaded.

    """
    suite_srv_dir = get_suite_srv_dir(self.suite)
    if srv_public_key_loc is None:
        # Create new KeyInfo object for the server public key
        srv_pub_key_info = KeyInfo(
            KeyType.PUBLIC,
            KeyOwner.SERVER,
            suite_srv_dir=suite_srv_dir)
    else:
        srv_pub_key_info = KeyInfo(
            KeyType.PUBLIC,
            KeyOwner.SERVER,
            full_key_path=srv_public_key_loc)

    self.host = host
    self.port = port
    self.socket = self.context.socket(self.pattern)
    self._socket_options()

    client_priv_key_info = KeyInfo(
        KeyType.PRIVATE,
        KeyOwner.CLIENT,
        suite_srv_dir=suite_srv_dir)
    error_msg = "Failed to find user's private key, so cannot connect."
    try:
        client_public_key, client_priv_key = zmq.auth.load_certificate(
            client_priv_key_info.full_key_path)
    except (OSError, ValueError) as exc:
        # Keep the original exception as the cause for easier debugging.
        raise ClientError(error_msg) from exc
    if client_priv_key is None:  # this can't be caught by exception
        raise ClientError(error_msg)
    self.socket.curve_publickey = client_public_key
    self.socket.curve_secretkey = client_priv_key

    # A client can only connect to the server if it knows its public key,
    # so we grab this from the location it was created on the filesystem:
    try:
        # 'load_certificate' will try to load both public & private keys
        # from a provided file but will return None, not throw an error,
        # for the latter item if not there (as for all public key files)
        # so it is OK to use; there is no method to load only the
        # public key.
        server_public_key = zmq.auth.load_certificate(
            srv_pub_key_info.full_key_path)[0]
        self.socket.curve_serverkey = server_public_key
    except (OSError, ValueError) as exc:  # ValueError raised w/ no public key
        raise ClientError(
            "Failed to load the suite's public key, so cannot connect."
        ) from exc

    self.socket.connect(f'tcp://{host}:{port}')
def _remote_init_callback(self, proc_ctx, platform, tmphandle,
                          curve_auth, client_pub_key_dir):
    """Callback when "cylc remote-init" exits.

    Write public key for install target into client public key directory.
    Set remote_init_map status to REMOTE_INIT_DONE on success.
    Set remote_init_map status to REMOTE_INIT_FAILED on error, including
    the case where the command output has a "KEYSTART" marker but no
    matching "KEYEND".

    Args:
        proc_ctx: Context of the finished command; ``ret_code``, ``out``,
            ``err`` and ``cmd`` are read.
        platform (dict): Platform settings; ``'install target'`` is read.
        tmphandle: Temporary file handle passed to the command; closed here.
        curve_auth: Authenticator whose ``configure_curve`` is called after
            a new client public key has been written.
        client_pub_key_dir: Location given to ``configure_curve``.

    """
    try:
        tmphandle.close()
    except OSError:  # E.g. ignore bad unlink, etc
        pass
    install_target = platform['install target']
    if proc_ctx.ret_code == 0:
        key_block_ok = True
        if "KEYSTART" in proc_ctx.out:
            # DOTALL lets "." span newlines in the multi-line key text.
            regex_result = re.search(
                r'KEYSTART(.*)KEYEND', proc_ctx.out, re.DOTALL)
            if regex_result is None:
                # Start marker without an end marker: treat as a failure
                # instead of crashing on ``None.group``.
                key_block_ok = False
            else:
                key = regex_result.group(1)
                suite_srv_dir = get_suite_srv_dir(self.suite)
                public_key = KeyInfo(
                    KeyType.PUBLIC,
                    KeyOwner.CLIENT,
                    suite_srv_dir=suite_srv_dir,
                    install_target=install_target)
                # Restrict permissions of the new key file to the owner,
                # and always restore the previous umask afterwards.
                old_umask = os.umask(0o177)
                try:
                    with open(public_key.full_key_path, 'w',
                              encoding='utf8') as text_file:
                        text_file.write(key)
                finally:
                    os.umask(old_umask)
                # configure_curve must be called every time certificates
                # are added or removed, in order to update the
                # Authenticator's state.
                curve_auth.configure_curve(
                    domain='*', location=(client_pub_key_dir))
        if key_block_ok:
            self.remote_init_map[install_target] = REMOTE_INIT_DONE
            self.ready = True
            return
    # Bad status
    LOG.error(
        TaskRemoteMgmtError(
            TaskRemoteMgmtError.MSG_INIT,
            install_target,
            ' '.join(quote(item) for item in proc_ctx.cmd),
            proc_ctx.ret_code,
            proc_ctx.out,
            proc_ctx.err))
    self.remote_init_map[platform['install target']] = REMOTE_INIT_FAILED
    self.ready = True
def key_housekeeping(reg, platform=None, create=True):
    """Clean any existing authentication keys and create new ones.

    If create is set to false, keys will only be cleaned from server.

    Args:
        reg (str): Suite name, used to locate the suite service directory.
        platform: Passed as the install target of the client public key.
        create (bool): Whether to create new server keys after cleaning.

    """
    srv_dir = get_suite_srv_dir(reg)

    client_public = KeyInfo(
        KeyType.PUBLIC, KeyOwner.CLIENT,
        suite_srv_dir=srv_dir, install_target=platform)
    client_private = KeyInfo(
        KeyType.PRIVATE, KeyOwner.CLIENT, suite_srv_dir=srv_dir)
    server_public = KeyInfo(
        KeyType.PUBLIC, KeyOwner.SERVER, suite_srv_dir=srv_dir)
    server_private = KeyInfo(
        KeyType.PRIVATE, KeyOwner.SERVER, suite_srv_dir=srv_dir)

    keys = {
        "client_public_key": client_public,
        "client_private_key": client_private,
        "server_public_key": server_public,
        "server_private_key": server_private,
    }

    remove_keys_on_server(keys)
    if not create:
        return
    create_server_keys(keys, srv_dir)
def test_client_requires_valid_server_public_key_in_private_key_file():
    """Client should not be able to connect to host/port without
    server public key."""
    suite_name = f"test_suite-{time()}"
    port = random.choice(PORT_RANGE)
    client = ZMQSocketBase(zmq.REP, suite=suite_name)

    test_suite_srv_dir = get_suite_srv_dir(reg=suite_name)
    key_info = KeyInfo(
        KeyType.PRIVATE,
        KeyOwner.CLIENT,
        suite_srv_dir=test_suite_srv_dir)
    directory = os.path.expanduser("~/cylc-run")
    tmpdir = os.path.join(directory, suite_name)

    try:
        # Create only the client certificates, so that the server public
        # key lookup at "fake_location" is the step that fails.
        os.makedirs(key_info.key_path, exist_ok=True)
        zmq.auth.create_certificates(key_info.key_path, "client")

        with pytest.raises(
                ClientError,
                match=r"Failed to load the suite's public "
                      r"key, so cannot connect."):
            client.start(HOST, port, srv_public_key_loc="fake_location")

        client.stop()
    finally:
        # Remove the per-test suite directory even if the test fails.
        rmtree(tmpdir, ignore_errors=True)
def setup_keys(suite_name):
    """Remove and recreate the server and client keys for a suite.

    Existing keys are removed first (with a full clean on the client
    side), then fresh server and client keys are created.

    Args:
        suite_name (str): Suite name, used to locate the suite service
            directory.

    """
    srv_dir = get_suite_srv_dir(suite_name)
    key_specs = (
        ("client_public_key", KeyType.PUBLIC, KeyOwner.CLIENT),
        ("client_private_key", KeyType.PRIVATE, KeyOwner.CLIENT),
        ("server_public_key", KeyType.PUBLIC, KeyOwner.SERVER),
        ("server_private_key", KeyType.PRIVATE, KeyOwner.SERVER),
    )
    server_keys = {
        label: KeyInfo(key_type, key_owner, suite_srv_dir=srv_dir)
        for label, key_type, key_owner in key_specs
    }

    # Clean out anything left over before creating fresh keys.
    remove_keys_on_server(server_keys)
    remove_keys_on_client(srv_dir, None, full_clean=True)

    create_server_keys(server_keys, srv_dir)
    create_client_keys(srv_dir, None)
def remote_tidy(self):
    """Remove suite contact files from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    Also remove UUID file on suite host ".service/uuid".
    """
    # Function-scope import: only the poll loop below needs it.
    from time import sleep

    def _warn_on_failure(target, cmd, proc, out, err):
        """Log a warning if a finished tidy command exited non-zero."""
        if proc.wait():
            LOG.warning(
                TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    target,
                    ' '.join(quote(item) for item in cmd),
                    proc.returncode,
                    out,
                    err))

    # Remove UUID file
    uuid_fname = os.path.join(
        get_suite_srv_dir(self.suite), FILE_BASE_UUID)
    try:
        os.unlink(uuid_fname)
    except OSError:
        # Best effort: the file may already be gone.
        pass

    # Issue all SSH commands in parallel
    procs = {}
    for (host, owner), init_with_contact in self.remote_init_map.items():
        if init_with_contact != REMOTE_INIT_DONE:
            continue
        cmd = ['timeout', '10', 'cylc', 'remote-tidy']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        cmd.append(get_remote_suite_run_dir(host, owner, self.suite))
        procs[(host, owner)] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=DEVNULL))

    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        for target, (cmd, proc) in list(procs.items()):
            if proc.poll() is None:
                continue
            del procs[target]
            out, err = (f.decode() for f in proc.communicate())
            _warn_on_failure(target, cmd, proc, out, err)
        if procs:
            # Yield the CPU between polls rather than spinning.
            sleep(0.1)

    # Terminate any remaining commands
    for target, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        # Decode here too, so both paths report text rather than bytes.
        out, err = (f.decode() for f in proc.communicate())
        _warn_on_failure(target, cmd, proc, out, err)
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to identify
    shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": For TCP task communication
        ".service/passphrase": For TCP task communication
        "python/": if source exists

    Args:
        host (str): Remote host name.
        owner (str): Remote user name.

    Return:
        REMOTE_INIT_NOT_REQUIRED:
            If remote init is not required, e.g. not remote
        REMOTE_INIT_DONE:
            If remote init done.
        REMOTE_INIT_FAILED:
            If init of the remote failed.
            Note: this will reset to None to allow retry.
        None:
            If waiting for remote init command to complete

    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            del self.remote_init_map[(host, owner)]  # reset to allow retry
        return status

    # Determine what items to install
    comm_meth = glbl_cfg().get_host_item(
        'task communication method', host, owner)
    owner_at_host = 'localhost'
    if host:
        owner_at_host = host
    if owner:
        owner_at_host = owner + '@' + owner_at_host
    LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
    items = self._remote_init_items(comm_meth)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]

    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = self.proc_pool.get_temporary_file()
    # The context manager closes the archive even if adding a file fails.
    with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
    # Rewind so the command reads the archive from its start.
    tmphandle.seek(0)

    # UUID file - for remote to identify shared file system with suite host
    uuid_fname = os.path.join(
        get_suite_srv_dir(self.suite), FILE_BASE_UUID)
    if not os.path.exists(uuid_fname):
        # Use a context manager so the file is flushed and closed promptly.
        with open(uuid_fname, 'wb') as uuid_file:
            uuid_file.write(str(self.uuid_str).encode())

    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flow.flags.debug:
        cmd.append('--debug')
    if comm_meth in ['ssh']:
        cmd.append('--indirect-comm=%s' % comm_meth)
    cmd.append(str(self.uuid_str))
    cmd.append(get_remote_suite_run_dir(host, owner, self.suite))
    self.proc_pool.put_command(
        SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
        self._remote_init_callback,
        [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]
def main(parser, options, suite, *task_ids):
    """cylc submit CLI.

    No TASK EVENT HOOKS are set for the submit command because there is
    no scheduler instance watching for task failure etc.

    Note: a suite contact env file is not written by this command (it
    would overwrite the real one if the suite is running).

    Exits the process with status 1 if any job file could not be prepared
    (dry run) or any task failed to submit, otherwise with status 0.
    """
    if not (options.verbose or options.debug):
        LOG.setLevel(WARNING)

    for task_id in task_ids:
        if not TaskID.is_valid_id(task_id):
            raise UserInputError("Invalid task ID %s" % task_id)

    suiterc = get_suite_rc(suite)
    # For user-defined batch system handlers
    sys.path.append(os.path.join(os.path.dirname(suiterc), 'python'))

    # Load suite config and tasks
    template_vars = load_template_vars(
        options.templatevars, options.templatevars_file)
    config = SuiteConfig(suite, suiterc, options, template_vars)

    itasks = []
    for task_id in task_ids:
        name_str, point_str = TaskID.split(task_id)
        taskdefs = config.find_taskdefs(name_str)
        if not taskdefs:
            raise UserInputError("No task found for %s" % task_id)
        itasks.extend(
            TaskProxy(
                taskdef,
                get_point(point_str).standardise(),
                is_startup=True)
            for taskdef in taskdefs)

    # Initialise job submit environment
    make_suite_run_tree(suite)
    # Extract job.sh from library, for use in job scripts.
    extract_resources(get_suite_srv_dir(suite), ['etc/job.sh'])

    pool = SubProcPool()
    owner = get_user()
    job_pool = JobPool(suite, owner)
    db_mgr = SuiteDatabaseManager()
    events_mgr = TaskEventsManager(
        suite, pool, db_mgr, BroadcastMgr(db_mgr), job_pool)
    task_job_mgr = TaskJobManager(
        suite, pool, db_mgr, events_mgr, job_pool)
    task_job_mgr.task_remote_mgr.single_task_mode = True

    scheduling_cfg = config.cfg['scheduling']
    task_job_mgr.job_file_writer.set_suite_env({
        'CYLC_UTC': str(config.cfg['cylc']['UTC mode']),
        'CYLC_DEBUG': str(cylc.flow.flags.debug).lower(),
        'CYLC_VERBOSE': str(cylc.flow.flags.verbose).lower(),
        'CYLC_SUITE_NAME': suite,
        'CYLC_CYCLING_MODE': str(scheduling_cfg['cycling mode']),
        'CYLC_SUITE_INITIAL_CYCLE_POINT': str(
            scheduling_cfg['initial cycle point']),
        'CYLC_SUITE_FINAL_CYCLE_POINT': str(
            scheduling_cfg['final cycle point']),
    })

    def _process_and_pause():
        """Run queued sub-processes, then wait before the next attempt."""
        task_job_mgr.proc_pool.process()
        sleep(1.0)

    ret_code = 0
    waiting_tasks = list(itasks)
    if options.dry_run:
        while waiting_tasks:
            prep_tasks, bad_tasks = task_job_mgr.prep_submit_task_jobs(
                suite, waiting_tasks, dry_run=True)
            for itask in prep_tasks + bad_tasks:
                waiting_tasks.remove(itask)
            if waiting_tasks:
                _process_and_pause()

        for itask in itasks:
            if itask.local_job_file_path:
                print('JOB SCRIPT=%s' % itask.local_job_file_path)
                continue
            print(
                'Unable to prepare job file for %s' % itask.identity,
                file=sys.stderr)
            ret_code = 1
    else:
        while waiting_tasks:
            for itask in task_job_mgr.submit_task_jobs(suite, waiting_tasks):
                waiting_tasks.remove(itask)
            if waiting_tasks:
                _process_and_pause()

        # Drain the remaining sub-processes before reporting.
        while task_job_mgr.proc_pool.is_not_done():
            task_job_mgr.proc_pool.process()

        for itask in itasks:
            if itask.summary.get('submit_method_id') is not None:
                print('[%s] Job ID: %s' % (
                    itask.identity, itask.summary['submit_method_id']))
            if itask.state(TASK_STATUS_SUBMIT_FAILED):
                ret_code = 1

    sys.exit(ret_code)
def _remote_init_callback(
        self, proc_ctx, platform, tmphandle,
        curve_auth, client_pub_key_dir):
    """Callback when "cylc remote-init" exits.

    On a zero return code:
    - if the output reports REMOTE_INIT_DONE, rsync the suite run directory
      to the remote; any exception during that step marks the install
      target REMOTE_INIT_FAILED;
    - if the output contains a "KEYSTART"..."KEYEND" block, write it as the
      client public key for the install target and reconfigure the
      authenticator; a "KEYSTART" without "KEYEND" is treated as a failure;
    - record the reported status in remote_init_map.

    Otherwise log the failure and record REMOTE_INIT_FAILED.

    Args:
        proc_ctx: Context of the finished command; ``ret_code``, ``out``,
            ``err`` and ``cmd`` are read.
        platform (dict): Platform settings; ``'install target'`` is read.
        tmphandle: Temporary file handle passed to the command; closed here.
        curve_auth: Authenticator whose ``configure_curve`` is called after
            a new client public key has been written.
        client_pub_key_dir: Location given to ``configure_curve``.

    """
    self.ready = True
    try:
        tmphandle.close()
    except OSError:  # E.g. ignore bad unlink, etc
        pass
    self.install_target = platform['install target']
    if proc_ctx.ret_code == 0:
        if REMOTE_INIT_DONE in proc_ctx.out:
            src_path = get_suite_run_dir(self.suite)
            dst_path = get_remote_suite_run_dir(platform, self.suite)
            try:
                process = procopen(
                    construct_rsync_over_ssh_cmd(
                        src_path,
                        dst_path,
                        platform,
                        self.rsync_includes),
                    stdoutpipe=True,
                    stderrpipe=True,
                    universal_newlines=True)
                out, err = process.communicate(timeout=600)
                if out:
                    RSYNC_LOG.info(
                        'File installation information for '
                        f'{self.install_target}:\n {out}')
                if err:
                    LOG.error(
                        'File installation error on '
                        f'{self.install_target}:\n {err}')
            except Exception as ex:
                LOG.error(f"Problem during rsync: {ex}")
                self.remote_init_map[self.install_target] = (
                    REMOTE_INIT_FAILED)
                return

        key_block_ok = True
        if "KEYSTART" in proc_ctx.out:
            # DOTALL lets "." span newlines in the multi-line key text.
            regex_result = re.search(
                r'KEYSTART(.*)KEYEND', proc_ctx.out, re.DOTALL)
            if regex_result is None:
                # Start marker without an end marker: treat as a failure
                # instead of crashing on ``None.group``.
                key_block_ok = False
            else:
                key = regex_result.group(1)
                suite_srv_dir = get_suite_srv_dir(self.suite)
                public_key = KeyInfo(
                    KeyType.PUBLIC,
                    KeyOwner.CLIENT,
                    suite_srv_dir=suite_srv_dir,
                    install_target=self.install_target
                )
                # Restrict permissions of the new key file to the owner,
                # and always restore the previous umask afterwards.
                old_umask = os.umask(0o177)
                try:
                    with open(
                            public_key.full_key_path,
                            'w', encoding='utf8') as text_file:
                        text_file.write(key)
                finally:
                    os.umask(old_umask)
                # configure_curve must be called every time certificates
                # are added or removed, in order to update the
                # Authenticator's state.
                curve_auth.configure_curve(
                    domain='*', location=(client_pub_key_dir))

        if key_block_ok:
            for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
                if status in proc_ctx.out:
                    # Good status
                    LOG.debug(proc_ctx)
                    self.remote_init_map[self.install_target] = status
                    return
    # Bad status
    LOG.error(TaskRemoteMgmtError(
        TaskRemoteMgmtError.MSG_INIT,
        platform['install target'], ' '.join(
            quote(item) for item in proc_ctx.cmd),
        proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
    LOG.error(proc_ctx)
    self.remote_init_map[platform['install target']] = REMOTE_INIT_FAILED