def max_servers(self, resource_desc):
    """
    Returns the total of :meth:`max_servers` across all
    :class:`LocalAllocator` in the cluster.

    resource_desc: dict
        Description of required resources.
    """
    credentials = get_credentials()

    key = 'allocator'
    value = resource_desc.get(key, '')
    if value:
        if self.name != value:
            return 0
        else:
            # Any host in our cluster is OK.
            resource_desc = resource_desc.copy()
            del resource_desc[key]

    with self._lock:
        # Drain _reply_q.
        while True:
            try:
                self._reply_q.get_nowait()
            except Queue.Empty:
                break

        # Get counts via worker threads.
        todo = []
        max_workers = 10
        for i, allocator in enumerate(self._allocators.values()):
            if i < max_workers:
                worker_q = WorkerPool.get()
                worker_q.put((self._get_count,
                              (allocator, resource_desc, credentials),
                              {}, self._reply_q))
            else:
                todo.append(allocator)

        # Process counts.
        total = 0
        for i in range(len(self._allocators)):
            worker_q, retval, exc, trace = self._reply_q.get()
            if exc:
                self._logger.error(trace)
                raise exc

            try:
                next_allocator = todo.pop(0)
            except IndexError:
                WorkerPool.release(worker_q)
            else:
                worker_q.put((self._get_count,
                              (next_allocator, resource_desc, credentials),
                              {}, self._reply_q))

            count = retval
            if count:
                total += count

        return total
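# A hedged usage sketch for max_servers() above. 'cluster' stands in for a
# ClusterAllocator-like instance and is hypothetical; the 'n_cpus' and
# 'allocator' keys follow the resource descriptions used in this module.
def _example_max_servers(cluster):
    resource_desc = {'n_cpus': 1}            # one CPU per server
    total = cluster.max_servers(resource_desc)
    if total == 0:
        raise RuntimeError('no LocalAllocator can satisfy the request')
    return total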
def test_access_controller(self):
    logging.debug('')
    logging.debug('test_access_controller')

    # Credential-to-role mapping.
    owner = get_credentials()
    controller = AccessController()
    self.assertEqual(controller.get_role(None), '')
    self.assertEqual(controller.get_role(owner), 'owner')
    user = Credentials()
    user.user = '******'
    self.assertEqual(controller.get_role(user), 'user')
    assert_raises(self, 'controller.get_role(object())',
                  globals(), locals(), TypeError,
                  'credentials is not a Credentials object')

    # Proxy role-to-credential mapping.
    obj = Object()
    assert_raises(self, 'controller.get_proxy_credentials(obj.no_rbac, user)',
                  globals(), locals(), RoleError, 'No RBAC for method')
    self.assertEqual(controller.get_proxy_credentials(obj.single_role, user),
                     user)
    self.assertEqual(controller.get_proxy_credentials(obj.proxy_role, user),
                     owner)
    assert_raises(self,
                  'controller.get_proxy_credentials(obj.proxy_other, user)',
                  globals(), locals(), RoleError,
                  'No credentials for proxy role other')
    assert_raises(self,
                  "controller.set_proxy_credentials('other', object())",
                  globals(), locals(), TypeError,
                  'credentials is not a Credentials object')
    other = Credentials()
    other.user = '******'
    controller.set_proxy_credentials('other', other)
    self.assertEqual(controller.get_proxy_credentials(obj.proxy_other, user),
                     other)

    # Attribute access.
    controller.check_access('user', '__getattr__', obj, 'dummy')
    controller.check_access('owner', '__setattr__', obj, 'dummy')
    assert_raises(self,
                  "controller.check_access('user', '__delattr__', obj, 'dummy')",
                  globals(), locals(), RoleError,
                  "No __delattr__ access to 'dummy' by role 'user'")
    assert_raises(self,
                  "controller.check_access('', '__getattr__', obj, 'dummy')",
                  globals(), locals(), RoleError, 'No access by null role')

    # Attribute proxying.
    proxy_value = ProxyRequired()
    self.assertFalse(controller.need_proxy(obj, 'dummy', proxy_value))
    controller.attr_proxy_required(obj, 'dummy')
    self.assertTrue(controller.need_proxy(obj, 'dummy', proxy_value))
    controller.attr_proxy_required(obj, 'dummy', False)
    self.assertFalse(controller.need_proxy(obj, 'dummy', proxy_value))
    controller.class_proxy_required(ProxyRequired)
    self.assertTrue(controller.need_proxy(obj, 'dummy', proxy_value))
def cleanup(self):
    """ Shut down all remaining :class:`ObjServers`. """
    self._logger.debug('cleanup')
    cleanup_creds = get_credentials()
    servers = self._managers.keys()
    for server in servers:
        # Cleanup overrides release() 'owner' protection.
        set_credentials(self._managers[server][2])
        try:
            self.release(server)
        finally:
            set_credentials(cleanup_creds)
    self._managers = {}
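# The save/override/restore credential pattern used by cleanup() above, shown
# in isolation. A sketch only: 'owner_creds' and 'privileged_call' are
# hypothetical placeholders for the owner's Credentials and the guarded
# operation.
def _example_credential_override(owner_creds, privileged_call):
    saved = get_credentials()
    set_credentials(owner_creds)     # act as the server's owner
    try:
        privileged_call()
    finally:
        set_credentials(saved)       # always restore the caller's identity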
def test_rsh(self):
    logging.debug('')
    logging.debug('test_rsh')

    testdir = 'external_rsh'
    if os.path.exists(testdir):
        shutil.rmtree(testdir, onerror=onerror)
    os.mkdir(testdir)
    os.chdir(testdir)
    factory = None
    try:
        # Try to set command line on remote ExternalCode instance.
        typname = 'openmdao.lib.components.external_code.ExternalCode'
        factory = ObjServerFactory(allowed_types=[typname])
        exec_comp = factory.create(typname)
        try:
            exec_comp.command = ['this-should-fail']
        except RemoteError as exc:
            msg = "RoleError: No __setattr__ access to 'command'"
            logging.debug('msg: %s', msg)
            logging.debug('exc: %s', exc)
            self.assertTrue(msg in str(exc))
        else:
            self.fail('Expected RemoteError')

        exec_comp.set('command', ['this-should-pass'])

        # Try to set via remote-looking access.
        creds = get_credentials()
        creds.client_creds = Credentials()
        logging.debug(' using %s', creds)
        try:
            exec_comp.set('command', ['this-should-fail'])
        except RemoteError as exc:
            fragment = ": 'command' may not be set() remotely"
            if fragment not in str(exc):
                self.fail('%s not in %s' % (fragment, exc))
        finally:
            creds.client_creds = None
    finally:
        if factory is not None:
            factory.cleanup()
        os.chdir('..')
        if sys.platform == 'win32':
            time.sleep(2)  # Wait for process shutdown.
        keep_dirs = int(os.environ.get('OPENMDAO_KEEPDIRS', '0'))
        if not keep_dirs:
            shutil.rmtree(testdir, onerror=onerror)
def test_rsh(self):
    logging.debug("")
    logging.debug("test_rsh")

    testdir = "external_rsh"
    if os.path.exists(testdir):
        shutil.rmtree(testdir)
    os.mkdir(testdir)
    os.chdir(testdir)
    factory = None
    try:
        # Try to set command line on remote ExternalCode instance.
        typname = "openmdao.lib.components.external_code.ExternalCode"
        factory = ObjServerFactory(allowed_types=[typname])
        exec_comp = factory.create(typname)
        cmd = exec_comp.command
        try:
            exec_comp.command = ["this-should-fail"]
        except RemoteError as exc:
            msg = "RoleError: No __setattr__ access to 'command'"
            logging.debug("msg: %s", msg)
            logging.debug("exc: %s", exc)
            self.assertTrue(msg in str(exc))
        else:
            self.fail("Expected RemoteError")

        exec_comp.set("command", ["this-should-pass"])

        # Try to set via remote-looking access.
        creds = get_credentials()
        creds.client_creds = Credentials()
        logging.debug(" using %s", creds)
        try:
            code = "exec_comp.set('command', ['this-should-fail'])"
            assert_raises(self, code, globals(), locals(), RuntimeError,
                          ": 'command' may not be set() remotely")
        finally:
            creds.client_creds = None
    finally:
        if factory is not None:
            factory.cleanup()
        os.chdir("..")
        if sys.platform == "win32":
            time.sleep(2)  # Wait for process shutdown.
        keep_dirs = int(os.environ.get("OPENMDAO_KEEPDIRS", "0"))
        if not keep_dirs:
            shutil.rmtree(testdir)
def start_factory(self, port=None, allowed_users=None):
    """ Start each factory process in a unique directory. """
    global _SERVER_ID
    _SERVER_ID += 1

    server_dir = 'Factory_%d' % _SERVER_ID
    if os.path.exists(server_dir):
        shutil.rmtree(server_dir)
    os.mkdir(server_dir)
    os.chdir(server_dir)
    self.server_dirs.append(server_dir)
    try:
        logging.debug('')
        logging.debug('tester pid: %s', os.getpid())
        logging.debug('starting server...')

        if port is None:
            # Exercise both AF_INET and AF_UNIX/AF_PIPE.
            port = -1 if _SERVER_ID & 1 else 0

        if allowed_users is None:
            credentials = get_credentials()
            allowed_users = {credentials.user: credentials.public_key}

        allowed_types = ['openmdao.main.test.test_distsim.HollowSphere',
                         'openmdao.main.test.test_distsim.Box',
                         'openmdao.main.test.test_distsim.ProtectedBox']

        server, server_cfg = start_server(port=port,
                                          allowed_users=allowed_users,
                                          allowed_types=allowed_types,
                                          log_prefix=server_dir)
        self.servers.append(server)
        cfg = read_server_config(server_cfg)
        self.address = cfg['address']
        self.port = cfg['port']
        self.tunnel = cfg['tunnel']
        self.key = cfg['key']
        logging.debug('server pid: %s', server.pid)
        logging.debug('server address: %s', self.address)
        logging.debug('server port: %s', self.port)
        logging.debug('server key: %s', self.key)
    finally:
        os.chdir('..')

    factory = connect(self.address, self.port, self.tunnel, pubkey=self.key)
    self.factories.append(factory)
    logging.debug('factory: %r', factory)
    return factory
def start_factory(self, port=None, allowed_users=None):
    """ Start each factory process in a unique directory. """
    global _SERVER_ID
    _SERVER_ID += 1

    server_dir = "Factory_%d" % _SERVER_ID
    if os.path.exists(server_dir):
        shutil.rmtree(server_dir)
    os.mkdir(server_dir)
    os.chdir(server_dir)
    self.server_dirs.append(server_dir)
    try:
        logging.debug("")
        logging.debug("tester pid: %s", os.getpid())
        logging.debug("starting server...")

        if port is None:
            # Exercise both AF_INET and AF_UNIX/AF_PIPE.
            port = -1 if _SERVER_ID & 1 else 0

        if allowed_users is None:
            credentials = get_credentials()
            allowed_users = {credentials.user: credentials.public_key}

        allowed_types = [
            "openmdao.main.test.test_distsim.HollowSphere",
            "openmdao.main.test.test_distsim.Box",
            "openmdao.main.test.test_distsim.ProtectedBox",
        ]

        server, server_cfg = start_server(port=port,
                                          allowed_users=allowed_users,
                                          allowed_types=allowed_types)
        self.servers.append(server)
        cfg = read_server_config(server_cfg)
        self.address = cfg["address"]
        self.port = cfg["port"]
        self.tunnel = cfg["tunnel"]
        self.key = cfg["key"]
        logging.debug("server pid: %s", server.pid)
        logging.debug("server address: %s", self.address)
        logging.debug("server port: %s", self.port)
        logging.debug("server key: %s", self.key)
    finally:
        os.chdir("..")

    factory = connect(self.address, self.port, self.tunnel, pubkey=self.key)
    self.factories.append(factory)
    logging.debug("factory: %r", factory)
    return factory
def release(self, server):
    """
    Shut down :class:`ObjServer` `server`.

    server: :class:`ObjServer`
        Server to be shut down.
    """
    try:
        address = server._token.address
    except AttributeError:
        address = 'not-a-proxy'
    self._logger.debug('release %r', server)
    self._logger.debug(' at %r', address)
    try:
        manager, root_dir, owner = self._managers[server]
    except KeyError:
        # Not identical to any of our proxies.
        # Could still be a reference to the same remote object.
        try:
            server_host = server.host
            server_pid = server.pid
        except Exception:
            self._logger.error("release: can't identify server at %r",
                               address)
            raise ValueError("can't identify server at %r" % (address,))

        for key in self._managers.keys():
            if key.host == server_host and key.pid == server_pid:
                manager, root_dir, owner = self._managers[key]
                server = key
                break
        else:
            self._logger.error('release: server %r not found', server)
            for key in self._managers.keys():
                self._logger.debug(' %r', key)
                self._logger.debug(' at %r', key._token.address)
            raise ValueError('server %r not found' % server)

    if get_credentials().user != owner.user:
        raise RoleError('only the owner can release')

    manager.shutdown()
    server._close.cancel()
    del self._managers[server]
    keep_dirs = int(os.environ.get('OPENMDAO_KEEPDIRS', '0'))
    if not keep_dirs and os.path.exists(root_dir):
        shutil.rmtree(root_dir)
def load_model(self, egg_filename):
    """
    Load model from egg and return top-level object if this server's
    `allow_shell` attribute is True.

    egg_filename: string
        Filename of egg to be loaded.
    """
    self._logger.debug('load_model %r', egg_filename)
    if not self._allow_shell:
        self._logger.error('attempt to load %r by %r', egg_filename,
                           get_credentials().user)
        raise RuntimeError('shell access is not allowed by this server')
    self._check_path(egg_filename, 'load_model')
    if self.tlo:
        self.tlo.pre_delete()
    self.tlo = Container.load_from_eggfile(egg_filename, log=self._logger)
    return self.tlo
def deploy(self, name, resource_desc, criteria):
    """
    Deploy a server suitable for `resource_desc`.
    Returns a proxy to the deployed server.

    name: string
        Name for server.

    resource_desc: dict
        Description of required resources.

    criteria: dict
        The dictionary returned by :meth:`time_estimate`.
    """
    hostnames = []
    n_cpus = resource_desc['min_cpus']
    nh = 0
    for worker in self.workers:
        if nh == n_cpus:
            break
        if worker['state'] == 1:
            worker['state'] = 0
            hostnames.append(worker['hostname'])
            nh += 1
    print 'allocating hosts', hostnames

    credentials = get_credentials()
    allowed_users = {credentials.user: credentials.public_key}
    try:
        server = self.factory.create(typname='',
                                     allowed_users=allowed_users,
                                     name=name)
        # Overwrite the server's host list with the assigned hosts.
        server.host = hostnames[0]
        server.mpi_resources = hostnames
        return server
    # Shouldn't happen...
    except Exception as exc:  # pragma no cover
        self._logger.error('create failed: %r', exc)
        return None
def execute_command(self, command, stdin, stdout, stderr, env_vars,
                    poll_delay, timeout):
    """
    Run `command` in a subprocess if this server's `allow_shell`
    attribute is True.

    command: string
        Command line to be executed.

    stdin, stdout, stderr: string
        Filenames for the corresponding stream.

    env_vars: dict
        Environment variables for the command.

    poll_delay: float (seconds)
        Delay between polling subprocess for completion.

    timeout: float (seconds)
        Maximum time to wait for command completion. A value of zero
        implies no timeout.
    """
    self._logger.debug('execute_command %r', command)
    if not self._allow_shell:
        self._logger.error('attempt to execute %r by %r', command,
                           get_credentials().user)
        raise RuntimeError('shell access is not allowed by this server')

    for arg in (stdin, stdout, stderr):
        if isinstance(arg, basestring):
            self._check_path(arg, 'execute_command')
    try:
        process = ShellProc(command, stdin, stdout, stderr, env_vars)
    except Exception as exc:
        self._logger.error('exception creating process: %s', exc)
        raise

    self._logger.debug(' PID = %d', process.pid)
    return_code, error_msg = process.wait(poll_delay, timeout)
    self._logger.debug(' returning %s', (return_code, error_msg))
    return (return_code, error_msg)
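# Hedged usage sketch for the stream-based execute_command() above; the
# command and filenames are illustrative, and 'server' is a hypothetical
# ObjServer with allow_shell enabled.
def _example_execute_command(server):
    return_code, error_msg = server.execute_command(
        'echo hello', 'input.txt', 'output.txt', 'error.txt',
        env_vars={}, poll_delay=1., timeout=30.)
    if return_code:
        raise RuntimeError('command failed: %s' % error_msg)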
def deploy(self, name, resource_desc, criteria):
    """
    Deploy a server suitable for `resource_desc`.
    Returns a proxy to the deployed server.

    name: string
        Name for server.

    resource_desc: dict
        Description of required resources.

    criteria: dict
        The dictionary returned by :meth:`time_estimate`.
    """
    credentials = get_credentials()
    allowed_users = {credentials.user: credentials.public_key}
    try:
        return self.create(typname="", allowed_users=allowed_users,
                           name=name)
    # Shouldn't happen...
    except Exception as exc:  # pragma no cover
        self._logger.error("create failed: %r", exc)
        return None
def start_manager(self, index, authkey, address, files, allow_shell=False):
    """
    Launch remote manager process via `ssh`.
    The environment variable ``OPENMDAO_KEEPDIRS`` can be used to avoid
    removal of the temporary directory used on the host.

    index: int
        Index in parent cluster.

    authkey: string
        Authorization key used to connect to host server.

    address: (ip_addr, port) or string referring to pipe.
        Address to use to connect back to parent.

    files: list(string)
        Files to be sent to support server startup.

    allow_shell: bool
        If True, :meth:`execute_command` and :meth:`load_model` are
        allowed in created servers. Use with caution!
    """
    try:
        self._check_ssh()
    except RuntimeError:
        self.state = 'failed'
        return

    self.tempdir = self._copy_to_remote(files)
    if not self.tempdir:
        self.state = 'failed'
        return
    _LOGGER.debug('startup files copied to %s:%s',
                  self.hostname, self.tempdir)

    if self.tunnel_incoming:
        _LOGGER.debug('setup reverse tunnel from %s to %s:%s',
                      self.hostname, address[0], address[1])
        address, cleanup = \
            setup_reverse_tunnel(self.hostname, address[0], address[1],
                                 identity=self.identity_filename)
        self.reverse_cleanup = cleanup

    cmd = self._ssh_cmd()
    cmd.extend([self.hostname, self.python, '-c',
                '"import sys;'
                ' sys.path.append(\'.\');'
                ' import os;'
                ' os.chdir(\'%s\');'
                ' from mp_distributing import main;'
                ' main()"' % self.tempdir])
    self.proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

    credentials = get_credentials()
    allowed_users = {credentials.user: credentials.public_key}

    # Tell the server what name to bind to
    # (in case it has multiple interfaces).
    user, remote_name = self.hostname.split('@')

    data = dict(
        name='BoostrappingHost', index=index, hostname=remote_name,
        # Avoid lots of SUBDEBUG messages.
        dist_log_level=max(_LOGGER.getEffectiveLevel(), logging.DEBUG),
        dir=self.tempdir, authkey=str(authkey),
        allowed_users=allowed_users, allow_shell=allow_shell,
        allow_tunneling=self.tunnel_outgoing, parent_address=address,
        registry=self.registry,
        keep_dirs=os.environ.get('OPENMDAO_KEEPDIRS', '0'))

    # Windows can't handle binary on stdin.
    dump = cPickle.dumps(data, cPickle.HIGHEST_PROTOCOL)
    dump = base64.b64encode(dump)
    _LOGGER.debug('sending %s config info (%s)', self.hostname, len(dump))
    self.proc.stdin.write(dump)
    self.proc.stdin.close()

    time.sleep(1)  # Give the proc time to register startup problems.
    self.poll()
    if self.state != 'failed':
        self.state = 'started'
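# Counterpart sketch of the config handshake performed by start_manager()
# above: the remote bootstrap is assumed to read one base64-encoded pickle
# from stdin. A sketch only; the real reader lives in mp_distributing.main().
def _example_read_config():
    import base64
    import cPickle
    import sys
    data = cPickle.loads(base64.b64decode(sys.stdin.read()))
    return data['authkey'], data['parent_address']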
def test_4_authkey(self):
    logging.debug('')
    logging.debug('test_authkey')

    factory = self.start_factory()

    # Start server in non-public-key mode.
def test_3_access(self):
    logging.debug('')
    logging.debug('test_access')

    # This 'spook' creation is only for testing.
    # Normally the protector would run with regular credentials
    # in effect at the proprietary site.
    user = '******' + socket.gethostname()
    key_pair = get_key_pair(user)
    data = '\n'.join([user, '0', key_pair.publickey().exportKey()])
    hash = hashlib.sha256(data).digest()
    signature = key_pair.sign(hash, get_random_bytes)
    spook = Credentials((data, signature, None))

    credentials = get_credentials()
    allowed_users = {credentials.user: credentials.public_key,
                     spook.user: spook.public_key}
    factory = self.start_factory(allowed_users=allowed_users)

    # Create model and run it.
    saved = get_credentials()
    set_credentials(spook)
    box = factory.create(_MODULE + '.ProtectedBox',
                         allowed_users=allowed_users)
    set_credentials(saved)

    model = set_as_top(Model(box))
    model.run()

    # Check results.
    for width in range(1, 2):
        for height in range(1, 3):
            for depth in range(1, 4):
                case = model.driver.recorder.cases.pop(0)
                self.assertEqual(case.outputs[0][2], width * height * depth)

    # Check access protections.
    try:
        i = model.box.secret
    except RemoteError as exc:
        msg = "RoleError: No __getattribute__ access to 'secret' by role 'user'"
        logging.debug('msg: %s', msg)
        logging.debug('exc: %s', exc)
        self.assertTrue(msg in str(exc))
    else:
        self.fail('Expected RemoteError')

    try:
        model.box.proprietary_method()
    except RemoteError as exc:
        msg = "RoleError: proprietary_method(): No access for role 'user'"
        logging.debug('msg: %s', msg)
        logging.debug('exc: %s', exc)
        self.assertTrue(msg in str(exc))
    else:
        self.fail('Expected RemoteError')

    saved = get_credentials()
    set_credentials(spook)
    try:
        i = model.box.secret
        model.box.proprietary_method()
    finally:
        # Reset credentials to allow factory shutdown.
        set_credentials(saved)
def create(self, typname, version=None, server=None,
           res_desc=None, **ctor_args):
    """
    Create a new `typname` object in `server` or a new
    :class:`ObjectServer`. Returns a proxy for the new object.
    Starts servers in a subdirectory of the current directory.

    typname: string
        Type of object to create. If null, then a proxy for the new
        :class:`ObjServer` is returned.

    version: string or None
        Version of `typname` to create.

    server: proxy
        :class:`ObjServer` on which to create `typname`.
        If none, then a new server is created.

    res_desc: dict or None
        Required resources. Currently not used.

    ctor_args: dict
        Other constructor arguments. If `name` or `allowed_users` are
        specified, they are used when creating the :class:`ObjServer`.
        If no `allowed_users` are specified, the server is private to
        the current user.
    """
    self._logger.info('create typname %r, version %r server %s,'
                      ' res_desc %s, args %s', typname, version, server,
                      res_desc, ctor_args)

    if server is None:
        name = ctor_args.get('name', '')
        if not name:
            name = 'Server_%d' % (len(self._managers) + 1)

        allowed_users = ctor_args.get('allowed_users')
        if not allowed_users:
            credentials = get_credentials()
            allowed_users = {credentials.user: credentials.public_key}
        else:
            del ctor_args['allowed_users']

        if self._address is None or \
           isinstance(self._address, basestring) or \
           self._allow_tunneling:
            # Local access only via pipe if factory accessed by pipe
            # or factory is accessed via tunnel.
            address = None
        else:
            # Network access via same IP as factory, system-selected port.
            address = (self._address[0], 0)

        manager = self.manager_class(address, self._authkey, name=name,
                                     allowed_users=allowed_users)

        root_dir = name
        count = 1
        while os.path.exists(root_dir):
            count += 1
            root_dir = '%s_%d' % (name, count)
        os.mkdir(root_dir)

        # On Windows, when running the full test suite under Nose,
        # starting the process starts a new Nose test session, which
        # will eventually get here and start a new Nose session, which...
        orig_main = None
        if sys.platform == 'win32':  # pragma no cover
            scripts = ('openmdao-script.py', 'openmdao_test-script.py')
            try:
                main_file = sys.modules['__main__'].__file__
            except AttributeError:
                pass
            else:
                if main_file.endswith(scripts):
                    orig_main = main_file
                    sys.modules['__main__'].__file__ = \
                        pkg_resources.resource_filename('openmdao.main',
                                                        'objserverfactory.py')

        owner = get_credentials()
        self._logger.log(LOG_DEBUG2, '%s starting server %r in dir %s',
                         owner, name, root_dir)
        try:
            manager.start(cwd=root_dir,
                          log_level=self._logger.getEffectiveLevel())
        finally:
            if orig_main is not None:  # pragma no cover
                sys.modules['__main__'].__file__ = orig_main

        self._logger.info('new server %r for %s', name, owner)
        self._logger.info(' in dir %s', root_dir)
        self._logger.info(' listening on %s', manager.address)
        server_class = getattr(manager, self.server_classname)
        server = server_class(name=name, allow_shell=self._allow_shell,
                              allowed_types=self._allowed_types)
        self._managers[server] = (manager, root_dir, owner)

    if typname:
        obj = server.create(typname, version, None, res_desc, **ctor_args)
    else:
        obj = server

    self._logger.log(LOG_DEBUG2, 'create returning %r at %r',
                     obj, obj._token.address)
    return obj
def start(self):
    """ Start this manager and all remote managers. """
    super(Cluster, self).start()
    hostname = socket.getfqdn()
    listener = connection.Listener(address=(hostname, 0),
                                   authkey=self._authkey,
                                   backlog=5)  # Default is 1.
    # TODO: support multiple addresses if multiple networks are attached.

    # Start managers in separate thread to avoid losing connections.
    starter = threading.Thread(target=self._start_hosts,
                               args=(listener.address, get_credentials()))
    starter.daemon = True
    starter.start()

    # Accept callback connections from started managers.
    waiting = ['']
    retry = 0
    while waiting:
        host_processed = False
        for host in self._hostlist:
            host.poll()
            if host.state == 'started':
                # Accept connection from *any* host.
                _LOGGER.debug('waiting for a connection, host %s',
                              host.hostname)
                # This will hang if server doesn't receive our address.
                conn = listener.accept()
                i, address, pubkey_text = conn.recv()
                conn.close()
                other_host = self._hostlist[i]
                if address is None:
                    _LOGGER.error('Host %s died: %s', other_host.hostname,
                                  pubkey_text)  # Exception text.
                    continue
                other_host.manager = HostManager.from_address(address,
                                                              self._authkey)
                other_host.state = 'up'
                if pubkey_text:
                    other_host.manager._pubkey = \
                        decode_public_key(pubkey_text)
                host_processed = True
                _LOGGER.debug('Host %s is now up', other_host.hostname)
                self._up.append(other_host)

        # See if there are still hosts to wait for.
        waiting = []
        for host in self._hostlist:
            host.poll()
            if host.state == 'init' or host.state == 'started':
                waiting.append(host)
        if waiting:
            if not host_processed:
                retry += 1
                if retry < 600:  # ~60 seconds.
                    time.sleep(0.1)
                else:
                    _LOGGER.warning('Cluster startup timeout,'
                                    ' hosts not started:')
                    for host in waiting:
                        _LOGGER.warning(' %s (%s) in dir %s',
                                        host.hostname, host.state,
                                        host.tempdir)
                    break
        else:
            break

    self._up = sorted(self._up, key=lambda host: host.hostname)

    self._base_shutdown = self.shutdown
    del self.shutdown
def execute_command(self, resource_desc):
    """
    Run command described by `resource_desc` in a subprocess if this
    server's `allow_shell` attribute is True.

    resource_desc: dict
        Contains job description.

    The current environment, along with any 'job_environment'
    specification, is in effect while running 'remote_command'.

    If 'input_path' is not specified, ``/dev/null`` or ``nul:`` is used.
    If 'output_path' is not specified, ``<remote_command>.stdout`` is used.
    If neither 'error_path' nor 'join_files' is specified,
    ``<remote_command>.stderr`` is used.

    If specified in the 'resource_limits' dictionary, 'wallclock_time' is
    used as a timeout.

    All other queuing resource keys are ignored. The ``HOME_DIRECTORY``
    and ``WORKING_DIRECTORY`` placeholders are ignored.
    """
    try:
        job_name = resource_desc['job_name']
    except KeyError:
        job_name = ''

    command = resource_desc['remote_command']
    self._check_path(command, 'execute_command')
    base = os.path.basename(command)
    command = [command]
    if 'args' in resource_desc:
        command.extend(resource_desc['args'])

    self._logger.debug('execute_command %s %r', job_name, command)
    if not self._allow_shell:
        self._logger.error('attempt to execute %r by %r', command,
                           get_credentials().user)
        raise RuntimeError('shell access is not allowed by this server')

    env_vars = resource_desc.get('job_environment')

    try:
        stdin = resource_desc['input_path']
        self._check_path(stdin, 'execute_command')
    except KeyError:
        stdin = DEV_NULL

    try:
        stdout = resource_desc['output_path']
        self._check_path(stdout, 'execute_command')
    except KeyError:
        stdout = base + '.stdout'

    try:
        stderr = resource_desc['error_path']
        self._check_path(stderr, 'execute_command')
    except KeyError:
        try:
            join_files = resource_desc['join_files']
        except KeyError:
            stderr = base + '.stderr'
        else:
            stderr = STDOUT if join_files else base + '.stderr'

    limits = resource_desc.get('resource_limits', {})
    timeout = limits.get('wallclock_time', 0)
    poll_delay = 1

    try:
        process = ShellProc(command, stdin, stdout, stderr, env_vars)
    except Exception as exc:
        self._logger.error('exception creating process: %s', exc)
        raise

    self._logger.debug(' PID = %d', process.pid)
    return_code, error_msg = process.wait(poll_delay, timeout)
    self._logger.debug(' returning %s', (return_code, error_msg))
    return (return_code, error_msg)
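# Example job description for the resource_desc form of execute_command()
# above, using only keys named in its docstring; the command and values are
# illustrative, and 'server' is a hypothetical ObjServer with allow_shell
# enabled.
def _example_execute_job(server):
    resource_desc = {
        'job_name': 'demo',
        'remote_command': 'mycode',                  # hypothetical executable
        'args': ['-v'],
        'job_environment': {'OMP_NUM_THREADS': '2'},
        'join_files': True,                          # merge stderr into stdout
        'resource_limits': {'wallclock_time': 600},  # used as timeout
    }
    return server.execute_command(resource_desc)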
def start(self):
    """
    Start this manager and all remote managers.
    If some managers fail to start, errors are logged and the
    corresponding host's state is set to ``failed``. You can use
    ``len(cluster)`` to determine how many remote managers are available.
    A :class:`RuntimeError` will be raised if no managers were
    successfully started.
    """
    super(Cluster, self).start()
    listener = connection.Listener(address=(self._hostname, 0),
                                   authkey=self._authkey,
                                   backlog=5)  # Default is 1.

    # Start managers in separate thread to avoid losing connections.
    starter = threading.Thread(target=self._start_hosts,
                               args=(listener.address, get_credentials()))
    starter.daemon = True
    starter.start()

    # Accept callback connections from started managers.
    waiting = ['']
    retry = 0
    while waiting:
        host_processed = False
        for host in self._hostlist:
            host.poll()
            if host.state == 'started':
                # Accept connection from *any* host.
                _LOGGER.debug('waiting for a connection, host %s',
                              host.hostname)
                # Normal accept() can hang.
                retval = []
                accepter = threading.Thread(target=self._accept,
                                            args=(listener, retval),
                                            name='ClusterAccepter')
                accepter.daemon = True
                accepter.start()
                accepter.join(30)
                if accepter.is_alive():
                    msg = 'timeout waiting for reply from %s' \
                          % [host.hostname for host in self._hostlist
                             if host.state == 'started']
                    _LOGGER.error(msg)
                    for host in self._hostlist:
                        if host.state == 'started':
                            if host.proc is not None:
                                host.proc.terminate()
                            if host.reverse_cleanup is not None:
                                host.reverse_cleanup[0](
                                    *host.reverse_cleanup[1:])
                            host.state = 'failed'
                    continue

                conn = retval[0]
                i, address, pubkey_text = conn.recv()
                conn.close()
                other_host = self._hostlist[i]
                if address is None:
                    _LOGGER.error('Host %s died: %s', other_host.hostname,
                                  pubkey_text)  # Exception text.
                    other_host.state = 'failed'
                    continue
                try:
                    other_host.manager = \
                        HostManager.from_address(address, self._authkey,
                                                 other_host)
                except Exception as exc:
                    _LOGGER.error("Can't start manager for %s: %s",
                                  other_host.hostname,
                                  str(exc) or repr(exc))
                    if other_host.proc is not None:
                        other_host.proc.terminate()
                    other_host.state = 'failed'
                    continue
                else:
                    other_host.state = 'up'
                if pubkey_text:
                    other_host.manager._pubkey = \
                        decode_public_key(pubkey_text)
                host_processed = True
                _LOGGER.debug('Host %s is now up', other_host.hostname)
                self._up.append(other_host)

        # See if there are still hosts to wait for.
        waiting = []
        for host in self._hostlist:
            host.poll()
            if host.state == 'init' or host.state == 'started':
                waiting.append(host)
        if waiting:
            if not host_processed:
                retry += 1
                if retry < 300:  # ~60 seconds.
                    time.sleep(0.2)
                else:
                    _LOGGER.warning('Cluster startup timeout,'
                                    ' hosts not started:')
                    for host in waiting:
                        _LOGGER.warning(' %s (%s) in dir %s',
                                        host.hostname, host.state,
                                        host.tempdir)
                    break
        else:
            break

    self._up = sorted(self._up, key=lambda host: host.hostname)

    # So our class defined shutdown() is called before the superclass
    # installed shutdown().
    self._base_shutdown = self.shutdown
    del self.shutdown

    if len(self._up) < 1:
        raise RuntimeError('No hosts successfully started')
def time_estimate(self, resource_desc):
    """
    Returns ``(estimate, criteria)`` indicating how well this allocator
    can satisfy the `resource_desc` request. The estimate will be:

    - >0 for an estimate of walltime (seconds).
    -  0 for no estimate.
    - -1 for no resource at this time.
    - -2 for no support for `resource_desc`.

    The returned criteria is a dictionary containing information related
    to the estimate, such as hostnames, load averages, unsupported
    resources, etc.

    This allocator polls each :class:`LocalAllocator` in the cluster to
    find the best match and returns that. The best allocator is saved in
    the returned criteria for a subsequent :meth:`deploy`.

    resource_desc: dict
        Description of required resources.
    """
    credentials = get_credentials()

    key = "allocator"
    value = resource_desc.get(key, "")
    if value:
        if self.name != value:
            return (-2, {key: value})
        else:
            # Any host in our cluster is OK.
            resource_desc = resource_desc.copy()
            del resource_desc[key]

    n_cpus = resource_desc.get("n_cpus", 0)
    if n_cpus:
        # Spread across LocalAllocators.
        resource_desc = resource_desc.copy()
        resource_desc["n_cpus"] = 1

    with self._lock:
        best_estimate = -2
        best_criteria = None
        best_allocator = None

        # Prefer not to repeat use of just-used allocator.
        prev_estimate = -2
        prev_criteria = None
        prev_allocator = self._last_deployed
        self._last_deployed = None

        # Drain _reply_q.
        while True:
            try:
                self._reply_q.get_nowait()
            except Queue.Empty:
                break

        # Get estimates via worker threads.
        todo = []
        max_workers = 10
        for i, allocator in enumerate(self._allocators.values()):
            if i < max_workers:
                worker_q = WorkerPool.get()
                worker_q.put((self._get_estimate,
                              (allocator, resource_desc, credentials),
                              {}, self._reply_q))
            else:
                todo.append(allocator)

        # Process estimates.
        host_loads = []  # Sorted list of (hostname, load)
        for i in range(len(self._allocators)):
            worker_q, retval, exc, trace = self._reply_q.get()
            if exc:
                self._logger.error(trace)
                retval = None

            try:
                next_allocator = todo.pop(0)
            except IndexError:
                WorkerPool.release(worker_q)
            else:
                worker_q.put((self._get_estimate,
                              (next_allocator, resource_desc, credentials),
                              {}, self._reply_q))

            if retval is None:
                continue
            allocator, estimate, criteria = retval
            if estimate is None:
                continue

            # Update loads.
            if estimate >= 0 and n_cpus:
                load = criteria["loadavgs"][0]
                new_info = (criteria["hostnames"][0], load)
                if host_loads:
                    for i, info in enumerate(host_loads):
                        if load < info[1]:
                            host_loads.insert(i, new_info)
                            break
                    else:
                        host_loads.append(new_info)
                else:
                    host_loads.append(new_info)

            # Update best estimate.
            if allocator is prev_allocator:
                prev_estimate = estimate
                prev_criteria = criteria
            elif (best_estimate <= 0 and estimate > best_estimate) or \
                 (best_estimate > 0 and estimate < best_estimate):
                best_estimate = estimate
                best_criteria = criteria
                best_allocator = allocator
            elif best_estimate == 0 and estimate == 0:
                best_load = best_criteria["loadavgs"][0]
                load = criteria["loadavgs"][0]
                if load < best_load:
                    best_estimate = estimate
                    best_criteria = criteria
                    best_allocator = allocator

        # If no alternative, repeat use of previous allocator.
        if best_estimate < 0 and prev_estimate >= 0:
            best_estimate = prev_estimate
            best_criteria = prev_criteria
            best_allocator = prev_allocator

        # Save best allocator in criteria in case we're asked to deploy.
        if best_criteria is not None:
            best_criteria["allocator"] = best_allocator
            # Save n_cpus hostnames in criteria.
            best_criteria["hostnames"] = \
                [host_loads[i][0]
                 for i in range(min(n_cpus, len(host_loads)))]

        return (best_estimate, best_criteria)
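# Sketch of how a caller might act on time_estimate()'s return value,
# following the estimate conventions listed in its docstring; 'cluster' is a
# hypothetical ClusterAllocator-like instance.
def _example_time_estimate(cluster):
    resource_desc = {'n_cpus': 2}
    estimate, criteria = cluster.time_estimate(resource_desc)
    if estimate == -2:
        return None           # request can never be satisfied here
    elif estimate == -1:
        return None           # no resource right now; retry later
    else:                     # 0 (no estimate) or walltime in seconds
        return cluster.deploy('demo_server', resource_desc, criteria)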
def create(self, typname, version=None, server=None,
           res_desc=None, **ctor_args):
    """
    Create a new `typname` object in `server` or a new
    :class:`ObjectServer`. Returns a proxy for the new object.
    Starts servers in a subdirectory of the current directory.

    typname: string
        Type of object to create. If null, then a proxy for the new
        :class:`ObjServer` is returned.

    version: string or None
        Version of `typname` to create.

    server: proxy
        :class:`ObjServer` on which to create `typname`.
        If none, then a new server is created.

    res_desc: dict or None
        Required resources. ``working_directory`` is used to set a
        created server's directory; other keys are ignored. If
        `allow_shell` has been set, then an absolute directory reference
        may be used (including '~' expansion). If not, then the reference
        must be relative and the working directory will be relative to
        the factory's directory. If the directory already exists, a new
        name will be used of the form ``<directory>_N``.

    ctor_args: dict
        Other constructor arguments. If `name` or `allowed_users` are
        specified, they are used when creating the :class:`ObjServer`.
        If no `allowed_users` are specified, the server is private to
        the current user.
    """
    self._logger.info('create typname %r, version %r server %s,'
                      ' res_desc %s, args %s', typname, version, server,
                      res_desc, ctor_args)

    if server is None:
        name = ctor_args.get('name', '')
        if not name:
            name = 'Server_%d' % (len(self._managers) + 1)

        allowed_users = ctor_args.get('allowed_users')
        if not allowed_users:
            credentials = get_credentials()
            allowed_users = {credentials.user: credentials.public_key}
        else:
            del ctor_args['allowed_users']

        if self._address is None or \
           isinstance(self._address, basestring) or \
           self._allow_tunneling:
            # Local access only via pipe if factory accessed by pipe
            # or factory is accessed via tunnel.
            address = None
        else:
            # Network access via same IP as factory, system-selected port.
            address = (self._address[0], 0)

        manager = self.manager_class(address, self._authkey, name=name,
                                     allowed_users=allowed_users)

        # Set (unique) working directory of server.
        # Server cleanup removes this directory, so we avoid any
        # existing directory to not delete existing files.
        base = None
        if res_desc is not None:
            base = res_desc.get('working_directory')
            if base:
                if self._allow_shell:  # Absolute allowed.
                    base = os.path.expanduser(base)
                elif os.path.isabs(base) or base.startswith('..'):
                    raise ValueError('working_directory %r must be'
                                     ' subdirectory' % base)
                res_desc = res_desc.copy()
                del res_desc['working_directory']
        if not base:
            base = name
        count = 1
        root_dir = base
        while os.path.exists(root_dir):
            count += 1
            root_dir = '%s_%d' % (base, count)
        os.mkdir(root_dir)

        # On Windows, when running the full test suite under Nose,
        # starting the process starts a new Nose test session, which
        # will eventually get here and start a new Nose session, which...
        orig_main = None
        if sys.platform == 'win32':  # pragma no cover
            scripts = ('openmdao-script.py', 'openmdao_test-script.py')
            try:
                main_file = sys.modules['__main__'].__file__
            except AttributeError:
                pass
            else:
                if main_file.endswith(scripts):
                    orig_main = main_file
                    sys.modules['__main__'].__file__ = \
                        pkg_resources.resource_filename('openmdao.main',
                                                        'objserverfactory.py')

        owner = get_credentials()
        self._logger.log(LOG_DEBUG2, '%s starting server %r in dir %s',
                         owner, name, root_dir)
        try:
            manager.start(cwd=root_dir,
                          log_level=self._logger.getEffectiveLevel())
        finally:
            if orig_main is not None:  # pragma no cover
                sys.modules['__main__'].__file__ = orig_main

        self._logger.info('new server %r for %s', name, owner)
        self._logger.info(' in dir %s', root_dir)
        self._logger.info(' listening on %s', manager.address)
        server_class = getattr(manager, self.server_classname)
        server = server_class(name=name, allow_shell=self._allow_shell,
                              allowed_types=self._allowed_types)
        self._managers[server] = (manager, root_dir, owner)

    if typname:
        obj = server.create(typname, version, None, res_desc, **ctor_args)
    else:
        obj = server

    self._logger.log(LOG_DEBUG2, 'create returning %r at %r',
                     obj, obj._token.address)
    return obj
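# Hedged usage sketch tying create() and release() together: start a private
# server (empty typname returns an ObjServer proxy), use it, then shut it
# down. ObjServerFactory's default constructor arguments are assumed.
def _example_factory_roundtrip():
    factory = ObjServerFactory()
    server = factory.create('')          # proxy for a new ObjServer
    try:
        logging.debug('server %s:%s', server.host, server.pid)
    finally:
        factory.release(server)          # owner-only, per release() above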
def start_manager(self, index, authkey, address, files, allow_shell=False):
    """
    Launch remote manager process via `ssh`.
    The environment variable ``OPENMDAO_KEEPDIRS`` can be used to avoid
    removal of the temporary directory used on the host.

    index: int
        Index in parent cluster.

    authkey: string
        Authorization key used to connect to host server.

    address: (ip_addr, port) or string referring to pipe.
        Address to use to connect back to parent.

    files: list(string)
        Files to be sent to support server startup.

    allow_shell: bool
        If True, :meth:`execute_command` and :meth:`load_model` are
        allowed in created servers. Use with caution!
    """
    try:
        _check_ssh(self.hostname)
    except Exception:
        self.state = 'failed'
        return

    self.tempdir = _copy_to_remote(self.hostname, files, self.python)
    _LOGGER.debug('startup files copied to %s:%s',
                  self.hostname, self.tempdir)

    cmd = copy.copy(_SSH)
    cmd.extend([self.hostname, self.python, '-c',
                '"import sys;'
                ' sys.path.append(\'.\');'
                ' import os;'
                ' os.chdir(\'%s\');'
                ' from mp_distributing import main;'
                ' main()"' % self.tempdir])
    self.proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

    credentials = get_credentials()
    allowed_users = {credentials.user: credentials.public_key}

    data = dict(
        name='BoostrappingHost', index=index,
        # Avoid lots of SUBDEBUG messages.
        dist_log_level=max(_LOGGER.getEffectiveLevel(), logging.DEBUG),
        dir=self.tempdir, authkey=str(authkey),
        allowed_users=allowed_users, allow_shell=allow_shell,
        parent_address=address, registry=self.registry,
        keep_dirs=os.environ.get('OPENMDAO_KEEPDIRS', '0'))

    cPickle.dump(data, self.proc.stdin, cPickle.HIGHEST_PROTOCOL)
    self.proc.stdin.close()

    # TODO: put timeout in accept() to avoid this hack.
    time.sleep(1)  # Give the proc time to register startup problems.
    self.poll()
    if self.state != 'failed':
        self.state = 'started'
def start(self):
    """ Start this manager and all remote managers. """
    super(Cluster, self).start()
    hostname = socket.getfqdn()
    listener = connection.Listener(address=(hostname, 0),
                                   authkey=self._authkey,
                                   backlog=5)  # Default is 1.
    # TODO: support multiple addresses if multiple networks are attached.

    # Start managers in separate thread to avoid losing connections.
    starter = threading.Thread(target=self._start_hosts,
                               args=(listener.address, get_credentials()))
    starter.daemon = True
    starter.start()

    # Accept callback connections from started managers.
    waiting = ['']
    retry = 0
    while waiting:
        host_processed = False
        for host in self._hostlist:
            host.poll()
            if host.state == 'started':
                # Accept connection from *any* host.
                _LOGGER.debug('waiting for a connection, host %s',
                              host.hostname)
                # This will hang if server doesn't receive our address.
                conn = listener.accept()
                i, address, pubkey_text = conn.recv()
                conn.close()
                other_host = self._hostlist[i]
                if address is None:
                    _LOGGER.error('Host %s died: %s', other_host.hostname,
                                  pubkey_text)  # Exception text.
                    continue
                other_host.manager = HostManager.from_address(address,
                                                              self._authkey)
                other_host.state = 'up'
                if pubkey_text:
                    other_host.manager._pubkey = \
                        decode_public_key(pubkey_text)
                host_processed = True
                _LOGGER.debug('Host %s is now up', other_host.hostname)
                self._up.append(other_host)

        # See if there are still hosts to wait for.
        waiting = []
        for host in self._hostlist:
            host.poll()
            if host.state == 'init' or host.state == 'started':
                waiting.append(host)
        if waiting:
            if not host_processed:
                retry += 1
                if retry < 300:  # ~60 seconds.
                    time.sleep(0.2)
                else:
                    _LOGGER.warning('Cluster startup timeout,'
                                    ' hosts not started:')
                    for host in waiting:
                        _LOGGER.warning(' %s (%s) in dir %s',
                                        host.hostname, host.state,
                                        host.tempdir)
                    break
        else:
            break

    self._up = sorted(self._up, key=lambda host: host.hostname)

    self._base_shutdown = self.shutdown
    del self.shutdown
def create(self, typname, version=None, server=None,
           res_desc=None, **ctor_args):
    """
    Create a new `typname` object in `server` or a new
    :class:`ObjectServer`. Returns a proxy for the new object.
    Starts servers in a subdirectory of the current directory.

    typname: string
        Type of object to create. If null, then a proxy for the new
        :class:`ObjServer` is returned.

    version: string or None
        Version of `typname` to create.

    server: proxy
        :class:`ObjServer` on which to create `typname`.
        If none, then a new server is created.

    res_desc: dict or None
        Required resources. Currently not used.

    ctor_args: dict
        Other constructor arguments. If `name` or `allowed_users` are
        specified, they are used when creating the :class:`ObjServer`.
        If no `allowed_users` are specified, the server is private to
        the current user.
    """
    self._logger.info('create typname %r, version %r server %s,'
                      ' res_desc %s, args %s', typname, version, server,
                      res_desc, ctor_args)

    if server is None:
        name = ctor_args.get('name', '')
        if not name:
            name = 'Server_%d' % (len(self._managers) + 1)

        allowed_users = ctor_args.get('allowed_users')
        if not allowed_users:
            credentials = get_credentials()
            allowed_users = {credentials.user: credentials.public_key}
        else:
            del ctor_args['allowed_users']

        if self._address is None or \
           isinstance(self._address, basestring) or \
           self._allow_tunneling:
            # Local access only via pipe if factory accessed by pipe
            # or factory is accessed via tunnel.
            address = None
        else:
            # Network access via same IP as factory, system-selected port.
            address = (self._address[0], 0)

        manager = self.manager_class(address, self._authkey, name=name,
                                     allowed_users=allowed_users)

        root_dir = name
        count = 1
        while os.path.exists(root_dir):
            count += 1
            root_dir = '%s_%d' % (name, count)
        os.mkdir(root_dir)

        # On Windows, when running the full test suite under Nose,
        # starting the process starts a new Nose test session, which
        # will eventually get here and start a new Nose session, which...
        orig_main = None
        if sys.platform == 'win32':  # pragma no cover
            scripts = ('openmdao-script.py', 'openmdao_test-script.py')
            if sys.modules['__main__'].__file__.endswith(scripts):
                orig_main = sys.modules['__main__'].__file__
                sys.modules['__main__'].__file__ = \
                    pkg_resources.resource_filename('openmdao.main',
                                                    'objserverfactory.py')

        owner = get_credentials()
        self._logger.log(LOG_DEBUG2, '%s starting server %r in dir %s',
                         owner, name, root_dir)
        try:
            manager.start(cwd=root_dir,
                          log_level=self._logger.getEffectiveLevel())
        finally:
            if orig_main is not None:  # pragma no cover
                sys.modules['__main__'].__file__ = orig_main

        self._logger.info('new server %r for %s', name, owner)
        self._logger.info(' in dir %s', root_dir)
        self._logger.info(' listening on %s', manager.address)
        server_class = getattr(manager, self.server_classname)
        server = server_class(name=name, allow_shell=self._allow_shell,
                              allowed_types=self._allowed_types)
        self._managers[server] = (manager, root_dir, owner)

    if typname:
        obj = server.create(typname, version, None, res_desc, **ctor_args)
    else:
        obj = server

    self._logger.log(LOG_DEBUG2, 'create returning %r at %r',
                     obj, obj._token.address)
    return obj
def _start(self):
    """ Start evaluating cases concurrently. """
    # Need credentials in case we're using a PublicKey server.
    credentials = get_credentials()

    # Determine maximum number of servers available.
    resources = {
        'required_distributions': self._egg_required_distributions,
        'orphan_modules': self._egg_orphan_modules,
        'python_version': sys.version[:3]}
    if self.extra_resources:
        resources.update(self.extra_resources)
    max_servers = RAM.max_servers(resources)
    self._logger.debug('max_servers %d', max_servers)
    if max_servers <= 0:
        msg = 'No servers supporting required resources %s' % resources
        self.raise_exception(msg, RuntimeError)

    # Kick off initial wave of cases.
    self._server_lock = threading.Lock()
    self._reply_q = Queue.Queue()
    self._generation += 1
    n_servers = 0
    while n_servers < max_servers:
        if not self._more_to_go():
            break

        # Get next case. Limits servers started if max_servers > cases.
        try:
            case = self._iter.next()
        except StopIteration:
            if not self._rerun:
                self._iter = None
                self._seqno = 0
            break

        self._seqno += 1
        self._todo.append((case, self._seqno))

        # Start server worker thread.
        n_servers += 1
        name = '%s_%d_%d' % (self.name, self._generation, n_servers)
        self._logger.debug('starting worker for %r', name)
        self._servers[name] = None
        self._in_use[name] = True
        self._server_cases[name] = None
        self._server_states[name] = _EMPTY
        self._load_failures[name] = 0
        server_thread = threading.Thread(target=self._service_loop,
                                         args=(name, resources, credentials,
                                               self._reply_q))
        server_thread.daemon = True
        try:
            server_thread.start()
        except thread.error:
            self._logger.warning('worker thread startup failed for %r',
                                 name)
            self._in_use[name] = False
            break

        if sys.platform != 'win32':
            # Process any pending events.
            while self._busy():
                try:
                    name, result, exc = self._reply_q.get(True, 0.01)
                except Queue.Empty:
                    break  # Timeout.
                else:
                    # Difficult to force startup failure.
                    if self._servers[name] is None:  # pragma nocover
                        self._logger.debug('server startup failed for %r',
                                           name)
                        self._in_use[name] = False
                    else:
                        self._in_use[name] = self._server_ready(name)

    if sys.platform == 'win32':  # pragma no cover
        # Don't start server processing until all servers are started,
        # otherwise we have egg removal issues.
        for name in self._in_use.keys():
            name, result, exc = self._reply_q.get()
            if self._servers[name] is None:
                self._logger.debug('server startup failed for %r', name)
                self._in_use[name] = False

        # Kick-off started servers.
        for name in self._in_use.keys():
            if self._in_use[name]:
                self._in_use[name] = self._server_ready(name)

    # Continue until no servers are busy.
    while self._busy():
        if self._more_to_go():
            timeout = None
        else:
            # Don't wait indefinitely for a server we don't need.
            # This has happened with a server that got 'lost'
            # in RAM.allocate().
            timeout = 60
        try:
            name, result, exc = self._reply_q.get(timeout=timeout)
        # Hard to force worker to hang, which is handled here.
        except Queue.Empty:  # pragma no cover
            msgs = []
            for name, in_use in self._in_use.items():
                if in_use:
                    try:
                        server = self._servers[name]
                        info = self._server_info[name]
                    except KeyError:
                        msgs.append('%r: no startup reply' % name)
                        self._in_use[name] = False
                    else:
                        state = self._server_states[name]
                        if state not in (_LOADING, _EXECUTING):
                            msgs.append('%r: %r %s %s'
                                        % (name, self._servers[name],
                                           state, self._server_info[name]))
                            self._in_use[name] = False
            if msgs:
                self._logger.error('Timeout waiting with nothing'
                                   ' left to do:')
                for msg in msgs:
                    self._logger.error(' %s', msg)
        else:
            self._in_use[name] = self._server_ready(name)

    # Shut down (started) servers.
    self._logger.debug('Shut-down (started) servers')
    for queue in self._queues.values():
        queue.put(None)
    for i in range(len(self._queues)):
        try:
            name, status, exc = self._reply_q.get(True, 60)
        # Hard to force worker to hang, which is handled here.
        except Queue.Empty:  # pragma no cover
            pass
        else:
            if name in self._queues:  # 'Stale' worker can reply *late*.
                del self._queues[name]
    # Hard to force worker to hang, which is handled here.
    for name in self._queues.keys():  # pragma no cover
        self._logger.warning('Timeout waiting for %r to shut down.', name)
def create(self, typname, version=None, server=None, res_desc=None, **ctor_args): """ Create a new `typname` object in `server` or a new :class:`ObjectServer`. Returns a proxy for for the new object. Starts servers in a subdirectory of the current directory. typname: string Type of object to create. If null, then a proxy for the new :class:`ObjServer` is returned. version: string or None Version of `typname` to create. server: proxy :class:`ObjServer` on which to create `typname`. If none, then a new server is created. res_desc: dict or None Required resources. ``working_directory`` is used to set a created server's directory, other keys are ignored. If `allow_shell` has been set, then an absolute directory reference may be used (including '~' expansion). If not, then the reference must be relative and the working directory will be relative to the factory's directory. If the directory already exists, a new name will be used of the form ``<directory>_N`` ctor_args: dict Other constructor arguments. If `name` or `allowed_users` are specified, they are used when creating the :class:`ObjServer`. If no `allowed_users` are specified, the server is private to the current user. """ self._logger.info('create typname %r, version %r server %s,' ' res_desc %s, args %s', typname, version, server, res_desc, ctor_args) if server is None: name = ctor_args.get('name', '') if not name: name = 'Server_%d' % (len(self._managers) + 1) allowed_users = ctor_args.get('allowed_users') if not allowed_users: credentials = get_credentials() allowed_users = {credentials.user: credentials.public_key} else: del ctor_args['allowed_users'] if self._address is None or \ isinstance(self._address, basestring) or \ self._allow_tunneling: # Local access only via pipe if factory accessed by pipe # or factory is accessed via tunnel. address = None else: # Network access via same IP as factory, system-selected port. address = (self._address[0], 0) manager = self.manager_class(address, self._authkey, name=name, allowed_users=allowed_users) # Set (unique) working directory of server. # Server cleanup removes this directory, so we avoid any # existing directory to not delete existing files. base = None if res_desc is not None: base = res_desc.get('working_directory') if base: if self._allow_shell: # Absolute allowed. base = os.path.expanduser(base) elif os.path.isabs(base) or base.startswith('..'): raise ValueError('working_directory %r must be subdirectory' % base) res_desc = res_desc.copy() del res_desc['working_directory'] if not base: base = name count = 1 root_dir = base while os.path.exists(root_dir): count += 1 root_dir = '%s_%d' % (base, count) os.mkdir(root_dir) # On Windows, when running the full test suite under Nose, # starting the process starts a new Nose test session, which # will eventually get here and start a new Nose session, which... 
        orig_main = None
        if sys.platform == 'win32':  #pragma no cover
            scripts = ('openmdao-script.py', 'openmdao_test-script.py')
            try:
                main_file = sys.modules['__main__'].__file__
            except AttributeError:
                pass
            else:
                if main_file.endswith(scripts):
                    orig_main = main_file
                    sys.modules['__main__'].__file__ = \
                        pkg_resources.resource_filename('openmdao.main',
                                                        'objserverfactory.py')
        owner = get_credentials()
        self._logger.log(LOG_DEBUG2, '%s starting server %r in dir %s',
                         owner, name, root_dir)
        try:
            manager.start(cwd=root_dir,
                          log_level=self._logger.getEffectiveLevel())
        finally:
            if orig_main is not None:  #pragma no cover
                sys.modules['__main__'].__file__ = orig_main

        self._logger.info('new server %r for %s', name, owner)
        self._logger.info(' in dir %s', root_dir)
        self._logger.info(' listening on %s', manager.address)
        server_class = getattr(manager, self.server_classname)
        server = server_class(name=name, allow_shell=self._allow_shell,
                              allowed_types=self._allowed_types)
        self._managers[server] = (manager, root_dir, owner)

    if typname:
        obj = server.create(typname, version, None, res_desc, **ctor_args)
    else:
        obj = server

    self._logger.log(LOG_DEBUG2, 'create returning %r at %r',
                     obj, obj._token.address)
    return obj
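# --- A hedged usage sketch for create() above. 'factory' is assumed to
# be an already-constructed (or connected) factory instance; obtaining
# one is outside this snippet, and 'my_server_dir', 'MyServer', and
# 'some_pkg.SomeComponent' are illustrative names only. Per the
# docstring, an empty typname returns a proxy for a fresh ObjServer,
# and 'working_directory' sets its directory ('my_server_dir_2', etc.,
# if the directory already exists).
res_desc = {'working_directory': 'my_server_dir'}
server = factory.create('', res_desc=res_desc, name='MyServer')
# Or create a typed object directly; the factory then creates a server
# and forwards the request to it:
comp = factory.create('some_pkg.SomeComponent', res_desc=res_desc)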
def start_manager(self, index, authkey, address, files, allow_shell=False):
    """
    Launch remote manager process via `ssh`. The environment variable
    ``OPENMDAO_KEEPDIRS`` can be used to avoid removal of the temporary
    directory used on the host.

    index: int
        Index in parent cluster.

    authkey: string
        Authorization key used to connect to host server.

    address: (ip_addr, port) or string referring to pipe.
        Address to use to connect back to parent.

    files: list(string)
        Files to be sent to support server startup.

    allow_shell: bool
        If True, :meth:`execute_command` and :meth:`load_model`
        are allowed in created servers. Use with caution!
    """
    try:
        self._check_ssh()
    except RuntimeError:
        self.state = 'failed'
        return

    self.tempdir = self._copy_to_remote(files)
    if not self.tempdir:
        self.state = 'failed'
        return
    _LOGGER.debug('startup files copied to %s:%s',
                  self.hostname, self.tempdir)

    if self.tunnel_incoming:
        _LOGGER.debug('setup reverse tunnel from %s to %s:%s',
                      self.hostname, address[0], address[1])
        address, cleanup = \
            setup_reverse_tunnel(self.hostname, address[0], address[1],
                                 identity=self.identity_filename)
        self.reverse_cleanup = cleanup

    cmd = self._ssh_cmd()
    cmd.extend([self.hostname, self.beforestart,
                self.python, '-c',
                '"import sys;'
                ' sys.path.append(\'.\');'
                ' import os;'
                ' os.chdir(\'%s\');'
                ' from mp_distributing import main;'
                ' main()"' % self.tempdir])
    self.proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

    credentials = get_credentials()
    allowed_users = {credentials.user: credentials.public_key}

    # Tell the server what name to bind to
    # (in case it has multiple interfaces).
    user, remote_name = self.hostname.split('@')

    data = dict(
        name='BoostrappingHost', index=index, hostname=remote_name,
        # Avoid lots of SUBDEBUG messages.
        dist_log_level=max(_LOGGER.getEffectiveLevel(), logging.DEBUG),
        dir=self.tempdir, authkey=str(authkey),
        allowed_users=allowed_users, allow_shell=allow_shell,
        allow_tunneling=self.tunnel_outgoing, parent_address=address,
        registry=self.registry,
        keep_dirs=os.environ.get('OPENMDAO_KEEPDIRS', '0'))

    # Windows can't handle binary on stdin.
    dump = cPickle.dumps(data, cPickle.HIGHEST_PROTOCOL)
    dump = base64.b64encode(dump)
    _LOGGER.debug('sending %s config info (%s)', self.hostname, len(dump))
    self.proc.stdin.write(dump)
    self.proc.stdin.close()

    time.sleep(1)  # Give the proc time to register startup problems.
    self.poll()
    if self.state != 'failed':
        self.state = 'started'
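# --- A sketch of the receiving side of the stdin config handoff above.
# This is an assumption inferred from the sending code; the real
# mp_distributing.main() may differ. Base64 keeps the pickled dict
# text-safe, which matters because, as noted above, Windows can't
# handle binary data on stdin.
import base64
import cPickle
import sys

def _read_config():
    """Read the base64-encoded pickled config dict from stdin."""
    dump = sys.stdin.read()  # Parent closes stdin after writing.
    return cPickle.loads(base64.b64decode(dump))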
def start_manager(self, index, authkey, address, files, allow_shell=False):
    """
    Launch remote manager process via `ssh`. The environment variable
    ``OPENMDAO_KEEPDIRS`` can be used to avoid removal of the temporary
    directory used on the host.

    index: int
        Index in parent cluster.

    authkey: string
        Authorization key used to connect to host server.

    address: (ip_addr, port) or string referring to pipe.
        Address to use to connect back to parent.

    files: list(string)
        Files to be sent to support server startup.

    allow_shell: bool
        If True, :meth:`execute_command` and :meth:`load_model`
        are allowed in created servers. Use with caution!
    """
    try:
        _check_ssh(self.hostname)
    except Exception:
        self.state = 'failed'
        return

    self.tempdir = _copy_to_remote(self.hostname, files, self.python)
    _LOGGER.debug('startup files copied to %s:%s',
                  self.hostname, self.tempdir)

    cmd = copy.copy(_SSH)
    cmd.extend([self.hostname, self.python, '-c',
                '"import sys;'
                ' sys.path.append(\'.\');'
                ' import os;'
                ' os.chdir(\'%s\');'
                ' from mp_distributing import main;'
                ' main()"' % self.tempdir])
    self.proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

    credentials = get_credentials()
    allowed_users = {credentials.user: credentials.public_key}

    data = dict(
        name='BoostrappingHost', index=index,
        # Avoid lots of SUBDEBUG messages.
        dist_log_level=max(_LOGGER.getEffectiveLevel(), logging.DEBUG),
        dir=self.tempdir, authkey=str(authkey),
        allowed_users=allowed_users, allow_shell=allow_shell,
        parent_address=address, registry=self.registry,
        keep_dirs=os.environ.get('OPENMDAO_KEEPDIRS', '0'))
    cPickle.dump(data, self.proc.stdin, cPickle.HIGHEST_PROTOCOL)
    self.proc.stdin.close()

    # TODO: put timeout in accept() to avoid this hack.
    time.sleep(1)  # Give the proc time to register startup problems.
    self.poll()
    if self.state != 'failed':
        self.state = 'started'
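# --- The fixed time.sleep(1) above is acknowledged as a hack (see the
# TODO). One hedged alternative, assuming as the code above does that
# poll() sets state to 'failed' when the ssh process has died: poll
# repeatedly over the same window and stop early on failure. Names
# below are illustrative.
import time

def _wait_for_startup(host, window=1.0, interval=0.1):
    """Poll `host` over `window` seconds, stopping early on failure."""
    deadline = time.time() + window
    while time.time() < deadline:
        host.poll()
        if host.state == 'failed':
            return
        time.sleep(interval)
    host.state = 'started'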
def test_credentials(self):
    logging.debug('')
    logging.debug('test_credentials')

    # Basic form.
    owner = Credentials()
    if sys.platform == 'win32' and not HAVE_PYWIN32:
        self.assertEqual('%s' % owner, owner.user + ' (transient)')
    else:
        self.assertEqual('%s' % owner, owner.user)

    # Comparison.
    user = Credentials()
    self.assertEqual(user, owner)
    user.user = '******'
    self.assertNotEqual(user, owner)
    self.assertNotEqual(user, 'xyzzy')

    # Thread storage.
    try:
        del threading.current_thread().credentials  # Ensure empty.
    except AttributeError:
        pass
    self.assertEqual(get_credentials(), owner)

    # Sign/verify.
    encoded = owner.encode()
    Credentials.verify(encoded, allowed_users=None)  # 'First sighting'.
    Credentials.verify(encoded, allowed_users=None)  # Cached verification.

    data, signature, client_creds = encoded
    encoded = (data[:1], signature, client_creds)
    assert_raises(self, 'Credentials.verify(encoded, None)',
                  globals(), locals(), CredentialsError, 'Invalid data')
    encoded = (data[:-1], signature, client_creds)
    assert_raises(self, 'Credentials.verify(encoded, None)',
                  globals(), locals(), CredentialsError, 'Invalid signature')
    encoded = (data, signature[:-1], client_creds)
    assert_raises(self, 'Credentials.verify(encoded, None)',
                  globals(), locals(), CredentialsError, 'Invalid signature')

    newline = data.find('\n')                # .user
    newline = data.find('\n', newline + 1)   # .transient
    # Expecting '-'.
    mangled = data[:newline + 1] + '*' + data[newline + 2:]
    encoded = (mangled, signature, client_creds)
    assert_raises(self, 'Credentials.verify(encoded, None)',
                  globals(), locals(), CredentialsError, 'Invalid key')

    # Detect mismatched key.
    get_key_pair(owner.user, overwrite_cache=True)
    spook = Credentials()
    encoded = spook.encode()
    assert_raises(self, 'Credentials.verify(encoded, None)',
                  globals(), locals(), CredentialsError,
                  'Public key mismatch')

    # Check if remote access.
    self.assertFalse(remote_access())
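# --- A minimal round-trip sketch of the sign/verify API exercised above
# (Credentials and CredentialsError as imported by this test module).
# encode() yields a (data, signature, client_creds) triple; verify()
# completes without raising for an untampered triple and raises
# CredentialsError when any element has been altered.
creds = Credentials()
data, signature, client_creds = creds.encode()
Credentials.verify((data, signature, client_creds), allowed_users=None)
try:
    Credentials.verify((data[:-1], signature, client_creds), None)
except CredentialsError:
    pass  # Tampered data is rejected, as the assertions above show.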