コード例 #1
0
    def max_servers(self, resource_desc):
        """
        Returns the total of :meth:`max_servers` across all
        :class:`LocalAllocator` in the cluster.

        resource_desc: dict
            Description of required resources.

        If `resource_desc` names a specific 'allocator' other than this
        one, returns 0. Counts are gathered concurrently using at most
        10 worker threads at a time.
        """
        # Captured here so worker threads run with the caller's credentials.
        credentials = get_credentials()

        key = 'allocator'
        value = resource_desc.get(key, '')
        if value:
            if self.name != value:
                # Request is directed at a different allocator.
                return 0
            else:
                # Any host in our cluster is OK.
                # Copy before deleting so the caller's dict isn't modified.
                resource_desc = resource_desc.copy()
                del resource_desc[key]

        with self._lock:
            # Drain _reply_q.
            # Discard any stale replies left over from earlier requests.
            while True:
                try:
                    self._reply_q.get_nowait()
                except Queue.Empty:
                    break

            # Get counts via worker threads.
            # Dispatch at most `max_workers` requests now; the rest wait
            # in `todo` and are dispatched as replies come back.
            todo = []
            max_workers = 10
            for i, allocator in enumerate(self._allocators.values()):
                if i < max_workers:
                    worker_q = WorkerPool.get()
                    worker_q.put((self._get_count,
                                  (allocator, resource_desc, credentials),
                                  {}, self._reply_q))
                else:
                    todo.append(allocator)

            # Process counts.
            # Exactly one reply is expected per allocator.
            total = 0
            for i in range(len(self._allocators)):
                worker_q, retval, exc, trace = self._reply_q.get()
                if exc:
                    # NOTE(review): raising here abandons outstanding
                    # workers and un-drained replies -- confirm intended.
                    self._logger.error(trace)
                    raise exc

                try:
                    next_allocator = todo.pop(0)
                except IndexError:
                    # Nothing left to dispatch; return worker to the pool.
                    WorkerPool.release(worker_q)
                else:
                    # Reuse this worker for the next queued allocator.
                    worker_q.put((self._get_count,
                                  (next_allocator, resource_desc, credentials),
                                  {}, self._reply_q))
                count = retval
                if count:
                    total += count
            return total
コード例 #2
0
    def test_access_controller(self):
        """ Exercise AccessController role mapping, proxy credentials,
        attribute access checks, and attribute proxying. """
        logging.debug('')
        logging.debug('test_access_controller')

        # Credential-to-role mapping.
        owner = get_credentials()
        controller = AccessController()
        # Null credentials map to the null (empty) role.
        self.assertEqual(controller.get_role(None), '')
        self.assertEqual(controller.get_role(owner), 'owner')
        user = Credentials()
        # NOTE(review): '******' looks like a username redacted by the
        # code-hosting site -- recover the original value if possible.
        user.user = '******'
        self.assertEqual(controller.get_role(user), 'user')
        # Non-Credentials objects are rejected.
        assert_raises(self, 'controller.get_role(object())', globals(), locals(),
                      TypeError, 'credentials is not a Credentials object')

        # Proxy role-to-credential mapping.
        obj = Object()
        assert_raises(self, 'controller.get_proxy_credentials(obj.no_rbac, user)',
                      globals(), locals(), RoleError, 'No RBAC for method')
        self.assertEqual(controller.get_proxy_credentials(obj.single_role, user),
                         user)
        self.assertEqual(controller.get_proxy_credentials(obj.proxy_role, user),
                         owner)
        # Unmapped proxy role fails until credentials are registered below.
        assert_raises(self,
                      'controller.get_proxy_credentials(obj.proxy_other, user)',
                      globals(), locals(), RoleError,
                      'No credentials for proxy role other')
        assert_raises(self, "controller.set_proxy_credentials('other', object())",
                      globals(), locals(), TypeError,
                      'credentials is not a Credentials object')
        other = Credentials()
        other.user = '******'
        controller.set_proxy_credentials('other', other)
        self.assertEqual(controller.get_proxy_credentials(obj.proxy_other, user),
                         other)

        # Attribute access.
        controller.check_access('user', '__getattr__', obj, 'dummy')
        controller.check_access('owner', '__setattr__', obj, 'dummy')
        assert_raises(self,
                      "controller.check_access('user', '__delattr__', obj, 'dummy')",
                      globals(), locals(), RoleError,
                      "No __delattr__ access to 'dummy' by role 'user'")
        assert_raises(self,
                      "controller.check_access('', '__getattr__', obj, 'dummy')",
                      globals(), locals(), RoleError, 'No access by null role')

        # Attribute proxying.
        proxy_value = ProxyRequired()
        self.assertFalse(controller.need_proxy(obj, 'dummy', proxy_value))
        # Per-attribute proxy requirement can be toggled on and off...
        controller.attr_proxy_required(obj, 'dummy')
        self.assertTrue(controller.need_proxy(obj, 'dummy', proxy_value))
        controller.attr_proxy_required(obj, 'dummy', False)
        self.assertFalse(controller.need_proxy(obj, 'dummy', proxy_value))
        # ...and a class-wide requirement applies regardless.
        controller.class_proxy_required(ProxyRequired)
        self.assertTrue(controller.need_proxy(obj, 'dummy', proxy_value))
コード例 #3
0
 def cleanup(self):
     """
     Shut down all remaining :class:`ObjServer` processes.

     Temporarily assumes each server's owner credentials so that
     :meth:`release` passes its 'owner' protection check, then restores
     the caller's credentials.
     """
     self._logger.debug('cleanup')
     cleanup_creds = get_credentials()
     # Snapshot the keys: release() deletes entries from self._managers
     # while we iterate. Plain keys() only works on Python 2 because it
     # returns a copy; list() makes the snapshot explicit and safe.
     servers = list(self._managers.keys())
     for server in servers:
         # Cleanup overrides release() 'owner' protection.
         set_credentials(self._managers[server][2])
         try:
             self.release(server)
         finally:
             # Always restore caller credentials, even if release fails.
             set_credentials(cleanup_creds)
     self._managers = {}
コード例 #4
0
    def test_rsh(self):
        """ Verify remote access protections on an ExternalCode instance. """
        logging.debug('')
        logging.debug('test_rsh')

        # Run inside a scratch directory; onerror handles read-only
        # files left behind on Windows.
        testdir = 'external_rsh'
        if os.path.exists(testdir):
            shutil.rmtree(testdir, onerror=onerror)
        os.mkdir(testdir)
        os.chdir(testdir)

        factory = None
        try:
            # Try to set command line on remote ExternalCode instance.
            typname = 'openmdao.lib.components.external_code.ExternalCode'
            factory = ObjServerFactory(allowed_types=[typname])
            exec_comp = factory.create(typname)
            try:
                # Direct attribute assignment must be rejected remotely.
                exec_comp.command = ['this-should-fail']
            except RemoteError as exc:
                msg = "RoleError: No __setattr__ access to 'command'"
                logging.debug('msg: %s', msg)
                logging.debug('exc: %s', exc)
                self.assertTrue(msg in str(exc))
            else:
                self.fail('Expected RemoteError')

            # set() succeeds with normal (local) client credentials.
            exec_comp.set('command', ['this-should-pass'])

            # Try to set via remote-looking access.
            creds = get_credentials()
            creds.client_creds = Credentials()
            logging.debug('    using %s', creds)
            try:
                exec_comp.set('command', ['this-should-fail'])
            except RemoteError as exc:
                fragment = ": 'command' may not be set() remotely"
                if fragment not in str(exc):
                    self.fail('%s not in %s' % (fragment, exc))
            finally:
                # Restore normal (local) credentials.
                creds.client_creds = None

        finally:
            if factory is not None:
                factory.cleanup()
            os.chdir('..')
            if sys.platform == 'win32':
                time.sleep(2)  # Wait for process shutdown.
            # Keep the scratch directory only if OPENMDAO_KEEPDIRS is set.
            keep_dirs = int(os.environ.get('OPENMDAO_KEEPDIRS', '0'))
            if not keep_dirs:
                shutil.rmtree(testdir, onerror=onerror)
コード例 #5
0
    def test_rsh(self):
        """ Verify remote access protections on an ExternalCode instance. """
        logging.debug("")
        logging.debug("test_rsh")

        # Run inside a scratch directory.
        # NOTE(review): rmtree has no onerror= handler here; it can fail
        # on read-only files on Windows -- confirm this is acceptable.
        testdir = "external_rsh"
        if os.path.exists(testdir):
            shutil.rmtree(testdir)
        os.mkdir(testdir)
        os.chdir(testdir)

        factory = None
        try:
            # Try to set command line on remote ExternalCode instance.
            typname = "openmdao.lib.components.external_code.ExternalCode"
            factory = ObjServerFactory(allowed_types=[typname])
            exec_comp = factory.create(typname)
            # NOTE(review): 'cmd' is never used afterwards; the read may be
            # intentional (exercises remote attribute *read* access) --
            # confirm before removing.
            cmd = exec_comp.command

            try:
                # Direct attribute assignment must be rejected remotely.
                exec_comp.command = ["this-should-fail"]
            except RemoteError as exc:
                msg = "RoleError: No __setattr__ access to 'command'"
                logging.debug("msg: %s", msg)
                logging.debug("exc: %s", exc)
                self.assertTrue(msg in str(exc))
            else:
                self.fail("Expected RemoteError")

            # set() succeeds with normal (local) client credentials.
            exec_comp.set("command", ["this-should-pass"])

            # Try to set via remote-looking access.
            creds = get_credentials()
            creds.client_creds = Credentials()
            logging.debug("    using %s", creds)
            try:
                code = "exec_comp.set('command', ['this-should-fail'])"
                assert_raises(self, code, globals(), locals(), RuntimeError, ": 'command' may not be set() remotely")
            finally:
                # Restore normal (local) credentials.
                creds.client_creds = None

        finally:
            if factory is not None:
                factory.cleanup()
            os.chdir("..")
            if sys.platform == "win32":
                time.sleep(2)  # Wait for process shutdown.
            # Keep the scratch directory only if OPENMDAO_KEEPDIRS is set.
            keep_dirs = int(os.environ.get("OPENMDAO_KEEPDIRS", "0"))
            if not keep_dirs:
                shutil.rmtree(testdir)
コード例 #6
0
    def start_factory(self, port=None, allowed_users=None):
        """
        Start each factory process in a unique directory.

        port: int or None
            Port for the server. If None, -1 (AF_UNIX/AF_PIPE) and 0
            (AF_INET, system-selected port) are alternated by server index.

        allowed_users: dict or None
            Maps user name to public key; defaults to just the current user.

        Returns a proxy connected to the new factory.
        """
        global _SERVER_ID
        _SERVER_ID += 1

        # Fresh per-server directory; the server runs chdir'ed into it.
        server_dir = 'Factory_%d' % _SERVER_ID
        if os.path.exists(server_dir):
            shutil.rmtree(server_dir)
        os.mkdir(server_dir)
        os.chdir(server_dir)
        self.server_dirs.append(server_dir)
        try:
            logging.debug('')
            logging.debug('tester pid: %s', os.getpid())
            logging.debug('starting server...')

            if port is None:
                # Exercise both AF_INET and AF_UNIX/AF_PIPE.
                port = -1 if _SERVER_ID & 1 else 0

            if allowed_users is None:
                # Private to the current user by default.
                credentials = get_credentials()
                allowed_users = {credentials.user: credentials.public_key}

            allowed_types = ['openmdao.main.test.test_distsim.HollowSphere',
                             'openmdao.main.test.test_distsim.Box',
                             'openmdao.main.test.test_distsim.ProtectedBox']

            server, server_cfg = start_server(port=port,
                                              allowed_users=allowed_users,
                                              allowed_types=allowed_types,
                                              log_prefix=server_dir)
            self.servers.append(server)
            # Connection details are written by the server to server_cfg.
            cfg = read_server_config(server_cfg)
            self.address = cfg['address']
            self.port = cfg['port']
            self.tunnel = cfg['tunnel']
            self.key = cfg['key']
            logging.debug('server pid: %s', server.pid)
            logging.debug('server address: %s', self.address)
            logging.debug('server port: %s', self.port)
            logging.debug('server key: %s', self.key)
        finally:
            # Always return to the original working directory.
            os.chdir('..')

        factory = connect(self.address, self.port, self.tunnel, pubkey=self.key)
        self.factories.append(factory)
        logging.debug('factory: %r', factory)
        return factory
コード例 #7
0
    def start_factory(self, port=None, allowed_users=None):
        """
        Start each factory process in a unique directory.

        port: int or None
            Port for the server. If None, -1 (AF_UNIX/AF_PIPE) and 0
            (AF_INET, system-selected port) are alternated by server index.

        allowed_users: dict or None
            Maps user name to public key; defaults to just the current user.

        Returns a proxy connected to the new factory.
        """
        global _SERVER_ID
        _SERVER_ID += 1

        # Fresh per-server directory; the server runs chdir'ed into it.
        server_dir = "Factory_%d" % _SERVER_ID
        if os.path.exists(server_dir):
            shutil.rmtree(server_dir)
        os.mkdir(server_dir)
        os.chdir(server_dir)
        self.server_dirs.append(server_dir)
        try:
            logging.debug("")
            logging.debug("tester pid: %s", os.getpid())
            logging.debug("starting server...")

            if port is None:
                # Exercise both AF_INET and AF_UNIX/AF_PIPE.
                port = -1 if _SERVER_ID & 1 else 0

            if allowed_users is None:
                # Private to the current user by default.
                credentials = get_credentials()
                allowed_users = {credentials.user: credentials.public_key}

            allowed_types = [
                "openmdao.main.test.test_distsim.HollowSphere",
                "openmdao.main.test.test_distsim.Box",
                "openmdao.main.test.test_distsim.ProtectedBox",
            ]

            # NOTE(review): no log_prefix= passed here, so the server log
            # goes to its default location -- confirm intended.
            server, server_cfg = start_server(port=port, allowed_users=allowed_users, allowed_types=allowed_types)
            self.servers.append(server)
            # Connection details are written by the server to server_cfg.
            cfg = read_server_config(server_cfg)
            self.address = cfg["address"]
            self.port = cfg["port"]
            self.tunnel = cfg["tunnel"]
            self.key = cfg["key"]
            logging.debug("server pid: %s", server.pid)
            logging.debug("server address: %s", self.address)
            logging.debug("server port: %s", self.port)
            logging.debug("server key: %s", self.key)
        finally:
            # Always return to the original working directory.
            os.chdir("..")

        factory = connect(self.address, self.port, self.tunnel, pubkey=self.key)
        self.factories.append(factory)
        logging.debug("factory: %r", factory)
        return factory
コード例 #8
0
    def release(self, server):
        """
        Shut-down :class:`ObjServer` `server`.

        server: :class:`ObjServer`
            Server to be shut down.

        Raises :class:`ValueError` if `server` cannot be identified, and
        :class:`RoleError` if the caller is not the server's owner.
        """
        # The address is only used in log/error messages below.
        try:
            address = server._token.address
        except AttributeError:
            address = 'not-a-proxy'
        self._logger.debug('release %r', server)
        self._logger.debug('        at %r', address)
        try:
            manager, root_dir, owner = self._managers[server]
        except KeyError:
            # Not identical to any of our proxies.
            # Could still be a reference to the same remote object.
            try:
                server_host = server.host
                server_pid = server.pid
            except Exception:
                self._logger.error("release: can't identify server at %r",
                                   address)
                raise ValueError("can't identify server at %r" % (address,))

            # Match by (host, pid) against our known proxies.
            for key in self._managers.keys():
                if key.host == server_host and key.pid == server_pid:
                    manager, root_dir, owner = self._managers[key]
                    server = key
                    break
            else:
                # No match; log known servers to aid debugging.
                self._logger.error('release: server %r not found', server)
                for key in self._managers.keys():
                    self._logger.debug('    %r', key)
                    self._logger.debug('    at %r', key._token.address)
                raise ValueError('server %r not found' % server)

        # Only the owner may shut the server down.
        if get_credentials().user != owner.user:
            raise RoleError('only the owner can release')

        manager.shutdown()
        server._close.cancel()
        del self._managers[server]
        # Remove the server's directory unless OPENMDAO_KEEPDIRS is set.
        keep_dirs = int(os.environ.get('OPENMDAO_KEEPDIRS', '0'))
        if not keep_dirs and os.path.exists(root_dir):
            shutil.rmtree(root_dir)
コード例 #9
0
    def release(self, server):
        """
        Shut down the :class:`ObjServer` `server`.

        server: :class:`ObjServer`
            Server to be shut down. May be one of our own proxies, or a
            different proxy referring to the same remote process.

        Raises :class:`ValueError` if `server` cannot be identified, and
        :class:`RoleError` if the caller is not the server's owner.
        """
        # Address is only used for log/error messages.
        try:
            address = server._token.address
        except AttributeError:
            address = 'not-a-proxy'
        self._logger.debug('release %r', server)
        self._logger.debug('        at %r', address)

        entry = self._managers.get(server)
        if entry is None:
            # Not identical to any of our proxies; it may still refer to
            # the same remote process as one of them. Match by (host, pid).
            try:
                server_host = server.host
                server_pid = server.pid
            except Exception:
                self._logger.error("release: can't identify server at %r",
                                   address)
                raise ValueError("can't identify server at %r" % (address,))

            match = None
            for proxy in self._managers.keys():
                if proxy.host == server_host and proxy.pid == server_pid:
                    match = proxy
                    break
            if match is None:
                # No match; log the known servers to aid debugging.
                self._logger.error('release: server %r not found', server)
                for proxy in self._managers.keys():
                    self._logger.debug('    %r', proxy)
                    self._logger.debug('    at %r', proxy._token.address)
                raise ValueError('server %r not found' % server)
            server = match
            entry = self._managers[match]

        manager, root_dir, owner = entry

        # Only the owner may shut the server down.
        if get_credentials().user != owner.user:
            raise RoleError('only the owner can release')

        manager.shutdown()
        server._close.cancel()
        del self._managers[server]
        # Remove the server's directory unless OPENMDAO_KEEPDIRS is set.
        keep_dirs = int(os.environ.get('OPENMDAO_KEEPDIRS', '0'))
        if not keep_dirs and os.path.exists(root_dir):
            shutil.rmtree(root_dir)
コード例 #10
0
    def load_model(self, egg_filename):
        """
        Load a model from an egg file and return the top-level object,
        provided this server's `allow_shell` attribute is True.

        egg_filename: string
            Filename of egg to be loaded.

        Raises :class:`RuntimeError` if shell access is disallowed.
        """
        self._logger.debug('load_model %r', egg_filename)
        if not self._allow_shell:
            # Record who attempted the disallowed operation.
            self._logger.error('attempt to load %r by %r', egg_filename,
                               get_credentials().user)
            raise RuntimeError('shell access is not allowed by this server')
        self._check_path(egg_filename, 'load_model')
        # Give any existing top-level object a chance to clean up first.
        previous = self.tlo
        if previous:
            previous.pre_delete()
        loaded = Container.load_from_eggfile(egg_filename, log=self._logger)
        self.tlo = loaded
        return loaded
コード例 #11
0
    def load_model(self, egg_filename):
        """
        Load model from egg and return top-level object if this server's
        `allow_shell` attribute is True.

        egg_filename: string
            Filename of egg to be loaded.

        Raises :class:`RuntimeError` if shell access is disallowed.
        """
        self._logger.debug('load_model %r', egg_filename)
        if not self._allow_shell:
            # Record who attempted the disallowed operation.
            self._logger.error('attempt to load %r by %r', egg_filename,
                               get_credentials().user)
            raise RuntimeError('shell access is not allowed by this server')
        # Validate the path (presumably restricts loads to legal
        # directories -- confirm against _check_path).
        self._check_path(egg_filename, 'load_model')
        if self.tlo:
            # Let the previous top-level object clean up before replacement.
            self.tlo.pre_delete()
        self.tlo = Container.load_from_eggfile(egg_filename, log=self._logger)
        return self.tlo
コード例 #12
0
ファイル: mpiallocator.py プロジェクト: fzahle/MPI_allocator
    def deploy(self, name, resource_desc, criteria):
        """
        Deploy a server suitable for `resource_desc`.
        Returns a proxy to the deployed server, or None on failure.

        name: string
            Name for server.

        resource_desc: dict
            Description of required resources. Must contain 'min_cpus'.

        criteria: dict
            The dictionary returned by :meth:`time_estimate`.
        """
        # Reserve up to `min_cpus` idle workers (state == 1 means idle;
        # 0 marks the worker as busy).
        n_cpus = resource_desc['min_cpus']
        hostnames = []
        for worker in self.workers:
            if len(hostnames) == n_cpus:
                break
            if worker['state'] == 1:
                worker['state'] = 0
                hostnames.append(worker['hostname'])
        # Use the logger rather than a bare debug print.
        self._logger.debug('allocating hosts %s', hostnames)

        if not hostnames:
            # No idle workers: the original code would fail later with an
            # IndexError on hostnames[0]; fail explicitly instead.
            self._logger.error('deploy failed: no idle workers for %r', name)
            return None

        credentials = get_credentials()
        allowed_users = {credentials.user: credentials.public_key}
        try:
            server = self.factory.create(typname='', allowed_users=allowed_users,
                                         name=name)

            # Overwrite the server's host list with the assigned hosts.
            server.host = hostnames[0]
            server.mpi_resources = hostnames
            return server

        # Shouldn't happen...
        except Exception as exc:  #pragma no cover
            # TODO(review): reserved workers are not returned to state 1
            # on failure -- consider releasing the reservation here.
            self._logger.error('create failed: %r', exc)
            return None
コード例 #13
0
    def execute_command(self, command, stdin, stdout, stderr, env_vars,
                        poll_delay, timeout):
        """
        Run `command` in a subprocess, provided this server's `allow_shell`
        attribute is True.

        command: string
            Command line to be executed.

        stdin, stdout, stderr: string
            Filenames for the corresponding stream.

        env_vars: dict
            Environment variables for the command.

        poll_delay: float (seconds)
            Delay between polling subprocess for completion.

        timeout: float (seconds)
            Maximum time to wait for command completion. A value of zero
            implies no timeout.

        Returns a ``(return_code, error_msg)`` tuple.
        """
        self._logger.debug('execute_command %r', command)
        if not self._allow_shell:
            # Record who attempted the disallowed operation.
            self._logger.error('attempt to execute %r by %r', command,
                               get_credentials().user)
            raise RuntimeError('shell access is not allowed by this server')

        # Any stream supplied as a filename is path-checked first.
        for stream_arg in (stdin, stdout, stderr):
            if isinstance(stream_arg, basestring):
                self._check_path(stream_arg, 'execute_command')

        try:
            proc = ShellProc(command, stdin, stdout, stderr, env_vars)
        except Exception as exc:
            self._logger.error('exception creating process: %s', exc)
            raise

        self._logger.debug('    PID = %d', proc.pid)
        return_code, error_msg = proc.wait(poll_delay, timeout)
        self._logger.debug('    returning %s', (return_code, error_msg))
        return (return_code, error_msg)
コード例 #14
0
    def deploy(self, name, resource_desc, criteria):
        """
        Deploy a server suitable for `resource_desc` and return a proxy
        to it, or None if creation fails.

        name: string
            Name for server.

        resource_desc: dict
            Description of required resources.

        criteria: dict
            The dictionary returned by :meth:`time_estimate`.
        """
        creds = get_credentials()
        # Restrict the new server to the current user.
        allowed = {creds.user: creds.public_key}
        try:
            return self.create(typname="", allowed_users=allowed, name=name)
        except Exception as exc:  # pragma no cover
            # Shouldn't happen...
            self._logger.error("create failed: %r", exc)
            return None
コード例 #15
0
    def deploy(self, name, resource_desc, criteria):
        """
        Deploy a server suitable for `resource_desc`.
        Returns a proxy to the deployed server, or None if creation fails.

        name: string
            Name for server.

        resource_desc: dict
            Description of required resources.

        criteria: dict
            The dictionary returned by :meth:`time_estimate`.
        """
        # The new server is restricted to the current user.
        credentials = get_credentials()
        allowed_users = {credentials.user: credentials.public_key}
        try:
            return self.create(typname='', allowed_users=allowed_users,
                               name=name)
        # Shouldn't happen...
        except Exception as exc:  #pragma no cover
            self._logger.error('create failed: %r', exc)
            return None
コード例 #16
0
    def start_manager(self, index, authkey, address, files, allow_shell=False):
        """
        Launch remote manager process via `ssh`.
        The environment variable ``OPENMDAO_KEEPDIRS`` can be used to avoid
        removal of the temporary directory used on the host.

        index: int
            Index in parent cluster.

        authkey: string
            Authorization key used to connect to host server.

        address: (ip_addr, port) or string referring to pipe.
            Address to use to connect back to parent.

        files: list(string)
            Files to be sent to support server startup.

        allow_shell: bool
            If True, :meth:`execute_command` and :meth:`load_model` are allowed
            in created servers. Use with caution!

        On failure, sets ``self.state`` to 'failed' and returns; on success
        sets it to 'started'.
        """
        try:
            self._check_ssh()
        except RuntimeError:
            self.state = 'failed'
            return

        # Stage the startup files into a temporary directory on the host.
        self.tempdir = self._copy_to_remote(files)
        if not self.tempdir:
            self.state = 'failed'
            return
        _LOGGER.debug('startup files copied to %s:%s', self.hostname,
                      self.tempdir)

        if self.tunnel_incoming:
            # Reverse tunnel so the remote can reach the parent's address;
            # `address` is replaced by the tunnel endpoint.
            _LOGGER.debug('setup reverse tunnel from %s to %s:%s',
                          self.hostname, address[0], address[1])
            address, cleanup = \
                setup_reverse_tunnel(self.hostname, address[0], address[1],
                                     identity=self.identity_filename)
            self.reverse_cleanup = cleanup

        # Bootstrap command: run mp_distributing.main() in the staged dir.
        cmd = self._ssh_cmd()
        cmd.extend([
            self.hostname, self.python, '-c',
            '"import sys;'
            ' sys.path.append(\'.\');'
            ' import os;'
            ' os.chdir(\'%s\');'
            ' from mp_distributing import main;'
            ' main()"' % self.tempdir
        ])
        self.proc = subprocess.Popen(cmd,
                                     stdin=subprocess.PIPE,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

        # Remote servers are restricted to the current user.
        credentials = get_credentials()
        allowed_users = {credentials.user: credentials.public_key}

        # Tell the server what name to bind to
        # (in case it has multiple interfaces).
        # NOTE(review): assumes hostname is in 'user@host' form; a bare
        # hostname makes this unpack raise ValueError -- confirm callers.
        user, remote_name = self.hostname.split('@')

        data = dict(
            # NOTE(review): 'BoostrappingHost' (sic) is probably a typo for
            # 'Bootstrapping', but the remote side may expect this exact
            # string -- do not change unilaterally.
            name='BoostrappingHost',
            index=index,
            hostname=remote_name,
            # Avoid lots of SUBDEBUG messages.
            dist_log_level=max(_LOGGER.getEffectiveLevel(), logging.DEBUG),
            dir=self.tempdir,
            authkey=str(authkey),
            allowed_users=allowed_users,
            allow_shell=allow_shell,
            allow_tunneling=self.tunnel_outgoing,
            parent_address=address,
            registry=self.registry,
            keep_dirs=os.environ.get('OPENMDAO_KEEPDIRS', '0'))

        # Windows can't handle binary on stdin.
        dump = cPickle.dumps(data, cPickle.HIGHEST_PROTOCOL)
        dump = base64.b64encode(dump)
        _LOGGER.debug('sending %s config info (%s)', self.hostname, len(dump))
        self.proc.stdin.write(dump)
        self.proc.stdin.close()
        # NOTE(review): fixed 1-second delay is a race; a slow remote
        # startup may not report problems before poll() runs.
        time.sleep(1)  # Give the proc time to register startup problems.
        self.poll()
        if self.state != 'failed':
            self.state = 'started'
コード例 #17
0
            logging.debug('exc: %s', exc)
            self.assertTrue(msg in str(exc))
        else:
            self.fail('Expected RemoteError')

        try:
            model.box.proprietary_method()
        except RemoteError as exc:
            msg = "RoleError: proprietary_method(): No access for role 'user'"
            logging.debug('msg: %s', msg)
            logging.debug('exc: %s', exc)
            self.assertTrue(msg in str(exc))
        else:
            self.fail('Expected RemoteError')

        saved = get_credentials()
        set_credentials(spook)
        try:
            i = model.box.secret
            model.box.proprietary_method()
        finally:
            # Reset credentials to allow factory shutdown.
            set_credentials(saved)

    def test_4_authkey(self):
        logging.debug('')
        logging.debug('test_authkey')

        factory = self.start_factory()

        # Start server in non-public-key mode.
コード例 #18
0
    def test_3_access(self):
        """ Verify role-based access protections on a remote ProtectedBox. """
        logging.debug('')
        logging.debug('test_access')

        # This 'spook' creation is only for testing.
        # Normally the protector would run with regular credentials
        # in effect at the proprietary site.
        # NOTE(review): '******' looks like a username prefix redacted by
        # the code-hosting site -- recover the original value if possible.
        user = '******'+socket.gethostname()
        key_pair = get_key_pair(user)
        # Self-signed credential blob: data, its SHA-256 signature, no host.
        data = '\n'.join([user, '0', key_pair.publickey().exportKey()])
        hash = hashlib.sha256(data).digest()
        signature = key_pair.sign(hash, get_random_bytes)
        spook = Credentials((data, signature, None))

        # Both the real user and the spook may talk to the factory.
        credentials = get_credentials()
        allowed_users = {credentials.user: credentials.public_key,
                         spook.user: spook.public_key}
        factory = self.start_factory(allowed_users=allowed_users)

        # Create model and run it.
        # The box is created under the spook's credentials, making the
        # spook its owner.
        saved = get_credentials()
        set_credentials(spook)
        box = factory.create(_MODULE+'.ProtectedBox',
                             allowed_users=allowed_users)
        set_credentials(saved)

        model = set_as_top(Model(box))
        model.run()

        # Check results.
        for width in range(1, 2):
            for height in range(1, 3):
                for depth in range(1, 4):
                    case = model.driver.recorder.cases.pop(0)
                    self.assertEqual(case.outputs[0][2], width*height*depth)

        # Check access protections.
        # As a plain 'user', protected attribute reads must fail...
        try:
            i = model.box.secret
        except RemoteError as exc:
            msg = "RoleError: No __getattribute__ access to 'secret' by role 'user'"
            logging.debug('msg: %s', msg)
            logging.debug('exc: %s', exc)
            self.assertTrue(msg in str(exc))
        else:
            self.fail('Expected RemoteError')

        # ...and so must protected method calls.
        try:
            model.box.proprietary_method()
        except RemoteError as exc:
            msg = "RoleError: proprietary_method(): No access for role 'user'"
            logging.debug('msg: %s', msg)
            logging.debug('exc: %s', exc)
            self.assertTrue(msg in str(exc))
        else:
            self.fail('Expected RemoteError')

        # As the owner (spook), both accesses succeed.
        saved = get_credentials()
        set_credentials(spook)
        try:
            i = model.box.secret
            model.box.proprietary_method()
        finally:
            # Reset credentials to allow factory shutdown.
            set_credentials(saved)
コード例 #19
0
    def create(self,
               typname,
               version=None,
               server=None,
               res_desc=None,
               **ctor_args):
        """
        Create a new `typname` object in `server` or a new
        :class:`ObjectServer`.  Returns a proxy for for the new object.
        Starts servers in a subdirectory of the current directory.

        typname: string
            Type of object to create. If null, then a proxy for the new
            :class:`ObjServer` is returned.

        version: string or None
            Version of `typname` to create.

        server: proxy
            :class:`ObjServer` on which to create `typname`.
            If none, then a new server is created.

        res_desc: dict or None
            Required resources. Currently not used.

        ctor_args: dict
            Other constructor arguments.
            If `name` or `allowed_users` are specified, they are used when
            creating the :class:`ObjServer`. If no `allowed_users` are
            specified, the server is private to the current user.
        """
        self._logger.info(
            'create typname %r, version %r server %s,'
            ' res_desc %s, args %s', typname, version, server, res_desc,
            ctor_args)

        if server is None:
            name = ctor_args.get('name', '')
            if not name:
                name = 'Server_%d' % (len(self._managers) + 1)

            allowed_users = ctor_args.get('allowed_users')
            if not allowed_users:
                credentials = get_credentials()
                allowed_users = {credentials.user: credentials.public_key}
            else:
                del ctor_args['allowed_users']

            if self._address is None or \
               isinstance(self._address, basestring) or \
               self._allow_tunneling:
                # Local access only via pipe if factory accessed by pipe
                # or factory is accessed via tunnel.
                address = None
            else:
                # Network access via same IP as factory, system-selected port.
                address = (self._address[0], 0)

            manager = self.manager_class(address,
                                         self._authkey,
                                         name=name,
                                         allowed_users=allowed_users)
            root_dir = name
            count = 1
            while os.path.exists(root_dir):
                count += 1
                root_dir = '%s_%d' % (name, count)
            os.mkdir(root_dir)

            # On Windows, when running the full test suite under Nose,
            # starting the process starts a new Nose test session, which
            # will eventually get here and start a new Nose session, which...
            orig_main = None
            if sys.platform == 'win32':  #pragma no cover
                scripts = ('openmdao-script.py', 'openmdao_test-script.py')
                try:
                    main_file = sys.modules['__main__'].__file__
                except AttributeError:
                    pass
                else:
                    if main_file.endswith(scripts):
                        orig_main = main_file
                        sys.modules['__main__'].__file__ = \
                            pkg_resources.resource_filename('openmdao.main',
                                                            'objserverfactory.py')
            owner = get_credentials()
            self._logger.log(LOG_DEBUG2, '%s starting server %r in dir %s',
                             owner, name, root_dir)
            try:
                manager.start(cwd=root_dir,
                              log_level=self._logger.getEffectiveLevel())
            finally:
                if orig_main is not None:  #pragma no cover
                    sys.modules['__main__'].__file__ = orig_main

            self._logger.info('new server %r for %s', name, owner)
            self._logger.info('    in dir %s', root_dir)
            self._logger.info('    listening on %s', manager.address)
            server_class = getattr(manager, self.server_classname)
            server = server_class(name=name,
                                  allow_shell=self._allow_shell,
                                  allowed_types=self._allowed_types)
            self._managers[server] = (manager, root_dir, owner)

        if typname:
            obj = server.create(typname, version, None, res_desc, **ctor_args)
        else:
            obj = server

        self._logger.log(LOG_DEBUG2, 'create returning %r at %r', obj,
                         obj._token.address)
        return obj
コード例 #20
0
    def start(self):
        """
        Start this manager and all remote managers.

        Launches the remote managers from a background thread, then accepts
        one callback connection per host to obtain its manager address and
        public key.  Hosts that have not reported after ~60 seconds of
        inactivity are logged and left out of the cluster.
        """
        super(Cluster, self).start()
        hostname = socket.getfqdn()
        # Listen for callback connections from the started remote managers.
        listener = connection.Listener(address=(hostname, 0),
                                       authkey=self._authkey,
                                       backlog=5)  # Default is 1.
        # TODO: support multiple addresses if multiple networks are attached.

        # Start managers in separate thread to avoid losing connections.
        starter = threading.Thread(target=self._start_hosts,
                                   args=(listener.address, get_credentials()))
        starter.daemon = True
        starter.start()

        # Accept callback connections from started managers.
        waiting = ['']  # Dummy entry so the loop body executes at least once.
        retry = 0
        while waiting:
            host_processed = False
            for host in self._hostlist:
                host.poll()
                if host.state == 'started':
                    # Accept connection from *any* host.
                    _LOGGER.debug('waiting for a connection, host %s',
                                  host.hostname)
                    # This will hang if server doesn't receive our address.
                    conn = listener.accept()
                    # Reply is (host index, manager address, public key text);
                    # on failure the address is None and the last field holds
                    # the exception text.
                    i, address, pubkey_text = conn.recv()
                    conn.close()
                    # The reply may come from a different host than the one
                    # currently being polled.
                    other_host = self._hostlist[i]
                    if address is None:
                        _LOGGER.error('Host %s died: %s', other_host.hostname,
                                      pubkey_text)  # Exception text.
                        continue

                    other_host.manager = HostManager.from_address(
                        address, self._authkey)
                    other_host.state = 'up'
                    if pubkey_text:
                        other_host.manager._pubkey = \
                            decode_public_key(pubkey_text)
                    host_processed = True
                    _LOGGER.debug('Host %s is now up', other_host.hostname)
                    self._up.append(other_host)

            # See if there are still hosts to wait for.
            waiting = []
            for host in self._hostlist:
                host.poll()
                if host.state == 'init' or host.state == 'started':
                    waiting.append(host)
            if waiting:
                if not host_processed:
                    retry += 1
                    if retry < 600:  # ~60 seconds.
                        time.sleep(0.1)
                    else:
                        _LOGGER.warning('Cluster startup timeout,'
                                        ' hosts not started:')
                        for host in waiting:
                            _LOGGER.warning('    %s (%s) in dir %s',
                                            host.hostname, host.state,
                                            host.tempdir)
                        break
            else:
                break

        # Keep the roster of live hosts in deterministic (hostname) order.
        self._up = sorted(self._up, key=lambda host: host.hostname)

        # So our class-defined shutdown() is called before the superclass
        # installed shutdown().
        self._base_shutdown = self.shutdown
        del self.shutdown
コード例 #21
0
    def execute_command(self, resource_desc):
        """
        Run the job described by `resource_desc` in a subprocess, provided
        this server's `allow_shell` attribute is True.

        resource_desc: dict
            Contains the job description.

        The subprocess inherits the current environment plus any
        'job_environment' entries.

        Path defaults: a missing 'input_path' falls back to ``/dev/null`` or
        ``nul:``; a missing 'output_path' falls back to
        ``<remote_command>.stdout``; when neither 'error_path' nor
        'join_files' is given, ``<remote_command>.stderr`` is used.

        If present, 'wallclock_time' in the 'resource_limits' dictionary is
        used as a timeout.  All other queuing resource keys are ignored, as
        are the ``HOME_DIRECTORY`` and ``WORKING_DIRECTORY`` placeholders.
        """
        job_name = resource_desc.get('job_name', '')

        cmd_path = resource_desc['remote_command']
        self._check_path(cmd_path, 'execute_command')
        base = os.path.basename(cmd_path)
        command = [cmd_path] + list(resource_desc.get('args', ()))

        self._logger.debug('execute_command %s %r', job_name, command)
        if not self._allow_shell:
            self._logger.error('attempt to execute %r by %r', command,
                               get_credentials().user)
            raise RuntimeError('shell access is not allowed by this server')

        env_vars = resource_desc.get('job_environment')

        # Resolve stdin/stdout/stderr, validating any user-supplied paths.
        try:
            inp = resource_desc['input_path']
            self._check_path(inp, 'execute_command')
        except KeyError:
            inp = DEV_NULL

        try:
            out = resource_desc['output_path']
            self._check_path(out, 'execute_command')
        except KeyError:
            out = base + '.stdout'

        try:
            err = resource_desc['error_path']
            self._check_path(err, 'execute_command')
        except KeyError:
            # No explicit error path: merge into stdout only if requested.
            err = STDOUT if resource_desc.get('join_files') else base + '.stderr'

        timeout = resource_desc.get('resource_limits', {}).get('wallclock_time', 0)
        poll_delay = 1

        try:
            process = ShellProc(command, inp, out, err, env_vars)
        except Exception as exc:
            self._logger.error('exception creating process: %s', exc)
            raise

        self._logger.debug('    PID = %d', process.pid)
        return_code, error_msg = process.wait(poll_delay, timeout)
        self._logger.debug('    returning %s', (return_code, error_msg))
        return (return_code, error_msg)
コード例 #22
0
    def start(self):
        """
        Start this manager and all remote managers. If some managers fail to
        start, errors are logged and the corresponding host's state is set to
        ``failed``. You can use ``len(cluster)`` to determine how many remote
        managers are available.

        A :class:`RuntimeError` will be raised if no managers were successfully
        started.
        """
        super(Cluster, self).start()
        # Listen for callback connections from the started remote managers.
        listener = connection.Listener(address=(self._hostname, 0),
                                       authkey=self._authkey,
                                       backlog=5)  # Default is 1.

        # Start managers in separate thread to avoid losing connections.
        starter = threading.Thread(target=self._start_hosts,
                                   args=(listener.address, get_credentials()))
        starter.daemon = True
        starter.start()

        # Accept callback connections from started managers.
        waiting = ['']  # Dummy entry so the loop body executes at least once.
        retry = 0
        while waiting:
            host_processed = False
            for host in self._hostlist:
                host.poll()
                if host.state == 'started':
                    # Accept connection from *any* host.
                    _LOGGER.debug('waiting for a connection, host %s',
                                  host.hostname)
                    # Normal accept() can hang; run it in a worker thread so
                    # the wait can be bounded by join(30).
                    retval = []
                    accepter = threading.Thread(target=self._accept,
                                                args=(listener, retval),
                                                name='ClusterAccepter')
                    accepter.daemon = True
                    accepter.start()
                    accepter.join(30)
                    if accepter.is_alive():
                        # Timed out: mark every still-'started' host failed,
                        # terminating its process and running any registered
                        # reverse-tunnel cleanup.
                        msg = 'timeout waiting for reply from %s' \
                              % [host.hostname for host in self._hostlist
                                               if host.state == 'started']
                        _LOGGER.error(msg)
                        for host in self._hostlist:
                            if host.state == 'started':
                                if host.proc is not None:
                                    host.proc.terminate()
                                if host.reverse_cleanup is not None:
                                    host.reverse_cleanup[0](*host.reverse_cleanup[1:])
                                host.state = 'failed'
                        continue

                    conn = retval[0]
                    # Reply is (host index, manager address, public key text);
                    # on failure the address is None and the last field holds
                    # the exception text.
                    i, address, pubkey_text = conn.recv()
                    conn.close()

                    # The reply may come from a different host than the one
                    # currently being polled.
                    other_host = self._hostlist[i]
                    if address is None:
                        _LOGGER.error('Host %s died: %s', other_host.hostname,
                                      pubkey_text)  # Exception text.
                        other_host.state = 'failed'
                        continue
                    try:
                        other_host.manager = \
                            HostManager.from_address(address, self._authkey,
                                                     other_host)
                    except Exception as exc:
                        _LOGGER.error("Can't start manager for %s: %s",
                                      other_host.hostname, str(exc) or repr(exc))
                        if other_host.proc is not None:
                            other_host.proc.terminate()
                        other_host.state = 'failed'
                        continue
                    else:
                        other_host.state = 'up'
                        if pubkey_text:
                            other_host.manager._pubkey = \
                                decode_public_key(pubkey_text)
                        host_processed = True
                        _LOGGER.debug('Host %s is now up', other_host.hostname)
                        self._up.append(other_host)

            # See if there are still hosts to wait for.
            waiting = []
            for host in self._hostlist:
                host.poll()
                if host.state == 'init' or host.state == 'started':
                    waiting.append(host)
            if waiting:
                if not host_processed:
                    retry += 1
                    if retry < 300:  # ~60 seconds.
                        time.sleep(0.2)
                    else:
                        _LOGGER.warning('Cluster startup timeout,'
                                        ' hosts not started:')
                        for host in waiting:
                            _LOGGER.warning('    %s (%s) in dir %s',
                                            host.hostname, host.state,
                                            host.tempdir)
                        break
            else:
                break

        # Keep the roster of live hosts in deterministic (hostname) order.
        self._up = sorted(self._up, key=lambda host: host.hostname)

        # So our class defined shutdown() is called before the superclass
        # installed shutdown().
        self._base_shutdown = self.shutdown
        del self.shutdown

        if len(self._up) < 1:
            raise RuntimeError('No hosts successfully started')
コード例 #23
0
    def time_estimate(self, resource_desc):
        """
        Returns ``(estimate, criteria)`` indicating how well this allocator
        can satisfy the `resource_desc` request.  The estimate will be:

        - >0 for an estimate of walltime (seconds).
        -  0 for no estimate.
        - -1 for no resource at this time.
        - -2 for no support for `resource_desc`.

        The returned criteria is a dictionary containing information related
        to the estimate, such as hostnames, load averages, unsupported
        resources, etc.

        This allocator polls each :class:`LocalAllocator` in the cluster
        to find the best match and returns that.  The best allocator is saved
        in the returned criteria for a subsequent :meth:`deploy`.

        resource_desc: dict
            Description of required resources.
        """
        credentials = get_credentials()

        # An explicit 'allocator' key must name this allocator.  If it does,
        # remove it from a copy so the per-host allocators don't reject it.
        key = "allocator"
        value = resource_desc.get(key, "")
        if value:
            if self.name != value:
                return (-2, {key: value})
            else:
                # Any host in our cluster is OK.
                resource_desc = resource_desc.copy()
                del resource_desc[key]

        n_cpus = resource_desc.get("n_cpus", 0)
        if n_cpus:
            # Spread across LocalAllocators: each host is asked for one CPU.
            resource_desc = resource_desc.copy()
            resource_desc["n_cpus"] = 1

        with self._lock:
            best_estimate = -2
            best_criteria = None
            best_allocator = None

            # Prefer not to repeat use of just-used allocator.
            prev_estimate = -2
            prev_criteria = None
            prev_allocator = self._last_deployed
            self._last_deployed = None

            # Drain _reply_q of stale replies from earlier requests.
            while True:
                try:
                    self._reply_q.get_nowait()
                except Queue.Empty:
                    break

            # Get estimates via worker threads, at most `max_workers` in
            # flight at once; the remainder wait in `todo`.
            todo = []
            max_workers = 10
            for i, allocator in enumerate(self._allocators.values()):
                if i < max_workers:
                    worker_q = WorkerPool.get()
                    worker_q.put((self._get_estimate, (allocator, resource_desc, credentials), {}, self._reply_q))
                else:
                    todo.append(allocator)

            # Process estimates; exactly one reply is expected per allocator.
            host_loads = []  # Sorted list of (hostname, load)
            for i in range(len(self._allocators)):
                worker_q, retval, exc, trace = self._reply_q.get()
                if exc:
                    # Log and skip a failed poll rather than aborting the
                    # whole estimate.
                    self._logger.error(trace)
                    retval = None

                # Feed the freed worker the next pending allocator, or
                # release it back to the pool if none remain.
                try:
                    next_allocator = todo.pop(0)
                except IndexError:
                    WorkerPool.release(worker_q)
                else:
                    worker_q.put((self._get_estimate, (next_allocator, resource_desc, credentials), {}, self._reply_q))

                if retval is None:
                    continue
                # Each successful reply is (allocator, estimate, criteria).
                allocator, estimate, criteria = retval
                if estimate is None:
                    continue

                # Update loads, keeping host_loads sorted by ascending load.
                # NOTE(review): the inner `i` shadows the outer loop index;
                # harmless since the outer `i` is reassigned each iteration.
                if estimate >= 0 and n_cpus:
                    load = criteria["loadavgs"][0]
                    new_info = (criteria["hostnames"][0], load)
                    if host_loads:
                        for i, info in enumerate(host_loads):
                            if load < info[1]:
                                host_loads.insert(i, new_info)
                                break
                        else:
                            host_loads.append(new_info)
                    else:
                        host_loads.append(new_info)

                # Update best estimate: a positive estimate beats any
                # non-positive one; among positives, smaller (faster) wins.
                if allocator is prev_allocator:
                    prev_estimate = estimate
                    prev_criteria = criteria
                elif (best_estimate <= 0 and estimate > best_estimate) or (
                    best_estimate > 0 and estimate < best_estimate
                ):
                    best_estimate = estimate
                    best_criteria = criteria
                    best_allocator = allocator
                elif best_estimate == 0 and estimate == 0:
                    # Tie between 'no estimate' entries: pick the lower load.
                    best_load = best_criteria["loadavgs"][0]
                    load = criteria["loadavgs"][0]
                    if load < best_load:
                        best_estimate = estimate
                        best_criteria = criteria
                        best_allocator = allocator

            # If no alternative, repeat use of previous allocator.
            if best_estimate < 0 and prev_estimate >= 0:
                best_estimate = prev_estimate
                best_criteria = prev_criteria
                best_allocator = prev_allocator

            # Save best allocator in criteria in case we're asked to deploy.
            if best_criteria is not None:
                best_criteria["allocator"] = best_allocator

                # Save n_cpus hostnames in criteria.
                best_criteria["hostnames"] = [host_loads[i][0] for i in range(min(n_cpus, len(host_loads)))]

            return (best_estimate, best_criteria)
コード例 #24
0
    def execute_command(self, resource_desc):
        """
        Execute the command described by `resource_desc` in a subprocess,
        but only if this server's `allow_shell` attribute is True.

        resource_desc: dict
            Contains the job description.

        The subprocess runs with the current environment augmented by any
        'job_environment' entries.  A missing 'input_path' defaults to
        ``/dev/null`` or ``nul:``; a missing 'output_path' defaults to
        ``<remote_command>.stdout``; when neither 'error_path' nor
        'join_files' is given, ``<remote_command>.stderr`` is used.

        'wallclock_time' under 'resource_limits' is honored as a timeout.
        All other queuing resource keys are ignored, as are the
        ``HOME_DIRECTORY`` and ``WORKING_DIRECTORY`` placeholders.
        """
        def validated_path(key, fallback):
            # Fetch a path entry and validate it; a missing key -> fallback.
            try:
                path = resource_desc[key]
                self._check_path(path, 'execute_command')
                return path
            except KeyError:
                return fallback

        job_name = resource_desc.get('job_name', '')

        exe = resource_desc['remote_command']
        self._check_path(exe, 'execute_command')
        base = os.path.basename(exe)
        command = [exe]
        command.extend(resource_desc.get('args', ()))

        self._logger.debug('execute_command %s %r', job_name, command)
        if not self._allow_shell:
            self._logger.error('attempt to execute %r by %r', command,
                               get_credentials().user)
            raise RuntimeError('shell access is not allowed by this server')

        env_vars = resource_desc.get('job_environment')

        stdin = validated_path('input_path', DEV_NULL)
        stdout = validated_path('output_path', base + '.stdout')
        stderr = validated_path(
            'error_path',
            STDOUT if resource_desc.get('join_files') else base + '.stderr')

        limits = resource_desc.get('resource_limits', {})
        timeout = limits.get('wallclock_time', 0)
        poll_delay = 1

        try:
            process = ShellProc(command, stdin, stdout, stderr, env_vars)
        except Exception as exc:
            self._logger.error('exception creating process: %s', exc)
            raise

        self._logger.debug('    PID = %d', process.pid)
        return_code, error_msg = process.wait(poll_delay, timeout)
        self._logger.debug('    returning %s', (return_code, error_msg))
        return (return_code, error_msg)
コード例 #25
0
    def create(self,
               typname,
               version=None,
               server=None,
               res_desc=None,
               **ctor_args):
        """
        Create a new `typname` object in `server` or a new
        :class:`ObjectServer`.  Returns a proxy for the new object.
        Starts servers in a subdirectory of the current directory.

        typname: string
            Type of object to create. If null, then a proxy for the new
            :class:`ObjServer` is returned.

        version: string or None
            Version of `typname` to create.

        server: proxy
            :class:`ObjServer` on which to create `typname`.
            If none, then a new server is created.

        res_desc: dict or None
            Required resources. ``working_directory`` is used to set a
            created server's directory, other keys are ignored.
            If `allow_shell` has been set, then an absolute directory
            reference may be used (including '~' expansion). If not, then
            the reference must be relative and the working directory will be
            relative to the factory's directory. If the directory already
            exists, a new name will be used of the form ``<directory>_N``.

        ctor_args: dict
            Other constructor arguments.
            If `name` or `allowed_users` are specified, they are used when
            creating the :class:`ObjServer`. If no `allowed_users` are
            specified, the server is private to the current user.
        """
        self._logger.info(
            'create typname %r, version %r server %s,'
            ' res_desc %s, args %s', typname, version, server, res_desc,
            ctor_args)

        # No server given: launch a dedicated server process first.
        if server is None:
            name = ctor_args.get('name', '')
            if not name:
                name = 'Server_%d' % (len(self._managers) + 1)

            # Default to a server private to the current user.
            allowed_users = ctor_args.get('allowed_users')
            if not allowed_users:
                credentials = get_credentials()
                allowed_users = {credentials.user: credentials.public_key}
            else:
                del ctor_args['allowed_users']

            if self._address is None or \
                    isinstance(self._address, basestring) or \
                    self._allow_tunneling:
                # Local access only via pipe if factory accessed by pipe
                # or factory is accessed via tunnel.
                address = None
            else:
                # Network access via same IP as factory, system-selected port.
                address = (self._address[0], 0)

            manager = self.manager_class(address,
                                         self._authkey,
                                         name=name,
                                         allowed_users=allowed_users)

            # Set (unique) working directory of server.
            # Server cleanup removes this directory, so we avoid any
            # existing directory to not delete existing files.
            base = None
            if res_desc is not None:
                base = res_desc.get('working_directory')
                if base:
                    if self._allow_shell:  # Absolute allowed.
                        base = os.path.expanduser(base)
                    elif os.path.isabs(base) or base.startswith('..'):
                        raise ValueError(
                            'working_directory %r must be subdirectory' % base)
                    # Copy before deleting so the caller's dict is untouched.
                    res_desc = res_desc.copy()
                    del res_desc['working_directory']
            if not base:
                base = name
            count = 1
            root_dir = base
            while os.path.exists(root_dir):
                count += 1
                root_dir = '%s_%d' % (base, count)
            os.mkdir(root_dir)

            # On Windows, when running the full test suite under Nose,
            # starting the process starts a new Nose test session, which
            # will eventually get here and start a new Nose session, which...
            orig_main = None
            if sys.platform == 'win32':  # pragma no cover
                scripts = ('openmdao-script.py', 'openmdao_test-script.py')
                try:
                    main_file = sys.modules['__main__'].__file__
                except AttributeError:
                    pass
                else:
                    if main_file.endswith(scripts):
                        orig_main = main_file
                        sys.modules['__main__'].__file__ = \
                            pkg_resources.resource_filename('openmdao.main',
                                                            'objserverfactory.py')
            owner = get_credentials()
            self._logger.log(LOG_DEBUG2, '%s starting server %r in dir %s',
                             owner, name, root_dir)
            try:
                manager.start(cwd=root_dir,
                              log_level=self._logger.getEffectiveLevel())
            finally:
                # Restore __main__ even if the server failed to start.
                if orig_main is not None:  # pragma no cover
                    sys.modules['__main__'].__file__ = orig_main

            self._logger.info('new server %r for %s', name, owner)
            self._logger.info('    in dir %s', root_dir)
            self._logger.info('    listening on %s', manager.address)
            server_class = getattr(manager, self.server_classname)
            server = server_class(name=name,
                                  allow_shell=self._allow_shell,
                                  allowed_types=self._allowed_types)
            self._managers[server] = (manager, root_dir, owner)

        # Create the requested object on the (possibly just-started) server,
        # or return the server proxy itself when no typname was given.
        if typname:
            obj = server.create(typname, version, None, res_desc, **ctor_args)
        else:
            obj = server

        self._logger.log(LOG_DEBUG2, 'create returning %r at %r', obj,
                         obj._token.address)
        return obj
コード例 #26
0
    def start_manager(self, index, authkey, address, files, allow_shell=False):
        """
        Launch the remote manager process via `ssh`.
        The environment variable ``OPENMDAO_KEEPDIRS`` can be used to avoid
        removal of the temporary directory used on the host.

        index: int
            Index in parent cluster.

        authkey: string
            Authorization key used to connect to host server.

        address: (ip_addr, port) or string referring to pipe.
            Address to use to connect back to parent.

        files: list(string)
            Files to be sent to support server startup.

        allow_shell: bool
            If True, :meth:`execute_command` and :meth:`load_model` are allowed
            in created servers. Use with caution!
        """
        # Verify ssh connectivity first; any failure marks this host failed.
        try:
            _check_ssh(self.hostname)
        except Exception:
            self.state = 'failed'
            return

        # Stage the bootstrap files in a temporary directory on the host.
        self.tempdir = _copy_to_remote(self.hostname, files, self.python)
        _LOGGER.debug('startup files copied to %s:%s',
                      self.hostname, self.tempdir)

        # Launch the remote bootstrap: cd into the staged directory and run
        # mp_distributing.main(), which reads its configuration from stdin.
        cmd = copy.copy(_SSH)
        cmd += [self.hostname, self.python, '-c',
                '"import sys;'
                ' sys.path.append(\'.\');'
                ' import os;'
                ' os.chdir(\'%s\');'
                ' from mp_distributing import main;'
                ' main()"' % self.tempdir]
        self.proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

        # Only the current user may connect to the started server.
        creds = get_credentials()
        allowed_users = {creds.user: creds.public_key}

        data = {
            'name': 'BoostrappingHost',
            'index': index,
            # Avoid lots of SUBDEBUG messages.
            'dist_log_level': max(_LOGGER.getEffectiveLevel(), logging.DEBUG),
            'dir': self.tempdir,
            'authkey': str(authkey),
            'allowed_users': allowed_users,
            'allow_shell': allow_shell,
            'parent_address': address,
            'registry': self.registry,
            'keep_dirs': os.environ.get('OPENMDAO_KEEPDIRS', '0'),
        }
        # Hand the configuration to the remote process over its stdin.
        cPickle.dump(data, self.proc.stdin, cPickle.HIGHEST_PROTOCOL)
        self.proc.stdin.close()

        # TODO: put timeout in accept() to avoid this hack.
        time.sleep(1)  # Give the proc time to register startup problems.
        self.poll()
        if self.state != 'failed':
            self.state = 'started'
コード例 #27
0
    def start(self):
        """
        Start this manager and all remote managers. If some managers fail to
        start, errors are logged and the corresponding host's state is set to
        ``failed``. You can use ``len(cluster)`` to determine how many remote
        managers are available.

        A :class:`RuntimeError` will be raised if no managers were successfully
        started.
        """
        super(Cluster, self).start()
        # Listen for callback connections from the remote bootstrap processes.
        listener = connection.Listener(address=(self._hostname, 0),
                                       authkey=self._authkey,
                                       backlog=5)  # Default is 1.

        # Start managers in separate thread to avoid losing connections.
        starter = threading.Thread(target=self._start_hosts,
                                   args=(listener.address, get_credentials()))
        starter.daemon = True
        starter.start()

        # Accept callback connections from started managers.
        waiting = ['']  # Non-empty dummy so the loop body runs at least once.
        retry = 0
        while waiting:
            host_processed = False
            for host in self._hostlist:
                host.poll()
                if host.state == 'started':
                    # Accept conection from *any* host.
                    _LOGGER.debug('waiting for a connection, host %s',
                                  host.hostname)
                    # Normal accept() can hang.
                    # Run accept() in a helper thread so join() can enforce a
                    # 30 second timeout; 'retval' receives the connection.
                    retval = []
                    accepter = threading.Thread(target=self._accept,
                                                args=(listener, retval),
                                                name='ClusterAccepter')
                    accepter.daemon = True
                    accepter.start()
                    accepter.join(30)
                    if accepter.is_alive():
                        # Timed out: mark every still-'started' host failed,
                        # terminating its process and tearing down any
                        # reverse tunnel.
                        # NOTE(review): 'host' below shadows the outer loop
                        # variable (Python 2 list comprehensions leak too);
                        # harmless only because the outer loop rebinds 'host'
                        # immediately after the 'continue'.
                        msg = 'timeout waiting for reply from %s' \
                              % [host.hostname for host in self._hostlist
                                               if host.state == 'started']
                        _LOGGER.error(msg)
                        for host in self._hostlist:
                            if host.state == 'started':
                                if host.proc is not None:
                                    host.proc.terminate()
                                if host.reverse_cleanup is not None:
                                    host.reverse_cleanup[0](
                                        *host.reverse_cleanup[1:])
                                host.state = 'failed'
                        continue

                    conn = retval[0]
                    # Bootstrap sends (index, manager address, public key).
                    i, address, pubkey_text = conn.recv()
                    conn.close()

                    other_host = self._hostlist[i]
                    if address is None:
                        # Remote startup failed; in that case pubkey_text
                        # carries the exception text instead of a key.
                        _LOGGER.error('Host %s died: %s', other_host.hostname,
                                      pubkey_text)  # Exception text.
                        other_host.state = 'failed'
                        continue
                    try:
                        other_host.manager = \
                            HostManager.from_address(address, self._authkey,
                                                     other_host)
                    except Exception as exc:
                        _LOGGER.error("Can't start manager for %s: %s",
                                      other_host.hostname,
                                      str(exc) or repr(exc))
                        if other_host.proc is not None:
                            other_host.proc.terminate()
                        other_host.state = 'failed'
                        continue
                    else:
                        other_host.state = 'up'
                        if pubkey_text:
                            other_host.manager._pubkey = \
                                decode_public_key(pubkey_text)
                        host_processed = True
                        _LOGGER.debug('Host %s is now up', other_host.hostname)
                        self._up.append(other_host)

            # See if there are still hosts to wait for.
            waiting = []
            for host in self._hostlist:
                host.poll()
                if host.state == 'init' or host.state == 'started':
                    waiting.append(host)
            if waiting:
                if not host_processed:
                    # No progress this pass: back off 0.2s, give up after
                    # ~60 seconds total of idle passes.
                    retry += 1
                    if retry < 300:  # ~60 seconds.
                        time.sleep(0.2)
                    else:
                        _LOGGER.warning('Cluster startup timeout,'
                                        ' hosts not started:')
                        for host in waiting:
                            _LOGGER.warning('    %s (%s) in dir %s',
                                            host.hostname, host.state,
                                            host.tempdir)
                        break
            else:
                break

        self._up = sorted(self._up, key=lambda host: host.hostname)

        # So our class defined shutdown() is called before the superclass
        # installed shutdown().
        self._base_shutdown = self.shutdown
        del self.shutdown

        if len(self._up) < 1:
            raise RuntimeError('No hosts successfully started')
コード例 #28
0
    def start(self):
        """
        Start this manager and all remote managers.

        Hosts that fail to start are logged and skipped; successfully
        started hosts are collected in ``self._up``, sorted by hostname.
        """
        super(Cluster, self).start()
        hostname = socket.getfqdn()
        # Listen for callback connections from the remote bootstrap processes.
        listener = connection.Listener(address=(hostname, 0),
                                       authkey=self._authkey,
                                       backlog=5)  # Default is 1.
        # TODO: support multiple addresses if multiple networks are attached.

        # Start managers in separate thread to avoid losing connections.
        starter = threading.Thread(target=self._start_hosts,
                                   args=(listener.address, get_credentials()))
        starter.daemon = True
        starter.start()

        # Accept callback connections from started managers.
        waiting = ['']  # Non-empty dummy so the loop body runs at least once.
        retry = 0
        while waiting:
            host_processed = False
            for host in self._hostlist:
                host.poll()
                if host.state == 'started':
                    # Accept conection from *any* host.
                    _LOGGER.debug('waiting for a connection, host %s',
                                  host.hostname)
                    # This will hang if server doesn't receive our address.
                    conn = listener.accept()
                    # Bootstrap sends (index, manager address, public key);
                    # on failure, address is None and pubkey_text carries
                    # the exception text instead of a key.
                    i, address, pubkey_text = conn.recv()
                    conn.close()
                    other_host = self._hostlist[i]
                    if address is None:
                        _LOGGER.error('Host %s died: %s', other_host.hostname,
                                      pubkey_text)  # Exception text.
                        continue

                    other_host.manager = HostManager.from_address(address,
                                                                  self._authkey)
                    other_host.state = 'up'
                    if pubkey_text:
                        other_host.manager._pubkey = \
                            decode_public_key(pubkey_text)
                    host_processed = True
                    _LOGGER.debug('Host %s is now up', other_host.hostname)
                    self._up.append(other_host)

            # See if there are still hosts to wait for.
            waiting = []
            for host in self._hostlist:
                host.poll()
                if host.state == 'init' or host.state == 'started':
                    waiting.append(host)
            if waiting:
                if not host_processed:
                    # No progress this pass: back off 0.2s, give up after
                    # ~60 seconds total of idle passes.
                    retry += 1
                    if retry < 300:  # ~60 seconds.
                        time.sleep(0.2)
                    else:
                        _LOGGER.warning('Cluster startup timeout,'
                                        ' hosts not started:')
                        for host in waiting:
                            _LOGGER.warning('    %s (%s) in dir %s',
                                            host.hostname, host.state,
                                            host.tempdir)
                        break
            else:
                break

        self._up = sorted(self._up, key=lambda host: host.hostname)

        # So our class defined shutdown() is called before the superclass
        # installed shutdown().
        self._base_shutdown = self.shutdown
        del self.shutdown
コード例 #29
0
    def create(self, typname, version=None, server=None,
               res_desc=None, **ctor_args):
        """
        Create a new `typname` object in `server` or a new
        :class:`ObjectServer`.  Returns a proxy for the new object.
        Starts servers in a subdirectory of the current directory.

        typname: string
            Type of object to create. If null, then a proxy for the new
            :class:`ObjServer` is returned.

        version: string or None
            Version of `typname` to create.

        server: proxy
            :class:`ObjServer` on which to create `typname`.
            If none, then a new server is created.

        res_desc: dict or None
            Required resources. Currently not used.

        ctor_args: dict
            Other constructor arguments.
            If `name` or `allowed_users` are specified, they are used when
            creating the :class:`ObjServer`. If no `allowed_users` are
            specified, the server is private to the current user.
        """
        self._logger.info('create typname %r, version %r server %s,'
                          ' res_desc %s, args %s', typname, version, server,
                          res_desc, ctor_args)

        if server is None:
            name = ctor_args.get('name', '')
            if not name:
                name = 'Server_%d' % (len(self._managers) + 1)

            allowed_users = ctor_args.get('allowed_users')
            if not allowed_users:
                # Default to a server private to the current user.
                credentials = get_credentials()
                allowed_users = {credentials.user: credentials.public_key}
            else:
                del ctor_args['allowed_users']

            if self._address is None or \
               isinstance(self._address, basestring) or \
               self._allow_tunneling:
                # Local access only via pipe if factory accessed by pipe
                # or factory is accessed via tunnel.
                address = None
            else:
                # Network access via same IP as factory, system-selected port.
                address = (self._address[0], 0)

            manager = self.manager_class(address, self._authkey, name=name,
                                         allowed_users=allowed_users)
            # Pick a unique working directory for the server.
            root_dir = name
            count = 1
            while os.path.exists(root_dir):
                count += 1
                root_dir = '%s_%d' % (name, count)
            os.mkdir(root_dir)

            # On Windows, when running the full test suite under Nose,
            # starting the process starts a new Nose test session, which
            # will eventually get here and start a new Nose session, which...
            orig_main = None
            if sys.platform == 'win32':  #pragma no cover
                scripts = ('openmdao-script.py', 'openmdao_test-script.py')
                # '__main__' may lack __file__ (interactive/embedded use);
                # guard the access rather than die with AttributeError.
                try:
                    main_file = sys.modules['__main__'].__file__
                except AttributeError:
                    pass
                else:
                    if main_file.endswith(scripts):
                        orig_main = main_file
                        sys.modules['__main__'].__file__ = \
                            pkg_resources.resource_filename('openmdao.main',
                                                            'objserverfactory.py')
            owner = get_credentials()
            self._logger.log(LOG_DEBUG2, '%s starting server %r in dir %s',
                             owner, name, root_dir)
            try:
                manager.start(cwd=root_dir,
                              log_level=self._logger.getEffectiveLevel())
            finally:
                # Always restore the patched '__main__' filename.
                if orig_main is not None:  #pragma no cover
                    sys.modules['__main__'].__file__ = orig_main

            self._logger.info('new server %r for %s', name, owner)
            self._logger.info('    in dir %s', root_dir)
            self._logger.info('    listening on %s', manager.address)
            server_class = getattr(manager, self.server_classname)
            server = server_class(name=name, allow_shell=self._allow_shell,
                                  allowed_types=self._allowed_types)
            self._managers[server] = (manager, root_dir, owner)

        if typname:
            obj = server.create(typname, version, None, res_desc, **ctor_args)
        else:
            obj = server

        self._logger.log(LOG_DEBUG2, 'create returning %r at %r',
                         obj, obj._token.address)
        return obj
コード例 #30
0
    def _start(self):
        """
        Start evaluating cases concurrently: one worker thread per server
        (bounded by ``RAM.max_servers``), with workers reporting back via
        ``self._reply_q`` until the case iterator is exhausted and all
        servers are idle, then shut the servers down.
        """
        # Need credentials in case we're using a PublicKey server.
        credentials = get_credentials()

        # Determine maximum number of servers available.
        resources = {
            'required_distributions':self._egg_required_distributions,
            'orphan_modules':self._egg_orphan_modules,
            'python_version':sys.version[:3]}
        if self.extra_resources:
            resources.update(self.extra_resources)
        max_servers = RAM.max_servers(resources)
        self._logger.debug('max_servers %d', max_servers)
        if max_servers <= 0:
            msg = 'No servers supporting required resources %s' % resources
            self.raise_exception(msg, RuntimeError)

        # Kick off initial wave of cases.
        self._server_lock = threading.Lock()
        self._reply_q = Queue.Queue()
        self._generation += 1
        n_servers = 0
        while n_servers < max_servers:
            if not self._more_to_go():
                break

            # Get next case. Limits servers started if max_servers > cases.
            try:
                case = self._iter.next()
            except StopIteration:
                # NOTE(review): if StopIteration fires while self._rerun is
                # non-empty, execution falls through with 'case' holding its
                # previous value -- confirm that is intended.
                if not self._rerun:
                    self._iter = None
                    self._seqno = 0
                    break

            self._seqno += 1
            self._todo.append((case, self._seqno))

            # Start server worker thread.
            n_servers += 1
            name = '%s_%d_%d' % (self.name, self._generation, n_servers)
            self._logger.debug('starting worker for %r', name)
            self._servers[name] = None
            self._in_use[name] = True
            self._server_cases[name] = None
            self._server_states[name] = _EMPTY
            self._load_failures[name] = 0
            server_thread = threading.Thread(target=self._service_loop,
                                             args=(name, resources,
                                                   credentials, self._reply_q))
            server_thread.daemon = True
            try:
                server_thread.start()
            except thread.error:
                # Could not spawn another thread; proceed with what we have.
                self._logger.warning('worker thread startup failed for %r',
                                     name)
                self._in_use[name] = False
                break

            if sys.platform != 'win32':
                # Process any pending events.
                while self._busy():
                    try:
                        name, result, exc = self._reply_q.get(True, 0.01)
                    except Queue.Empty:
                        break  # Timeout.
                    else:
                        # Difficult to force startup failure.
                        if self._servers[name] is None:  #pragma nocover
                            self._logger.debug('server startup failed for %r',
                                               name)
                            self._in_use[name] = False
                        else:
                            self._in_use[name] = self._server_ready(name)

        if sys.platform == 'win32':  #pragma no cover
            # Don't start server processing until all servers are started,
            # otherwise we have egg removal issues.
            for name in self._in_use.keys():
                name, result, exc = self._reply_q.get()
                if self._servers[name] is None:
                    self._logger.debug('server startup failed for %r', name)
                    self._in_use[name] = False

            # Kick-off started servers.
            for name in self._in_use.keys():
                if self._in_use[name]:
                    self._in_use[name] = self._server_ready(name)

        # Continue until no servers are busy.
        while self._busy():
            if self._more_to_go():
                timeout = None
            else:
                # Don't wait indefinitely for a server we don't need.
                # This has happened with a server that got 'lost'
                # in RAM.allocate()
                timeout = 60
            try:
                name, result, exc = self._reply_q.get(timeout=timeout)
            # Hard to force worker to hang, which is handled here.
            except Queue.Empty:  #pragma no cover
                # Collect diagnostics on servers that never replied or are
                # stuck outside of load/execute, and stop using them.
                msgs = []
                for name, in_use in self._in_use.items():
                    if in_use:
                        try:
                            server = self._servers[name]
                            info = self._server_info[name]
                        except KeyError:
                            msgs.append('%r: no startup reply' % name)
                            self._in_use[name] = False
                        else:
                            state = self._server_states[name]
                            if state not in (_LOADING, _EXECUTING):
                                msgs.append('%r: %r %s %s'
                                            % (name, self._servers[name],
                                               state, self._server_info[name]))
                                self._in_use[name] = False
                if msgs:
                    self._logger.error('Timeout waiting with nothing left to do:')
                    for msg in msgs:
                        self._logger.error('    %s', msg)
            else:
                self._in_use[name] = self._server_ready(name)

        # Shut-down (started) servers.
        self._logger.debug('Shut-down (started) servers')
        # 'None' on a worker's queue is the sentinel telling it to stop.
        for queue in self._queues.values():
            queue.put(None)
        for i in range(len(self._queues)):
            try:
                name, status, exc = self._reply_q.get(True, 60)
            # Hard to force worker to hang, which is handled here.
            except Queue.Empty:  #pragma no cover
                pass
            else:
                if name in self._queues:  # 'Stale' worker can reply *late*.
                    del self._queues[name]
        # Hard to force worker to hang, which is handled here.
        for name in self._queues.keys():  #pragma no cover
            self._logger.warning('Timeout waiting for %r to shut-down.', name)
コード例 #31
0
    def create(self, typname, version=None, server=None,
               res_desc=None, **ctor_args):
        """
        Create a new `typname` object in `server` or a new
        :class:`ObjectServer`.  Returns a proxy for the new object.
        Starts servers in a subdirectory of the current directory.

        typname: string
            Type of object to create. If null, then a proxy for the new
            :class:`ObjServer` is returned.

        version: string or None
            Version of `typname` to create.

        server: proxy
            :class:`ObjServer` on which to create `typname`.
            If none, then a new server is created.

        res_desc: dict or None
            Required resources. ``working_directory`` is used to set a
            created server's directory, other keys are ignored.
            If `allow_shell` has been set, then an absolute directory
            reference may be used (including '~' expansion). If not, then
            the reference must be relative and the working directory will be
            relative to the factory's directory. If the directory already
            exists, a new name will be used of the form ``<directory>_N``

        ctor_args: dict
            Other constructor arguments.
            If `name` or `allowed_users` are specified, they are used when
            creating the :class:`ObjServer`. If no `allowed_users` are
            specified, the server is private to the current user.
        """
        self._logger.info('create typname %r, version %r server %s,'
                          ' res_desc %s, args %s', typname, version, server,
                          res_desc, ctor_args)

        if server is None:
            name = ctor_args.get('name', '')
            if not name:
                name = 'Server_%d' % (len(self._managers) + 1)

            allowed_users = ctor_args.get('allowed_users')
            if not allowed_users:
                # Default to a server private to the current user.
                credentials = get_credentials()
                allowed_users = {credentials.user: credentials.public_key}
            else:
                del ctor_args['allowed_users']

            if self._address is None or \
               isinstance(self._address, basestring) or \
               self._allow_tunneling:
                # Local access only via pipe if factory accessed by pipe
                # or factory is accessed via tunnel.
                address = None
            else:
                # Network access via same IP as factory, system-selected port.
                address = (self._address[0], 0)

            manager = self.manager_class(address, self._authkey, name=name,
                                         allowed_users=allowed_users)

            # Set (unique) working directory of server.
            # Server cleanup removes this directory, so we avoid any
            # existing directory to not delete existing files.
            base = None
            if res_desc is not None:
                base = res_desc.get('working_directory')
                if base:
                    if self._allow_shell:  # Absolute allowed.
                        base = os.path.expanduser(base)
                    elif os.path.isabs(base) or base.startswith('..'):
                        # Untrusted client: keep server dirs inside ours.
                        raise ValueError('working_directory %r must be subdirectory'
                                         % base)
                    # Consume the key so the server's create() doesn't see it.
                    res_desc = res_desc.copy()
                    del res_desc['working_directory']
            if not base:
                base = name
            count = 1
            root_dir = base
            while os.path.exists(root_dir):
                count += 1
                root_dir = '%s_%d' % (base, count)
            os.mkdir(root_dir)

            # On Windows, when running the full test suite under Nose,
            # starting the process starts a new Nose test session, which
            # will eventually get here and start a new Nose session, which...
            orig_main = None
            if sys.platform == 'win32':  #pragma no cover
                scripts = ('openmdao-script.py', 'openmdao_test-script.py')
                # '__main__' may lack __file__ (interactive/embedded use).
                try:
                    main_file = sys.modules['__main__'].__file__
                except AttributeError:
                    pass
                else:
                    if main_file.endswith(scripts):
                        orig_main = main_file
                        sys.modules['__main__'].__file__ = \
                            pkg_resources.resource_filename('openmdao.main',
                                                            'objserverfactory.py')
            owner = get_credentials()
            self._logger.log(LOG_DEBUG2, '%s starting server %r in dir %s',
                             owner, name, root_dir)
            try:
                manager.start(cwd=root_dir,
                              log_level=self._logger.getEffectiveLevel())
            finally:
                # Always restore the patched '__main__' filename.
                if orig_main is not None:  #pragma no cover
                    sys.modules['__main__'].__file__ = orig_main

            self._logger.info('new server %r for %s', name, owner)
            self._logger.info('    in dir %s', root_dir)
            self._logger.info('    listening on %s', manager.address)
            server_class = getattr(manager, self.server_classname)
            server = server_class(name=name, allow_shell=self._allow_shell,
                                  allowed_types=self._allowed_types)
            self._managers[server] = (manager, root_dir, owner)

        if typname:
            obj = server.create(typname, version, None, res_desc, **ctor_args)
        else:
            obj = server

        self._logger.log(LOG_DEBUG2, 'create returning %r at %r',
                         obj, obj._token.address)
        return obj
コード例 #32
0
    def start_manager(self, index, authkey, address, files, allow_shell=False):
        """
        Launch remote manager process via `ssh`.
        The environment variable ``OPENMDAO_KEEPDIRS`` can be used to avoid
        removal of the temporary directory used on the host.

        index: int
            Index in parent cluster.

        authkey: string
            Authorization key used to connect to host server.

        address: (ip_addr, port) or string referring to pipe.
            Address to use to connect back to parent.

        files: list(string)
            Files to be sent to support server startup.

        allow_shell: bool
            If True, :meth:`execute_command` and :meth:`load_model` are allowed
            in created servers. Use with caution!

        """
        try:
            self._check_ssh()
        except RuntimeError:
            self.state = 'failed'
            return

        # Copy bootstrap files to a temporary directory on the remote host.
        self.tempdir = self._copy_to_remote(files)
        if not self.tempdir:
            self.state = 'failed'
            return
        _LOGGER.debug('startup files copied to %s:%s',
                      self.hostname, self.tempdir)

        if self.tunnel_incoming:
            # Remote host can't connect back directly; route its callback
            # through an ssh reverse tunnel and remember how to tear it down.
            _LOGGER.debug('setup reverse tunnel from %s to %s:%s',
                          self.hostname, address[0], address[1])
            address, cleanup = \
                setup_reverse_tunnel(self.hostname, address[0], address[1],
                                     identity=self.identity_filename)
            self.reverse_cleanup = cleanup

        # Launch the remote bootstrap: chdir to tempdir and run
        # mp_distributing.main() there.
        cmd = self._ssh_cmd()
        cmd.extend([self.hostname, self.beforestart, self.python, '-c',
                   '"import sys;'
                   ' sys.path.append(\'.\');'
                   ' import os;'
                   ' os.chdir(\'%s\');'
                   ' from mp_distributing import main;'
                   ' main()"' % self.tempdir])
        self.proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

        # Only the current user may connect to the created servers.
        credentials = get_credentials()
        allowed_users = {credentials.user: credentials.public_key}

        # Tell the server what name to bind to
        # (in case it has multiple interfaces).
        # NOTE(review): assumes hostname has the form 'user@host'; raises
        # ValueError otherwise -- confirm callers guarantee this.
        user, remote_name = self.hostname.split('@')

        # Configuration for the remote bootstrap; 'BoostrappingHost' [sic]
        # is the name the remote side is given -- leave as-is.
        data = dict(
            name='BoostrappingHost', index=index, hostname=remote_name,
            # Avoid lots of SUBDEBUG messages.
            dist_log_level=max(_LOGGER.getEffectiveLevel(), logging.DEBUG),
            dir=self.tempdir, authkey=str(authkey), allowed_users=allowed_users,
            allow_shell=allow_shell, allow_tunneling=self.tunnel_outgoing,
            parent_address=address, registry=self.registry,
            keep_dirs=os.environ.get('OPENMDAO_KEEPDIRS', '0'))

        # Windows can't handle binary on stdin.
        dump = cPickle.dumps(data, cPickle.HIGHEST_PROTOCOL)
        dump = base64.b64encode(dump)
        _LOGGER.debug('sending %s config info (%s)', self.hostname, len(dump))
        self.proc.stdin.write(dump)
        self.proc.stdin.close()
        time.sleep(1)  # Give the proc time to register startup problems.
        self.poll()
        if self.state != 'failed':
            self.state = 'started'
コード例 #33
0
    def start_manager(self, index, authkey, address, files, allow_shell=False):
        """
        Launch remote manager process via `ssh`.
        The environment variable ``OPENMDAO_KEEPDIRS`` can be used to avoid
        removal of the temporary directory used on the host.

        index: int
            Index in parent cluster.

        authkey: string
            Authorization key used to connect to host server.

        address: (ip_addr, port) or string referring to pipe.
            Address to use to connect back to parent.

        files: list(string)
            Files to be sent to support server startup.

        allow_shell: bool
            If True, :meth:`execute_command` and :meth:`load_model` are allowed
            in created servers. Use with caution!
        """
        try:
            _check_ssh(self.hostname)
        except Exception:
            self.state = 'failed'
            return

        # Copy bootstrap files to a temporary directory on the remote host.
        self.tempdir = _copy_to_remote(self.hostname, files, self.python)
        _LOGGER.debug('startup files copied to %s:%s', self.hostname,
                      self.tempdir)
        # Launch the remote bootstrap: chdir to tempdir and run
        # mp_distributing.main() there.
        cmd = copy.copy(_SSH)
        cmd.extend([
            self.hostname, self.python, '-c',
            '"import sys;'
            ' sys.path.append(\'.\');'
            ' import os;'
            ' os.chdir(\'%s\');'
            ' from mp_distributing import main;'
            ' main()"' % self.tempdir
        ])
        self.proc = subprocess.Popen(cmd,
                                     stdin=subprocess.PIPE,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

        # Only the current user may connect to the created servers.
        credentials = get_credentials()
        allowed_users = {credentials.user: credentials.public_key}

        # Configuration pickled to the bootstrap's stdin.
        # 'BoostrappingHost' [sic] is the name the remote side is given.
        data = dict(
            name='BoostrappingHost',
            index=index,
            # Avoid lots of SUBDEBUG messages.
            dist_log_level=max(_LOGGER.getEffectiveLevel(), logging.DEBUG),
            dir=self.tempdir,
            authkey=str(authkey),
            allowed_users=allowed_users,
            allow_shell=allow_shell,
            parent_address=address,
            registry=self.registry,
            keep_dirs=os.environ.get('OPENMDAO_KEEPDIRS', '0'))
        cPickle.dump(data, self.proc.stdin, cPickle.HIGHEST_PROTOCOL)
        self.proc.stdin.close()
        # TODO: put timeout in accept() to avoid this hack.
        time.sleep(1)  # Give the proc time to register startup problems.
        self.poll()
        if self.state != 'failed':
            self.state = 'started'
コード例 #34
0
    def test_credentials(self):
        """ Exercise Credentials creation, comparison, and encode/verify. """
        logging.debug('')
        logging.debug('test_credentials')

        # Basic form.
        owner = Credentials()
        if sys.platform == 'win32' and not HAVE_PYWIN32:
            # Without pywin32 the credentials are marked transient.
            self.assertEqual('%s' % owner, owner.user+' (transient)')
        else:
            self.assertEqual('%s' % owner, owner.user)

        # Comparison.
        user = Credentials()
        self.assertEqual(user, owner)
        # NOTE(review): '******' looks like a redacted literal from the
        # original source -- any differing user name suffices here.
        user.user = '******'
        self.assertNotEqual(user, owner)
        self.assertNotEqual(user, 'xyzzy')  # Compare with non-Credentials.

        # Thread storage.
        try:
            del threading.current_thread().credentials  # Ensure empty.
        except AttributeError:
            pass
        self.assertEqual(get_credentials(), owner)

        # Sign/verify.
        encoded = owner.encode()
        Credentials.verify(encoded, allowed_users=None)  # 'First sighting'.
        Credentials.verify(encoded, allowed_users=None)  # Cached verification.
        data, signature, client_creds = encoded

        # Truncated data.
        encoded = (data[:1], signature, client_creds)
        assert_raises(self, 'Credentials.verify(encoded, None)',
                      globals(), locals(), CredentialsError, 'Invalid data')

        # Data altered after signing.
        encoded = (data[:-1], signature, client_creds)
        assert_raises(self, 'Credentials.verify(encoded, None)',
                      globals(), locals(), CredentialsError, 'Invalid signature')

        # Corrupted signature.
        encoded = (data, signature[:-1], client_creds)
        assert_raises(self, 'Credentials.verify(encoded, None)',
                      globals(), locals(), CredentialsError, 'Invalid signature')

        # Mangle the public-key section (third line of the data).
        newline = data.find('\n')  # .user
        newline = data.find('\n', newline+1)  # .transient
        # Expecting '-'
        mangled = data[:newline+1] + '*' + data[newline+2:]
        encoded = (mangled, signature, client_creds)
        assert_raises(self, 'Credentials.verify(encoded, None)',
                      globals(), locals(), CredentialsError, 'Invalid key')

        # Detect mismatched key.
        get_key_pair(owner.user, overwrite_cache=True)
        spook = Credentials()
        encoded = spook.encode()
        assert_raises(self, 'Credentials.verify(encoded, None)',
                      globals(), locals(), CredentialsError,
                      'Public key mismatch')

        # Check if remote access.
        self.assertFalse(remote_access())