Example #1
    def test_allocator(self):
        logging.debug('')
        logging.debug('test_allocator')

        # Since we're faking it with a remote LocalHost, we should match.
        local_servers = RAM.max_servers(dict(allocator='LocalHost'))
        max_servers = RAM.max_servers(dict(allocator=self.allocator.name))
        self.assertEqual(max_servers, local_servers)

        max_servers = RAM.max_servers(dict(allocator=self.allocator.name,
                                           localhost=True)) # Contradictory!
        self.assertEqual(max_servers, 0)

        server = self.allocator.deploy('test_server', {}, {})
        try:
            self.assertEqual(server.name, 'NAS_Allocator/test_server')
            self.assertEqual(server.host, socket.gethostname())
            self.assertTrue(server.pid > 0)
            retval = server.echo(123, 'twisty', 'narrow', 'passages')
            self.assertEqual(retval, (123, 'twisty', 'narrow', 'passages'))
            self.assertTrue(server.isdir('.'))
            self.assertEqual(sorted(server.listdir('.')),
                             ['openmdao_log.txt', 'stderr', 'stdout'])
        finally:
            self.allocator.release(server)
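
The deploy/release pattern above mirrors the generic RAM allocation API. For reference, a minimal allocate/use/release round-trip looks like the sketch below; the import path and helper name are assumptions, not taken from the test.

# Minimal sketch (not from the test): allocate a server through RAM,
# use it, and always release it in a finally block.
from openmdao.main.resource import ResourceAllocationManager as RAM

def run_on_any_server(resource_desc):
    """ Allocate a server matching `resource_desc`, echo through it, release. """
    server, info = RAM.allocate(resource_desc)
    try:
        print 'allocated', server.name, 'on', server.host, 'pid', server.pid
        return server.echo('ping')
    finally:
        RAM.release(server)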
Example #2
    def _start(self):
        """ Start evaluating cases concurrently. """
        # Need credentials in case we're using a PublicKey server.
        credentials = get_credentials()

        # Determine maximum number of servers available.
        resources = {
            'required_distributions': self._egg_required_distributions,
            'orphan_modules': self._egg_orphan_modules,
            'python_version': sys.version[:3]
        }
        if self.extra_resources:
            resources.update(self.extra_resources)
        max_servers = RAM.max_servers(resources)
        self._logger.debug('max_servers %d', max_servers)
        if max_servers <= 0:
            msg = 'No servers supporting required resources %s' % resources
            self.raise_exception(msg, RuntimeError)

        # Kick off initial wave of cases.
        self._server_lock = threading.Lock()
        self._reply_q = Queue.Queue()
        self._generation += 1
        n_servers = 0
        while n_servers < max_servers:
            if not self._more_to_go():
                break

            # Get next case. Limits servers started if max_servers > cases.
            try:
                case = self._iter.next()
            except StopIteration:
                if not self._rerun:
                    self._iter = None
                    self._seqno = 0
                    break

            self._seqno += 1
            self._todo.append((case, self._seqno))

            # Start server worker thread.
            n_servers += 1
            name = '%s_%d_%d' % (self.name, self._generation, n_servers)
            self._logger.debug('starting worker for %r', name)
            self._servers[name] = None
            self._in_use[name] = True
            self._server_cases[name] = None
            self._server_states[name] = _EMPTY
            self._load_failures[name] = 0
            server_thread = threading.Thread(target=self._service_loop,
                                             args=(name, resources,
                                                   credentials, self._reply_q))
            server_thread.daemon = True
            try:
                server_thread.start()
            except thread.error:
                self._logger.warning('worker thread startup failed for %r',
                                     name)
                self._in_use[name] = False
                break

            if sys.platform != 'win32':
                # Process any pending events.
                while self._busy():
                    try:
                        name, result, exc = self._reply_q.get(True, 0.01)
                    except Queue.Empty:
                        break  # Timeout.
                    else:
                        # Difficult to force startup failure.
                        if self._servers[name] is None:  #pragma no cover
                            self._logger.debug('server startup failed for %r',
                                               name)
                            self._in_use[name] = False
                        else:
                            self._in_use[name] = self._server_ready(name)

        if sys.platform == 'win32':  #pragma no cover
            # Don't start server processing until all servers are started,
            # otherwise we have egg removal issues.
            for name in self._in_use.keys():
                name, result, exc = self._reply_q.get()
                if self._servers[name] is None:
                    self._logger.debug('server startup failed for %r', name)
                    self._in_use[name] = False

            # Kick-off started servers.
            for name in self._in_use.keys():
                if self._in_use[name]:
                    self._in_use[name] = self._server_ready(name)

        # Continue until no servers are busy.
        while self._busy():
            if self._more_to_go():
                timeout = None
            else:
                # Don't wait indefinitely for a server we don't need.
                # This has happened with a server that got 'lost'
                # in RAM.allocate()
                timeout = 60
            try:
                name, result, exc = self._reply_q.get(timeout=timeout)
            # Hard to force worker to hang, which is handled here.
            except Queue.Empty:  #pragma no cover
                msgs = []
                for name, in_use in self._in_use.items():
                    if in_use:
                        try:
                            server = self._servers[name]
                            info = self._server_info[name]
                        except KeyError:
                            msgs.append('%r: no startup reply' % name)
                            self._in_use[name] = False
                        else:
                            state = self._server_states[name]
                            if state not in (_LOADING, _EXECUTING):
                                msgs.append('%r: %r %s %s'
                                            % (name, self._servers[name],
                                               state, self._server_info[name]))
                                self._in_use[name] = False
                if msgs:
                    self._logger.error('Timeout waiting with nothing left to do:')
                    for msg in msgs:
                        self._logger.error('    %s', msg)
            else:
                self._in_use[name] = self._server_ready(name)

        # Shut-down (started) servers.
        self._logger.debug('Shut-down (started) servers')
        for queue in self._queues.values():
            queue.put(None)
        for i in range(len(self._queues)):
            try:
                name, status, exc = self._reply_q.get(True, 60)
            # Hard to force worker to hang, which is handled here.
            except Queue.Empty:  #pragma no cover
                pass
            else:
                if name in self._queues:  # 'Stale' worker can reply *late*.
                    del self._queues[name]
        # Hard to force worker to hang, which is handled here.
        for name in self._queues.keys():  #pragma no cover
            self._logger.warning('Timeout waiting for %r to shut-down.', name)
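
The _start() method above is organized around a reply queue: every worker thread posts (name, result, exc) tuples to a shared Queue, and the main thread drains it until no worker is busy. A stripped-down, framework-free sketch of that pattern follows; the function and variable names here are illustrative.

import Queue
import threading

def service_loop(name, reply_q):
    """ Simplified stand-in for _service_loop(): do work, then report back. """
    try:
        result = 'done'  # Real workers load model eggs and run cases here.
        reply_q.put((name, result, None))
    except Exception as exc:
        reply_q.put((name, None, exc))

def run_workers(names):
    """ Start one daemon worker per name and drain the reply queue. """
    reply_q = Queue.Queue()
    for name in names:
        worker = threading.Thread(target=service_loop, args=(name, reply_q))
        worker.daemon = True  # Mirrors server_thread.daemon = True above.
        worker.start()
    busy = set(names)
    while busy:  # Analogous to 'while self._busy():' above.
        name, result, exc = reply_q.get(timeout=60)
        busy.discard(name)
        print name, '->', exc if exc is not None else result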
Example #4
def run_suite(resource_desc=None, name=None):
    """ Run suite of tests using `resource_desc` and resord under `name`. """
    resource_desc = resource_desc or {}
    name = name or ''
    print '\n%s' % name

    initial = 0.01
    limit = 20
    results = {}

    max_servers = ResourceAllocationManager.max_servers(resource_desc)
    print 'max servers', max_servers

    model = CID()
    model.driver.reload_model = False
    model.driver.sequential = False

    # Save to an egg to avoid analysis overhead during run_test().
    print '\nInitializing egg module analysis'
    template = Case(inputs=[('sleeper.delay', None, 0.01)])
    model.driver.iterator = Iterator(template)
    model.driver.recorders = [Recorder(model.driver.iterator, 1000)]
    start = time.time()
    egg_filename, required_distributions, orphan_modules = \
        model.save_to_egg('caseperf', '0')
    et = time.time() - start
    print '    done in %.2f' % et
    os.remove(egg_filename)

    print
    results = run_test(model, initial, limit, max_servers)
    record_results(results, name)
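
record_results() is referenced but not defined in this example. A hypothetical stand-in (the name, file, and format below are illustrative, not the original implementation) could simply append the timing results under the suite name:

# Hypothetical stand-in for record_results(); the real implementation is
# not shown in this example.
import json

def record_results(results, name):
    """ Append `results` under `name` to a JSON-lines log. """
    with open('results.jsonl', 'a') as out:
        out.write(json.dumps({'name': name, 'results': results}) + '\n')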
Example #7
def main():
    """ Configure a cluster and use it. """
    enable_console(logging.DEBUG)
    logging.getLogger().setLevel(0)
    print 'Client PID', os.getpid()

    # Configure cluster.
    cluster_name = 'EC2Cluster'
    machines = []
    if USE_EC2:
        # The identity file used to access EC2 via ssh.
        identity_filename = os.path.expanduser('~/.ssh/lovejoykey')
        identity_filename += '.ppk' if sys.platform == 'win32' else '.pem'

        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.04.529682' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.03.113077' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.05.434412' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.20.17.379627' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/Scripts/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.19.49.348885' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/Scripts/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

    #        machines.append(ClusterHost(
    #            hostname='viper.grc.nasa.gov',
    #            python='OpenMDAO-Framework/devenv/bin/python',
    #            tunnel_incoming=True, tunnel_outgoing=True,
    #            identity_filename=None))
    else:
        # Trivial local 'cluster' for debugging without remote host issues.
        machines.append(
            ClusterHost(hostname=socket.getfqdn(), python=sys.executable))
    #        machines.append(ClusterHost(
    #            hostname='viper.grc.nasa.gov',
    #            python='OpenMDAO-Framework/devenv/bin/python',
    #            tunnel_incoming=True, tunnel_outgoing=True,
    #            identity_filename=None))

    # Start it.
    cluster = ClusterAllocator(cluster_name,
                               machines,
                               allow_shell=True,
                               method='load-average')
    #                               method='greedy')
    #                               method='round-robin')
    print 'Cluster initialized'
    RAM.insert_allocator(0, cluster)

    n_servers = RAM.max_servers(dict(allocator=cluster_name))
    print n_servers, 'Servers:'
    for name in RAM.get_hostnames(
            dict(allocator=cluster_name, min_cpus=n_servers)):
        print '   ', name

    # Create model.
    top = GPOptimization()

    # Configure DOE.
    top.driver.sequential = False  # Run concurrently across cluster.
    top.driver.reload_model = False
    # Force use of only cluster hosts by adding this requirement.
    top.driver.extra_resources = dict(allocator=cluster_name)
    # This is necessary more often than it should be.
    top.driver.ignore_egg_requirements = True

    # Perform the optimization.
    top.run()
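
The five ClusterHost entries above differ only in hostname and Python path. When maintaining such a script, the machine list can be built from (hostname, python) pairs using the same ClusterHost signature; the helper name below is illustrative.

def make_hosts(specs, identity_filename):
    """ Build ClusterHost objects from (hostname, python) pairs. """
    return [ClusterHost(hostname=hostname,
                        python=python,
                        tunnel_incoming=True, tunnel_outgoing=True,
                        identity_filename=identity_filename)
            for hostname, python in specs]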
Example #9
    def test_remote(self):
        logging.debug('')
        logging.debug('test_remote')

        # Start remote server.
        server_dir = 'Factory'
        if os.path.exists(server_dir):
            shutil.rmtree(server_dir, onerror=onerror)
        os.mkdir(server_dir)
        os.chdir(server_dir)
        try:
            server, server_cfg = start_server()
            cfg = read_server_config(server_cfg)
            factory = None
            try:
                factory = connect(cfg['address'], cfg['port'],
                                  pubkey=cfg['key'])
                prefix = RAM._make_prefix(factory.host)
                remote = '%s_LocalHost' % prefix

                # Show no remotes currently in RAM.
                allocator_names = \
                    [allocator.name for allocator in RAM.list_allocators()]
                logging.debug('%s', allocator_names)
                self.assertFalse(remote in allocator_names)

                # Add remote server's allocator.
                RAM.add_remotes(factory)
                allocator_names = \
                    [allocator.name for allocator in RAM.list_allocators()]
                logging.debug('%s', allocator_names)
                self.assertTrue(remote in allocator_names)
                self.assertFalse(
                    RAM.get_allocator(remote) is RAM.list_allocators()[0])
                self.assertTrue(
                    RAM.get_allocator(remote) is RAM.list_allocators()[1])

                # Max servers.
                max_servers = RAM.max_servers(dict(allocator=remote))
                self.assertTrue(max_servers >= 0)  # Avoid host load issues.

                remote_alloc = RAM.get_allocator(remote)

                max_servers, info = \
                    remote_alloc.max_servers(dict(localhost=True))
                self.assertEqual(max_servers, 0)
                self.assertEqual(info, dict(localhost='requested local host'))

                max_servers, info = \
                    remote_alloc.max_servers(dict(allocator='LocalHost'))
                self.assertEqual(max_servers, 0)
                self.assertEqual(info, dict(allocator='wrong allocator'))

                estimate, info = \
                    remote_alloc.time_estimate(dict(allocator='LocalHost'))
                self.assertEqual(estimate, -2)
                self.assertEqual(info, dict(allocator='wrong allocator'))

                # Allocate, release.
                remote_server, info = RAM.allocate(dict(allocator=remote))
                RAM.release(remote_server)

                # Remove remote allocators.
                allocator_names = \
                    [allocator.name for allocator in RAM.list_allocators()]
                for name in allocator_names:
                    if name.startswith(prefix):
                        RAM.remove_allocator(name)
                allocator_names = \
                    [allocator.name for allocator in RAM.list_allocators()]
                logging.debug('%s', allocator_names)
                self.assertFalse(remote in allocator_names)

            finally:
                if factory is not None:
                    factory.cleanup()
                server.terminate(timeout=10)
        finally:
            os.chdir('..')
            shutil.rmtree(server_dir, onerror=onerror)

        # Access local RAM in manner it would be accessed in the server.
        self.assertEqual(RAM._get_instance().get_total_allocators(), 1)
        self.assertTrue(RAM._get_instance().get_allocator_proxy(0)
                        is RAM.list_allocators()[0])
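
Condensed, the remote-allocator lifecycle this test verifies is: start a server, connect a factory, add its allocators to RAM, allocate and release through the remote, then remove the allocators by prefix and clean up. A sketch using the same helpers the test imports:

server, server_cfg = start_server()
cfg = read_server_config(server_cfg)
factory = connect(cfg['address'], cfg['port'], pubkey=cfg['key'])
try:
    RAM.add_remotes(factory)  # Remote allocators now appear in RAM.
    prefix = RAM._make_prefix(factory.host)
    remote_server, info = RAM.allocate(dict(allocator='%s_LocalHost' % prefix))
    RAM.release(remote_server)
    for allocator in RAM.list_allocators():  # Remove remotes by name prefix.
        if allocator.name.startswith(prefix):
            RAM.remove_allocator(allocator.name)
finally:
    factory.cleanup()
    server.terminate(timeout=10)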