def setUp(self):
        self.orig_dir = os.getcwd()
        os.chdir(TestCase.directory)
        try:
            # Force use of fake 'ssh' and 'scp'.
            ssh = ("python", os.path.join(_TST_ROOT, "ssh.py"), _DMZ_ROOT)
            scp = ("python", os.path.join(_TST_ROOT, "scp.py"), _DMZ_ROOT)

            self.orig_ssh = protocol.configure_ssh(ssh)
            self.orig_scp = protocol.configure_scp(scp)

            # Avoid lots of polling log entries.
            if logging.getLogger().getEffectiveLevel() < logging.DEBUG:
                logging.getLogger().setLevel(logging.DEBUG)

            # Start RJE server.
            hostname = socket.gethostname()
            self.proc = start_server(hostname)

            # Create NAS_Allocator referring to server.
            logging.debug("create allocator")
            self.allocator = NAS_Allocator()
            parser = ConfigParser.ConfigParser()
            section = self.allocator.name
            parser.add_section(section)
            parser.set(section, "dmz_host", hostname)
            parser.set(section, "server_host", hostname)
            self.allocator.configure(parser)

            # Add allocator to RAM.
            RAM.add_allocator(self.allocator)
        except Exception:
            os.chdir(self.orig_dir)
            raise
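A minimal sketch (not part of the original test) of driving the same NAS_Allocator configuration from an INI file instead of a hand-built parser; the filename, file contents, and the NAS_Allocator import path are assumptions.

import ConfigParser

from nas_access import NAS_Allocator  # import path is an assumption
from openmdao.main.resource import ResourceAllocationManager as RAM

parser = ConfigParser.ConfigParser()
parser.read('nas_allocator.cfg')  # hypothetical file with a [NAS_Allocator]
                                  # section defining dmz_host and server_host
allocator = NAS_Allocator()
allocator.configure(parser)
RAM.add_allocator(allocator)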
    def test_allocator(self):
        logging.debug('')
        logging.debug('test_allocator')

        # Since we're faking it with a remote LocalHost, we should match.
        local_servers = RAM.max_servers(dict(allocator='LocalHost'))
        max_servers = RAM.max_servers(dict(allocator=self.allocator.name))
        self.assertEqual(max_servers, local_servers)

        max_servers = RAM.max_servers(dict(allocator=self.allocator.name,
                                           localhost=True)) # Contradictory!
        self.assertEqual(max_servers, 0)

        server = self.allocator.deploy('test_server', {}, {})
        try:
            self.assertEqual(server.name, 'NAS_Allocator/test_server')
            self.assertEqual(server.host, socket.gethostname())
            self.assertTrue(server.pid > 0)
            retval = server.echo(123, 'twisty', 'narrow', 'passages')
            self.assertEqual(retval, (123, 'twisty', 'narrow', 'passages'))
            self.assertTrue(server.isdir('.'))
            self.assertEqual(sorted(server.listdir('.')),
                             ['openmdao_log.txt', 'stderr', 'stdout'])
        finally:
            self.allocator.release(server)
    def test_case_eval(self):
        logging.debug("")
        logging.debug("test_case_eval")

        # Run a fake job in style of CaseIteratorDriver.
        logging.debug("allocate server")
        server, server_info = RAM.allocate(dict(allocator=self.allocator.name))
        echo = set_as_top(Echo(1))
        egg_info = echo.save_to_egg("EchoTest", "1", need_requirements=False)
        egg_filename = egg_info[0]
        try:
            logging.debug("transfer egg")
            filexfer(None, egg_filename, server, egg_filename, "b")

            logging.debug("load model")
            tlo = server.load_model(egg_filename)

            logging.debug("set input")
            tlo.set("inp_0", 42)

            logging.debug("run")
            tlo.run()

            logging.debug("get output")
            output = tlo.get("out_0")
            self.assertEqual(output, 42)
        finally:
            os.remove(egg_filename)
            logging.debug("release")
            RAM.release(server)
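For reference, the filexfer() call above copies the egg from the local host to the allocated server: its arguments appear to be (source_server, source_path, destination_server, destination_path, mode), with None standing for the local host and 'b' selecting binary transfer.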
    def setUp(self):
        # Save existing RAM instance and force a rebuild.
        self.orig_ram = RAM._RAM
        RAM._RAM = None
        RAM.configure('')

        self.user = getpass.getuser()
        self.node = platform.node()
        self.name = self.node.replace('.', '_')
        self.python = find_python()
        self.cluster = None

        if sys.platform == 'win32' or self.user not in SSH_USERS:
            self.skip_ssh = True
        else:
            self.skip_ssh = False

        self.machines = []
        self.machines.append({'hostname': self.node,
                              'python': self.python})

        # Ensure we aren't held up by local host load problems.
        for allocator in RAM.list_allocators():
            if allocator.name == 'LocalHost':
                self.local = allocator
                self.local.max_load = 10
                break
        else:
            raise RuntimeError('No LocalHost allocator!?')
    def test_request(self):
        logging.debug('')
        logging.debug('test_request')

        assembly = Assembly()
        comp1 = assembly.add('comp1', ExtCode())
        comp2 = assembly.add('comp2', ExtCode())
        sub = assembly.add('sub', Assembly())
        comp3 = sub.add('comp3', ExtCode())

        comp1.resources = dict(min_cpus=10,
                               max_cpus=10,
                               resource_limits=dict(virtual_memory=100,
                                                    cpu_time=120),
                               rerunnable=True,
                               accounting_id='frobozz',
                               queue_name='debug',
                               job_category='MPI')

        comp2.resources = dict(max_cpus=2,
                               resource_limits=dict(wallclock_time=1000000))

        comp3.resources = dict(min_cpus=200,
                               resource_limits=dict(virtual_memory=20,
                                                    cpu_time=1000,
                                                    wallclock_time=500),
                               rerunnable=True,
                               accounting_id='frobozz',
                               queue_name='debug',
                               job_category='MPI')

        req = RAM.max_request(assembly)
        expected = dict(min_cpus=200,
                        max_cpus=200,
                        resource_limits=dict(virtual_memory=100,
                                             cpu_time=1000,
                                             wallclock_time=1000000))
        logging.debug('req: %r', req)
        logging.debug('exp: %r', expected)
        self.assertEqual(req, expected)

        req = RAM.total_request(assembly)
        expected = dict(min_cpus=200,
                        max_cpus=200,
                        resource_limits=dict(virtual_memory=100,
                                             cpu_time=1120,
                                             wallclock_time=1000500),
                        rerunnable=True,
                        accounting_id='frobozz',
                        queue_name='debug',
                        job_category='MPI')
        logging.debug('req: %r', req)
        logging.debug('exp: %r', expected)
        self.assertEqual(req, expected)

        comp3.resources['accounting_id'] = 'xyzzy'
        assert_raises(self, 'RAM.total_request(assembly)',
                      globals(), locals(), ValueError,
                      "Incompatible settings for 'accounting_id':"
                      " 'xyzzy' vs. 'frobozz'")
    def config_ram(self, filename):
        """
        Configure the :class:`ResourceAllocationManager` instance from
        `filename`. Used to define resources needed for model execution.
        """
        self._logger.debug('config_ram %r', filename)
        from openmdao.main.resource import ResourceAllocationManager
        ResourceAllocationManager.configure(filename)
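A usage sketch; the `server` object standing in for whatever holds this method is hypothetical, and the path is the default named in the objserverfactory usage text later in this section.

server.config_ram('~/.openmdao/resources.cfg')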
    def test_hostnames(self):
        logging.debug('')
        logging.debug('test_hostnames')

        hostnames = ResourceAllocationManager.get_hostnames({'n_cpus': 1})
        self.assertEqual(hostnames[0], platform.node())

        hostnames = ResourceAllocationManager.get_hostnames({'no_such_resource': 1})
        self.assertEqual(hostnames, None)
    def test_hostnames(self):
        logging.debug('')
        logging.debug('test_hostnames')

        hostnames = RAM.get_hostnames({'min_cpus': 1})
        self.assertEqual(hostnames[0], platform.node())

        hostnames = RAM.get_hostnames({'allocator': 'LocalHost',
                                       'localhost': False})
        self.assertEqual(hostnames, None)
    def setUp(self):
        nodes = []
        for i in range(12):
            nodes.append('g-0%02d' % i)

        # Start the fake MPI_Allocator.
        self.cluster = MPI_Allocator(name='test', machines=nodes)
        # Add it to the RAM.
        RAM.add_allocator(self.cluster)
    def _service_loop(self, name, resource_desc, credentials, reply_q):
        """ Each server has an associated thread executing this. """
        set_credentials(credentials)

        server, server_info = RAM.allocate(resource_desc)
        # Just being defensive, this should never happen.
        if server is None:  # pragma no cover
            self._logger.error('Server allocation for %r failed :-(', name)
            reply_q.put((name, False, None))
            return
        else:
            # Clear egg re-use indicator.
            server_info['egg_file'] = None
            self._logger.debug('%r using %r', name, server_info['name'])
            if self._logger.level == logging.NOTSET:
                # By default avoid lots of protocol messages.
                server.set_log_level(logging.DEBUG)
            else:
                server.set_log_level(self._logger.level)

        request_q = Queue.Queue()

        try:
            with self._server_lock:
                sdata = self._servers[name]
                sdata.server = server
                sdata.info = server_info
                sdata.queue = request_q

            reply_q.put((name, True, None))  # ACK startup.

            while True:
                request = request_q.get()
                if request is None:
                    break
                try:
                    result = request[0](request[1])
                except Exception as req_exc:
                    self._logger.error('%r: %s caused %r', name,
                                       request[0], req_exc)
                    result = None
                else:
                    req_exc = None
                reply_q.put((name, result, req_exc))
        except Exception as exc:  # pragma no cover
            # This can easily happen if we take a long time to allocate and
            # we get 'cleaned-up' before we get started.
            if self._server_lock is not None:
                self._logger.error('%r: %r', name, exc)
        finally:
            self._logger.debug('%r releasing server', name)
            RAM.release(server)
            reply_q.put((name, True, None))  # ACK shutdown.
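A sketch, from the coordinating side, of the queue protocol this loop implements; the 'worker_1' name and the run_case/case objects are hypothetical. Each request is a (callable, argument) pair, each reply a (name, result, exception) triple, and None asks the worker to shut down.

sdata = self._servers['worker_1']     # hypothetical per-server record
sdata.queue.put((run_case, case))     # worker calls request[0](request[1])
name, result, exc = reply_q.get()     # reply triple for that request
sdata.queue.put(None)                 # ask the worker to exit
name, ack, _ = reply_q.get()          # shutdown ACK: (name, True, None)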
    def _execute_remote(self):
        """
        Allocate a server based on required resources, send inputs,
        run command, and retrieve results.
        """
        # Allocate server.
        self._server, server_info = RAM.allocate(self.resources)
        if self._server is None:
            self.raise_exception('Server allocation failed :-(', RuntimeError)

        return_code = -88888888
        error_msg = ''
        try:
            # Send inputs.
            patterns = []
            for metadata in self.external_files:
                if metadata.get('input', False):
                    patterns.append(metadata.path)
            if patterns:
                self._send_inputs(patterns)
            else:
                self._logger.debug("No input metadata paths")

            # Run command.
            self._logger.info("executing '%s'...", self.command)
            start_time = time.time()
            return_code, error_msg = \
                self._server.execute_command(self.command, self.stdin,
                                             self.stdout, self.stderr,
                                             self.env_vars, self.poll_delay,
                                             self.timeout)
            et = time.time() - start_time
            if et >= 60:  #pragma no cover
                self._logger.info('elapsed time: %f sec.', et)

            # Retrieve results.
            patterns = []
            for metadata in self.external_files:
                if metadata.get('output', False):
                    patterns.append(metadata.path)
            if patterns:
                self._retrieve_results(patterns)
            else:
                self._logger.debug("No output metadata paths")

        finally:
            RAM.release(self._server)
            self._server = None

        return (return_code, error_msg)
def rundlcs():
    """
    Run the whole process, including startup and shutdown.

    To do:
    - parse input
    - create load cases
    - create app assembly
    - create dispatcher
    - send cases and app to dispatcher
    - run cases
    - collect and save output
    """

    options, arg = get_options()
    ctrl = parse_input(options)
    # ctrl will be just the input, but broken up into separate categories, e.g.
    # ctrl.cases, ctrl.app, ctrl.dispatch, ...

    # work in progress; running efficiently at NREL.
    if options.cluster_allocator:
        from PeregrineClusterAllocator import ClusterAllocator
        cluster = ClusterAllocator()
        RAM.insert_allocator(0, cluster)

    ###  using "factory" functions to create specific subclasses (e.g. distinguish between FAST and HAWC2)
    # Then we use these to create the cases...
    cases = create_run_cases(ctrl.cases, options)
    # and a turbine---never used this "stub"
#    turbine = create_turbine(ctrl.turbine)
    # and the appropriate wind code wrapper...
    aerocode = create_aerocode_wrapper(ctrl.aerocode, ctrl.output, options)
    # and the appropriate dispatcher...
    dispatcher = create_dlc_dispatcher(ctrl.dispatcher)
    ### After this point everything should be generic, all appropriate subclass objects created
    # # # # # # # # # # #

    dispatcher.presetup_workflow(aerocode, cases)  # just makes sure parts are there when configure() is called
    dispatcher.configure()
    # Now tell the dispatcher to (setup and ) run the cases using the aerocode on the turbine.
    # calling configure() is done inside run(). but now it is done already (above), too.

    # norun does not write directories, but it does set us up to process them if they already exist
    if not options.norun:
        print "calling run"
        dispatcher.run()

    # TODO:  more complexity will be needed for difference between "run now" and "run later" cases.
    dispatcher.collect_output(ctrl.output)
def rundlcs():
    """
    Run the whole process, including startup and shutdown.

    To do:
    - parse input
    - create load cases
    - create app assembly
    - create dispatcher
    - send cases and app to dispatcher
    - run cases
    - collect and save output
    """

    options, arg = get_options()
    ctrl = parse_input(options.main_input, options)
    # ctrl will be just the input, but broken up into separate categories, e.g.
    # ctrl.cases, ctrl.app, ctrl.dispatch, ...

    if options.cluster_allocator:
        cluster = ClusterAllocator()
        RAM.insert_allocator(0, cluster)

    ###  using "factory" functions to create specific subclasses (e.g. distinguish between FAST and HAWC2)
    # Then we use these to create the cases...
    cases = create_load_cases(ctrl.cases, options)
    # and a turbine
    turbine = create_turbine(ctrl.turbine)
    # and the appropriate wind code wrapper...
    aerocode = create_aerocode_wrapper(ctrl.aerocode, options)
    # and the appropriate dispatcher...
    dispatcher = create_dlc_dispatcher(ctrl.dispatcher)
    ### After this point everything should be generic, all appropriate subclass objects created

    dispatcher.presetup_workflow(aerocode, turbine, cases)  # just makes sure parts are there when configure() is called
    dispatcher.configure()
    # Now tell the dispatcher to (setup and ) run the cases using the aerocode on the turbine.
    # calling configure() is done inside run().

    if not options.norun:
        dispatcher.run()

    # TODO:  more complexity will be needed for difference between "run now" and "run later" cases.
    dispatcher.collect_output(ctrl.output)
    sctx = sampler.Context()
    
    field_idx = 20   # = RootMyc1Std
    final_load_calc(sctx, "dlcproto.out", not dispatcher.raw_cases, field_idx)
def run_suite(resource_desc=None, name=None):
    """ Run suite of tests using `resource_desc` and record under `name`. """
    resource_desc = resource_desc or {}
    name = name or ''
    print '\n%s' % name

    initial = 0.01
    limit = 20
    results = {}

    max_servers = ResourceAllocationManager.max_servers(resource_desc)
    print 'max servers', max_servers

    model = CID()
    model.driver.reload_model = False
    model.driver.sequential = False

    # Save to an egg to avoid analysis overhead during run_test().
    print '\nInitializing egg module analysis'
    template = Case(inputs=[('sleeper.delay', None, 0.01)])
    model.driver.iterator = Iterator(template)
    model.driver.recorders = [Recorder(model.driver.iterator, 1000)]
    start = time.time()
    egg_filename, required_distributions, orphan_modules = \
        model.save_to_egg('caseperf', '0')
    et = time.time() - start
    print '    done in %.2f' % et
    os.remove(egg_filename)

    print
    results = run_test(model, initial, limit, max_servers)
    record_results(results, name)
    def setUp(self):
        self.user = getpass.getuser()
        self.node = platform.node()
        self.name = self.node.replace('.', '_')
        self.python = find_python()
        self.cluster = None

        if sys.platform == 'win32' or self.user not in SSH_USERS:
            self.skip_ssh = True
        else:
            self.skip_ssh = False

        self.machines = []
        if self.node.startswith('gxterm'):
            # User environment assumed OK on this GRC cluster front-end.
            for i in range(1, 55):
                self.machines.append({'hostname':'gx%02d' % i,
                                      'python':self.python})
        else:
            self.machines.append({'hostname':self.node,
                                  'python':self.python})

        # Ensure we aren't held up by local host load problems.
        for allocator in ResourceAllocationManager.list_allocators():
            if allocator.name == 'LocalHost':
                self.local = allocator
                self.local.max_load = 10
                break
        else:
            raise RuntimeError('No LocalHost allocator!?')
    def get_ram(self):
        """
        Returns the :class:`ResourceAllocationManager` instance.
        Used by :meth:`ResourceAllocationManager.add_remotes`.
        """
        from openmdao.main.resource import ResourceAllocationManager
        return ResourceAllocationManager._get_instance()
def setup_cluster(encrypted=True):
    """ Use openmdao.testing.cluster.init_cluster, but fix 'max_load'. """
    name = init_cluster(encrypted, allow_shell=True)
    for allocator in ResourceAllocationManager.list_allocators():
        if allocator.name == 'LocalHost':
            allocator.max_load = 1.
    return name
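Usage sketch for the helper above: with LocalHost's max_load pinned to 1., local capacity won't mask the cluster scheduling behavior under test, and the returned name selects the cluster by the 'allocator' key as the tests do elsewhere.

name = setup_cluster(encrypted=False)
server, info = ResourceAllocationManager.allocate({'allocator': name})
# ... use the server, then release it ...
ResourceAllocationManager.release(server)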
    def get_ram(self):
        """
        Returns the :class:`ResourceAllocationManager` instance.
        Used by :meth:`ResourceAllocationManager.add_remotes`.
        """
        from openmdao.main.resource import ResourceAllocationManager
        return ResourceAllocationManager.get_instance()
def InitializeCluster(hostnames, pydir, identity_filename=None):
    print 'Connecting to cluster...'
    machines = []

    for host in hostnames:
        machines.append(ClusterHost(
            hostname=host,
            python=pydir,
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

    # Somewhat dangerous: this automatically adds the host key to known_hosts.
    _SSH.extend(['-o', 'StrictHostKeyChecking=no'])
    cluster = ClusterAllocator('PCCCluster', machines, allow_shell=True)
    RAM.insert_allocator(0, cluster)
    print 'Servers connected on cluster:', cluster.max_servers({})[0]
    global UseCluster
    UseCluster = True
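Usage sketch (hostnames, interpreter path, and key file are placeholders):

InitializeCluster(['node01', 'node02'], '/usr/local/bin/python',
                  identity_filename='~/.ssh/id_rsa')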
    def tearDown(self):
        try:
            logging.debug("remove")
            RAM.remove_allocator(self.allocator.name)

            if self.proc is not None:
                logging.debug("shutdown")
                self.allocator.shutdown()
                self.proc.terminate()
            else:
                self.allocator.invalidate()

            # Restore 'ssh' and 'scp' configuration.
            protocol.configure_ssh(self.orig_ssh)
            protocol.configure_scp(self.orig_scp)

            time.sleep(2)
            for name in (_RJE_ROOT, _DMZ_ROOT):
                if os.path.exists(name):
                    shutil.rmtree(name)
        finally:
            os.chdir(self.orig_dir)
def init_cluster(encrypted=True, clean_dir=True, allow_shell=False):
    """
    If not already done, initializes the ResourceAllocationManager and
    adds a cluster using encrypted or unencrypted communication.
    Returns the name of the configured cluster.
    """
    authkey = 'PublicKey' if encrypted else 'AuthKey'
    allocators = ResourceAllocationManager.list_allocators()

    if len(allocators) == 1:
        local = ResourceAllocationManager.get_allocator(0)
        if local.max_load < 10:  # First time we've been called.
            # Ensure we aren't held up by local host load problems.
            local.max_load = 10

            if clean_dir:
                # Remove any local allocator-created directories.
                for path in glob.glob('Sim-*'):
                    shutil.rmtree(path, onerror=onerror)

    node = platform.node()
    name = '%s_%s' % (node.replace('.', '_'), authkey)
    for allocator in allocators:
        if allocator.name == name:
            return name  # Don't add multiple copies.

    machines = []
    python = sys.executable

    if node.startswith('gxterm'):
        # User environment assumed OK on this GRC cluster front-end.
        # Using less than full machine (55 nodes) to allow multiple
        # cluster testing without hitting limit on open files (sockets).
        for i in range(20):
            machines.append({'hostname': 'gx%02d' % i, 'python': python})
    elif local_ssh_available():
        machines.append({'hostname': node, 'python': python})

    if machines:
        cluster = ClusterAllocator(name, machines, authkey, allow_shell)
        ResourceAllocationManager.insert_allocator(0, cluster)
        return name
    elif not encrypted:
        # Create a LocalAllocator so we have *something*.
        name = 'LocalUnencrypted'
        for allocator in allocators:
            if allocator.name == name:
                return name  # Don't add multiple copies.
        local = LocalAllocator(name, authkey=authkey, allow_shell=allow_shell)
        ResourceAllocationManager.insert_allocator(0, local)
        return name
    return None
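For example, on a host named 'crunch.example.com' (hypothetical), init_cluster(encrypted=True) registers and returns 'crunch_example_com_PublicKey' per the '%s_%s' naming above; the returned name then selects that cluster in a resource description.

name = init_cluster(encrypted=True)
if name is not None:
    server, info = ResourceAllocationManager.allocate({'allocator': name})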
    def setup(self, replicate=True):
        """
        Setup to begin new run.

        replicate: bool
             If True, then replicate the model and save to an egg file
             first (for concurrent evaluation).
        """
        self._cleanup(remove_egg=replicate)

        if not self.sequential:
            if replicate or self._egg_file is None:
                # Save model to egg.
                # Must do this before creating any locks or queues.
                self._replicants += 1
                version = 'replicant.%d' % (self._replicants)

                # If only local host will be used, we can skip determining
                # distributions required by the egg.
                allocators = RAM.list_allocators()
                need_reqs = False
                if not self.ignore_egg_requirements:
                    for allocator in allocators:
                        if not isinstance(allocator, LocalAllocator):
                            need_reqs = True
                            break

                driver = self.parent.driver
                self.parent.add('driver', Driver()) # this driver will execute the workflow once
                self.parent.driver.workflow = self.workflow
                try:
                    #egg_info = self.model.save_to_egg(self.model.name, version)
                    # FIXME: what name should we give to the egg?
                    egg_info = self.parent.save_to_egg(self.name, version,
                                                    need_requirements=need_reqs)
                finally:
                    self.parent.driver = driver

                self._egg_file = egg_info[0]
                self._egg_required_distributions = egg_info[1]
                self._egg_orphan_modules = [name for name, path in egg_info[2]]

        self._iter = self.get_case_iterator()
        self._seqno = 0
    def run_serial(self):
        """
        Run serial version of ADPAC. Runs on remote host if there's more
        than just the local allocator.
        """
        try:
            allocator = RAM.get_allocator(1)
        except IndexError:
            self.resources = {}
        else:
            self.resources = {'n_cpus': 1}

        self.command = [self.serial_adpac]
        if not self.idissf:
            self.command.append('-d')
        if self.irevs:
            self.command.append('-r')

        self.stdin  = self.input.casename+'.input'
        self.stdout = self.input.casename+'.output'
        self.stderr = ExternalCode.STDOUT
        super(ADPAC, self).execute()
    def setUp(self):
        self.user = getpass.getuser()
        self.node = platform.node()
        self.name = self.node.replace('.', '_')
        self.python = find_python()
        self.cluster = None

        if sys.platform == 'win32' or self.user not in SSH_USERS:
            self.skip_ssh = True
        else:
            self.skip_ssh = False

        self.machines = []
        self.machines.append({'hostname':self.node,
                              'python':self.python})

        # Ensure we aren't held up by local host load problems.
        for allocator in ResourceAllocationManager.list_allocators():
            if allocator.name == 'LocalHost':
                self.local = allocator
                self.local.max_load = 10
                break
        else:
            raise RuntimeError('No LocalHost allocator!?')
    def test_resources(self):
        logging.debug('')
        logging.debug('test_resources')

        result = ResourceAllocationManager.allocate({'localhost':False})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'exclude':[platform.node()]})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'n_cpus':1000000})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'orphan_modules':['xyzzy']})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'python_version':'xyzzy'})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'xyzzy':None})
        self.assertEqual(result, (None, None))
def main():  #pragma no cover
    """
    OpenMDAO factory service process.

    Usage: python objserverfactory.py [--allow-public][--allow-shell]
        [--hosts=filename][--types=filename][--users=filename]
        [--address=address][--port=number][--prefix=name][--tunnel]
        [--resources=filename][--log-host=hostname][--log-port=number]
        [--log-prefix=string]

    --allow-public:
        Allows access by anyone from any allowed host. Use with care!

    --allow-shell:
        Allows access to :meth:`execute_command` and :meth:`load_model`.
        Use with care!

    --hosts: string
        Filename for allowed hosts specification. Default ``hosts.allow``.
        Ignored if '--users' is specified.
        The file should contain IPv4 host addresses, IPv4 domain addresses,
        or hostnames, one per line. Blank lines are ignored, and '#' marks the
        start of a comment which continues to the end of the line.
        For security reasons this file must be accessible only by the user
        running this server.

    --types: string
        Filename for allowed types specification.
        If not specified then allow types listed by
        :meth:`factorymanager.get_available_types`.
        The file should contain one type name per line.

    --users: string
        Filename for allowed users specification.
        Ignored if '--allow-public' is specified.
        Default is ``~/.ssh/authorized_keys``, other files should be of the
        same format: each line has ``key-type public-key-data user@host``,
        where `user` is the username on `host`. `host` will be translated to an
        IPv4 address and included in the allowed hosts list.
        Note that this ``user@host`` form is not necessarily enforced by
        programs which generate keys.
        For security reasons this file must be accessible only by the user
        running this server.

    --address: string
        IPv4 address, hostname, or pipe name.
        Default is the host's default IPv4 address.

    --port: int
        Server port (default of 0 implies next available port).
        Note that ports below 1024 typically require special privileges.
        If port is negative, then a local pipe is used for communication.

    --prefix: string
        Prefix for configuration and stdout/stderr files (default ``server``).

    --tunnel:
        Report host IP address but listen for connections from a local
        SSH tunnel.

    --resources: string
        Filename for resource configuration. If not specified then the
        default of ``~/.openmdao/resources.cfg`` will be used.

    --log-host: string
        Hostname to send remote log messages to.

    --log-port: int
        Port on `log-host` to send remote log messages to.

    --log-prefix: string
        Prefix to apply to remote log messages. Default is ``pid@host``.

    If ``prefix.key`` exists, it is read for an authorization key string.
    Otherwise public key authorization and encryption is used.

    Allowed hosts *must* be specified if `port` is >= 0. Only allowed hosts
    may connect to the server.

    Once initialized ``prefix.cfg`` is written with address, port, and
    public key information.
    """
    parser = optparse.OptionParser()
    parser.add_option('--address', action='store', type='str',
                      help='Network address to serve.')
    parser.add_option('--allow-public', action='store_true', default=False,
                      help='Allows access by any user, use with care!')
    parser.add_option('--allow-shell', action='store_true', default=False,
                      help='Allows potential shell access, use with care!')
    parser.add_option('--hosts', action='store', type='str',
                      default='hosts.allow', help='Filename for allowed hosts')
    parser.add_option('--types', action='store', type='str',
                      help='Filename for allowed types')
    parser.add_option('--users', action='store', type='str',
                      default='~/.ssh/authorized_keys',
                      help='Filename for allowed users')
    parser.add_option('--port', action='store', type='int', default=0,
                      help='Server port (0 implies next available port)')
    parser.add_option('--prefix', action='store', default='server',
                      help='Prefix for config and stdout/stderr files')
    parser.add_option('--tunnel', action='store_true', default=False,
                      help='Report host IP address but listen for connections'
                           ' from a local SSH tunnel')
    parser.add_option('--resources', action='store', type='str',
                      default=None, help='Filename for resource configuration')
    parser.add_option('--log-host', action='store', type='str',
                      default=None, help='hostname for remote log messages')
    parser.add_option('--log-port', action='store', type='int',
                      default=None, help='port for remote log messages')
    parser.add_option('--log-prefix', action='store', type='str',
                      default=None, help='prefix for remote log messages')

    options, arguments = parser.parse_args()
    if arguments:
        parser.print_help()
        sys.exit(1)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    if options.log_host and options.log_port:
        install_remote_handler(options.log_host, int(options.log_port),
                               options.log_prefix)

    server_key = options.prefix+'.key'
    server_cfg = options.prefix+'.cfg'
    global _SERVER_CFG
    _SERVER_CFG = server_cfg

    # Get authkey.
    authkey = 'PublicKey'
    try:
        with open(server_key, 'r') as inp:
            authkey = inp.readline().strip()
        os.remove(server_key)
    except IOError:
        pass

    if options.allow_shell:
        msg = 'Shell access is ALLOWED'
        logger.warning(msg)
        print msg

    allowed_users = None
    allowed_hosts = None

    # Get allowed_users.
    if options.allow_public:
        allowed_users = None
        msg = 'Public access is ALLOWED'
        logger.warning(msg)
        print msg

        if options.port >= 0:
            # Get allowed_hosts.
            if os.path.exists(options.hosts):
                try:
                    allowed_hosts = read_allowed_hosts(options.hosts)
                except Exception as exc:
                    msg = "Can't read allowed hosts file %r: %s" \
                          % (options.hosts, exc)
                    logger.error(msg)
                    print msg
                    sys.exit(1)
            else:
                msg = 'Allowed hosts file %r does not exist.' % options.hosts
                logger.error(msg)
                print msg
                sys.exit(1)

            if not allowed_hosts:
                msg = 'No hosts in allowed hosts file %r.' % options.hosts
                logger.error(msg)
                print msg
                sys.exit(1)
    else:
        if os.path.exists(options.users):
            try:
                allowed_users = read_authorized_keys(options.users, logger)
            except Exception as exc:
                msg = "Can't read allowed users file %r: %s" \
                      % (options.users, exc)
                logger.error(msg)
                print msg
                sys.exit(1)
        else:
            msg = 'Allowed users file %r does not exist.' % options.users
            logger.error(msg)
            print msg
            sys.exit(1)

        if not allowed_users:
            msg = 'No users in allowed users file %r.' % options.users
            logger.error(msg)
            print msg
            sys.exit(1)

    # Get allowed_types.
    allowed_types = None
    if options.types:
        if os.path.exists(options.types):
            allowed_types = []
            with open(options.types, 'r') as inp:
                line = inp.readline()
                while line:
                    line = line.strip()
                    if line:
                        allowed_types.append(line)
                    line = inp.readline()
        else:
            msg = 'Allowed types file %r does not exist.' % options.types
            logger.error(msg)
            print msg
            sys.exit(1)

    # Optionally configure resources.
    if options.resources:
        # Import here to avoid import loop.
        from openmdao.main.resource import ResourceAllocationManager as RAM
        RAM.configure(options.resources)

    # Get address and create manager.
    if options.port >= 0:
        if options.address:  # Specify IPv4/hostname.
            address = (options.address, options.port)
        else:
            address = (platform.node(), options.port)
    else:
        if options.address:  # Specify pipename.
            address = options.address
        else:
            address = None

    logger.info('Starting FactoryManager %s %r', address, keytype(authkey))
    current_process().authkey = authkey
    bind_address = ('127.0.0.1', options.port) if options.tunnel else address
    manager = _FactoryManager(bind_address, authkey, name='Factory',
                              allowed_hosts=allowed_hosts,
                              allowed_users=allowed_users,
                              allow_tunneling=options.tunnel)

    # Set defaults for created ObjServerFactories.
    # There isn't a good method to propagate these through the manager.
    ObjServerFactory._address = address
    ObjServerFactory._allow_shell = options.allow_shell
    ObjServerFactory._allowed_types = allowed_types
    ObjServerFactory._allow_tunneling = options.tunnel

    # Get server, retry if specified address is in use.
    server = None
    retries = 0
    while server is None:
        try:
            server = manager.get_server()
        except socket.error as exc:
            if str(exc).find('Address already in use') >= 0:
                if retries < 10:
                    msg = 'Address %s in use, retrying...' % (address,)
                    logger.debug(msg)
                    print msg
                    time.sleep(5)
                    retries += 1
                else:
                    msg = 'Address %s in use, too many retries.' % (address,)
                    logger.error(msg)
                    print msg
                    sys.exit(1)
            else:
                raise

    # Record configuration.
    real_ip = None if address is None else address[0]
    write_server_config(server, _SERVER_CFG, real_ip)
    msg = 'Serving on %s' % (server.address,)
    logger.info(msg)
    print msg
    sys.stdout.flush()

    # And away we go...
    signal.signal(signal.SIGTERM, _sigterm_handler)
    try:
        server.serve_forever()
    finally:
        _cleanup()
    sys.exit(0)
    def _execute_remote(self):
        """
        Allocate a server based on required resources, send inputs,
        run command, and retrieve results.
        """
        rdesc = self.resources.copy()

        # Allocate server.
        self._server, server_info = RAM.allocate(rdesc)
        if self._server is None:
            self.raise_exception('Server allocation failed :-(', RuntimeError)

        if self._logger.level == logging.NOTSET:
            # By default avoid lots of protocol messages.
            self._server.set_log_level(logging.DEBUG)
        else:
            self._server.set_log_level(self._logger.level)

        return_code = -88888888
        error_msg = ''
        try:
            # Create resource description for command.
            rdesc['job_name'] = self.get_pathname()
            rdesc['remote_command'] = self.command[0]
            if len(self.command) > 1:
                rdesc['args'] = self.command[1:]
            if self.env_vars:
                rdesc['job_environment'] = self.env_vars
            if not self.stdin:
                self.raise_exception('Remote execution requires stdin of'
                                     ' DEV_NULL or filename, got %r'
                                     % self.stdin, ValueError)
            if self.stdin != self.DEV_NULL:
                rdesc['input_path'] = self.stdin
            if self.stdout:
                rdesc['output_path'] = self.stdout
            else:
                rdesc['output_path'] = '%s.stdout' % self.command[0]
            if self.stderr:
                if self.stderr == self.STDOUT:
                    rdesc['join_files'] = True
                else:
                    rdesc['error_path'] = self.stderr
            else:
                rdesc['error_path'] = '%s.stderr' % self.command[0]
            if self.timeout:
                if 'resource_limits' in rdesc:
                    limits = rdesc['resource_limits'].copy()
                else:
                    limits = {}
                limits['wallclock_time'] = self.timeout
                rdesc['resource_limits'] = limits

            # Send inputs.
            patterns = []
            textfiles = []
            for metadata in self.external_files:
                if metadata.get('input', False):
                    patterns.append(metadata.path)
                    if not metadata.binary:
                        textfiles.append(metadata.path)
            for pathname, obj in self.items(iotype='in', recurse=True):
                if isinstance(obj, FileRef):
                    local_path = self.get_metadata(pathname, 'local_path')
                    if local_path:
                        patterns.append(local_path)
                        if not obj.binary:
                            textfiles.append(local_path)
            if self.stdin and self.stdin != self.DEV_NULL:
                patterns.append(self.stdin)
                textfiles.append(self.stdin)
            if patterns:
                self._send_inputs(patterns, textfiles)
            else:
                self._logger.debug('No input files')

            # Run command.
            self._logger.info('executing %s...', self.command)
            start_time = time.time()
            return_code, error_msg = \
                self._server.execute_command(rdesc)
            et = time.time() - start_time
            if et >= 60:  #pragma no cover
                self._logger.info('elapsed time: %.1f sec.', et)

            # Retrieve results.
            patterns = []
            textfiles = []
            for metadata in self.external_files:
                if metadata.get('output', False):
                    patterns.append(metadata.path)
                    if not metadata.binary:
                        textfiles.append(metadata.path)
            for pathname, obj in self.items(iotype='out', recurse=True):
                if isinstance(obj, FileRef):
                    patterns.append(obj.path)
                    if not obj.binary:
                        textfiles.append(obj.path)
            patterns.append(rdesc['output_path'])
            textfiles.append(rdesc['output_path'])
            if self.stderr != self.STDOUT:
                patterns.append(rdesc['error_path'])
                textfiles.append(rdesc['error_path'])
            self._retrieve_results(patterns, textfiles)

            # Echo stdout if not redirected.
            if not self.stdout:
                name = rdesc['output_path']
                if os.path.exists(name):
                    with open(name, 'rU') as inp:
                        sys.stdout.write(inp.read())
                    os.remove(name)
                else:
                    sys.stdout.write('\n[No stdout available]\n')

            # Echo stderr if not redirected.
            if not self.stderr:
                name = rdesc['error_path']
                if os.path.exists(name):
                    with open(name, 'rU') as inp:
                        sys.stderr.write(inp.read())
                    os.remove(name)
                else:
                    sys.stderr.write('\n[No stderr available]\n')
        finally:
            RAM.release(self._server)
            self._server = None

        return (return_code, error_msg)
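For a hypothetical component with command ['mycode', '-v'], stdin set to DEV_NULL, no stdout/stderr redirection, no env_vars, and timeout 300, the branches above would extend self.resources to roughly:

rdesc = dict(self.resources,
             job_name=self.get_pathname(),
             remote_command='mycode',
             args=['-v'],
             output_path='mycode.stdout',  # default '%s.stdout' % command[0]
             error_path='mycode.stderr',   # default '%s.stderr' % command[0]
             resource_limits={'wallclock_time': 300})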
    def _start(self):
        """ Start evaluating cases concurrently. """
        # Need credentials in case we're using a PublicKey server.
        credentials = get_credentials()

        # Determine maximum number of servers available.
        resources = {
            'required_distributions':self._egg_required_distributions,
            'orphan_modules':self._egg_orphan_modules,
            'python_version':sys.version[:3]}
        if self.extra_resources:
            resources.update(self.extra_resources)
        max_servers = RAM.max_servers(resources)
        self._logger.debug('max_servers %d', max_servers)
        if max_servers <= 0:
            msg = 'No servers supporting required resources %s' % resources
            self.raise_exception(msg, RuntimeError)

        # Kick off initial wave of cases.
        self._server_lock = threading.Lock()
        self._reply_q = Queue.Queue()
        self._generation += 1
        n_servers = 0
        while n_servers < max_servers:
            if not self._more_to_go():
                break

            # Get next case. Limits servers started if max_servers > cases.
            try:
                case = self._iter.next()
            except StopIteration:
                if not self._rerun:
                    self._iter = None
                    self._seqno = 0
                    break

            self._seqno += 1
            self._todo.append((case, self._seqno))

            # Start server worker thread.
            n_servers += 1
            name = '%s_%d_%d' % (self.name, self._generation, n_servers)
            self._logger.debug('starting worker for %r', name)
            self._servers[name] = None
            self._in_use[name] = True
            self._server_cases[name] = None
            self._server_states[name] = _EMPTY
            self._load_failures[name] = 0
            server_thread = threading.Thread(target=self._service_loop,
                                             args=(name, resources,
                                                   credentials, self._reply_q))
            server_thread.daemon = True
            try:
                server_thread.start()
            except thread.error:
                self._logger.warning('worker thread startup failed for %r',
                                     name)
                self._in_use[name] = False
                break

            if sys.platform != 'win32':
                # Process any pending events.
                while self._busy():
                    try:
                        name, result, exc = self._reply_q.get(True, 0.01)
                    except Queue.Empty:
                        break  # Timeout.
                    else:
                        # Difficult to force startup failure.
                        if self._servers[name] is None:  #pragma nocover
                            self._logger.debug('server startup failed for %r',
                                               name)
                            self._in_use[name] = False
                        else:
                            self._in_use[name] = self._server_ready(name)

        if sys.platform == 'win32':  #pragma no cover
            # Don't start server processing until all servers are started,
            # otherwise we have egg removal issues.
            for name in self._in_use.keys():
                name, result, exc = self._reply_q.get()
                if self._servers[name] is None:
                    self._logger.debug('server startup failed for %r', name)
                    self._in_use[name] = False

            # Kick-off started servers.
            for name in self._in_use.keys():
                if self._in_use[name]:
                    self._in_use[name] = self._server_ready(name)

        # Continue until no servers are busy.
        while self._busy():
            if self._more_to_go():
                timeout = None
            else:
                # Don't wait indefinitely for a server we don't need.
                # This has happened with a server that got 'lost'
                # in RAM.allocate()
                timeout = 60
            try:
                name, result, exc = self._reply_q.get(timeout=timeout)
            # Hard to force worker to hang, which is handled here.
            except Queue.Empty:  #pragma no cover
                msgs = []
                for name, in_use in self._in_use.items():
                    if in_use:
                        try:
                            server = self._servers[name]
                            info = self._server_info[name]
                        except KeyError:
                            msgs.append('%r: no startup reply' % name)
                            self._in_use[name] = False
                        else:
                            state = self._server_states[name]
                            if state not in (_LOADING, _EXECUTING):
                                msgs.append('%r: %r %s %s'
                                            % (name, self._servers[name],
                                               state, self._server_info[name]))
                                self._in_use[name] = False
                if msgs:
                    self._logger.error('Timeout waiting with nothing left to do:')
                    for msg in msgs:
                        self._logger.error('    %s', msg)
            else:
                self._in_use[name] = self._server_ready(name)

        # Shut-down (started) servers.
        self._logger.debug('Shut-down (started) servers')
        for queue in self._queues.values():
            queue.put(None)
        for i in range(len(self._queues)):
            try:
                name, status, exc = self._reply_q.get(True, 60)
            # Hard to force worker to hang, which is handled here.
            except Queue.Empty:  #pragma no cover
                pass
            else:
                if name in self._queues:  # 'Stale' worker can reply *late*.
                    del self._queues[name]
        # Hard to force worker to hang, which is handled here.
        for name in self._queues.keys():  #pragma no cover
            self._logger.warning('Timeout waiting for %r to shut-down.', name)
    def _setup(self):
        """ Setup to begin new run. """
        if not self.sequential:
            # Save model to egg.
            # Must do this before creating any locks or queues.
            self._replicants += 1
            version = 'replicant.%d' % (self._replicants)

            # If only local host will be used, we can skip determining
            # distributions required by the egg.
            allocators = RAM.list_allocators()
            need_reqs = False
            if not self.ignore_egg_requirements:
                for allocator in allocators:
                    if not isinstance(allocator, LocalAllocator):
                        need_reqs = True
                        break

            # Replicate and mutate model to run our workflow once.
            # Originally this was done in-place, but that 'invalidated'
            # various workflow quantities.
            replicant = self.parent.copy()
            workflow = replicant.get(self.name+'.workflow')
            driver = replicant.add('driver', Driver())
            workflow.parent = driver
            workflow.scope = None
            replicant.driver.workflow = workflow
            egg_info = replicant.save_to_egg(self.name, version,
                                             need_requirements=need_reqs)
            replicant = workflow = driver = None  # Release objects.
            gc.collect()  # Collect/compact before possible fork.

            self._egg_file = egg_info[0]
            self._egg_required_distributions = egg_info[1]
            self._egg_orphan_modules = [name for name, path in egg_info[2]]

        inp_paths = []
        inp_values = []
        for path, param in self.get_parameters().items():
            if isinstance(path, tuple):
                path = path[0]  # Use first target of ParameterGroup.
            path = make_legal_path(path)
            value = self.get('case_inputs.'+path)
            for target in param.targets:
                inp_paths.append(target)
                inp_values.append(value)

        outputs = self.get_responses().keys()
        extra_outputs = self.workflow._rec_outputs

        length = len(inp_values[0]) if inp_values else 0
        cases = []
        for i in range(length):
            inputs = []
            for j in range(len(inp_paths)):
                inputs.append((inp_paths[j], inp_values[j][i]))
            cases.append(_Case(i, inputs, outputs, extra_outputs,
                               parent_uuid=self._case_uuid))
        self.init_responses(length)

        self._iter = iter(cases)
        self._abort_exc = None