def setUp(self):
    """
    Build the fixture: fake 'ssh'/'scp', a started RJE server and a
    NAS_Allocator registered with RAM.

    Runs from `TestCase.directory`.  If any step raises, the original
    working directory is restored before the exception propagates.
    """
    self.orig_dir = os.getcwd()
    os.chdir(TestCase.directory)
    try:
        # Force use of fake 'ssh' and 'scp', keeping whatever the
        # configure calls return in orig_ssh/orig_scp.
        fake_ssh = ("python", os.path.join(_TST_ROOT, "ssh.py"), _DMZ_ROOT)
        fake_scp = ("python", os.path.join(_TST_ROOT, "scp.py"), _DMZ_ROOT)
        self.orig_ssh = protocol.configure_ssh(fake_ssh)
        self.orig_scp = protocol.configure_scp(fake_scp)

        # Avoid lots of polling log entries.
        root_logger = logging.getLogger()
        if root_logger.getEffectiveLevel() < logging.DEBUG:
            root_logger.setLevel(logging.DEBUG)

        # Start RJE server on this host.
        this_host = socket.gethostname()
        self.proc = start_server(this_host)

        # Create NAS_Allocator referring to server: both the DMZ host and
        # the server host are configured as this machine.
        logging.debug("create allocator")
        self.allocator = NAS_Allocator()
        config = ConfigParser.ConfigParser()
        section = self.allocator.name
        config.add_section(section)
        for option in ("dmz_host", "server_host"):
            config.set(section, option, this_host)
        self.allocator.configure(config)

        # Add allocator to RAM.
        RAM.add_allocator(self.allocator)
    except Exception:
        os.chdir(self.orig_dir)
        raise
def test_allocator(self):
    """
    Check server counts reported through RAM and exercise a server
    deployed directly from the allocator.
    """
    logging.debug('')
    logging.debug('test_allocator')

    # Since we're faking it with a remote LocalHost, we should match.
    expected_count = RAM.max_servers({'allocator': 'LocalHost'})
    actual_count = RAM.max_servers({'allocator': self.allocator.name})
    self.assertEqual(actual_count, expected_count)

    # Contradictory request, so no servers are expected.
    contradictory = {'allocator': self.allocator.name, 'localhost': True}
    self.assertEqual(RAM.max_servers(contradictory), 0)

    server = self.allocator.deploy('test_server', {}, {})
    try:
        self.assertEqual(server.name, 'NAS_Allocator/test_server')
        self.assertEqual(server.host, socket.gethostname())
        self.assertTrue(server.pid > 0)

        echo_args = (123, 'twisty', 'narrow', 'passages')
        self.assertEqual(server.echo(*echo_args), echo_args)

        self.assertTrue(server.isdir('.'))
        listing = sorted(server.listdir('.'))
        self.assertEqual(listing, ['openmdao_log.txt', 'stderr', 'stdout'])
    finally:
        # Always hand the server back, even when an assertion fails.
        self.allocator.release(server)
def test_case_eval(self):
    """
    Run a fake job in the style of CaseIteratorDriver on an allocated server.

    An `Echo` model is saved to an egg, the egg is transferred to the
    server, loaded and run there, and ``out_0`` is checked to equal the
    value set on ``inp_0``.

    The server is released even if egg creation, a remote step, or egg
    removal fails; the egg file is removed once it has been created.
    """
    logging.debug("")
    logging.debug("test_case_eval")

    # Run a fake job in style of CaseIteratorDriver.
    logging.debug("allocate server")
    server, server_info = RAM.allocate(dict(allocator=self.allocator.name))
    try:
        # Egg creation happens after allocation, so it lives inside the
        # outer try: a save_to_egg() failure must not leak the server.
        echo = set_as_top(Echo(1))
        egg_info = echo.save_to_egg("EchoTest", "1", need_requirements=False)
        egg_filename = egg_info[0]
        try:
            logging.debug("transfer egg")
            filexfer(None, egg_filename, server, egg_filename, "b")

            logging.debug("load model")
            tlo = server.load_model(egg_filename)

            logging.debug("set input")
            tlo.set("inp_0", 42)

            logging.debug("run")
            tlo.run()

            logging.debug("get output")
            output = tlo.get("out_0")
            self.assertEqual(output, 42)
        finally:
            os.remove(egg_filename)
    finally:
        # Runs even when os.remove() above raises.
        logging.debug("release")
        RAM.release(server)
def setUp(self):
    """
    Force a fresh RAM, record user/host details and relax the LocalHost
    allocator's load limit.

    Raises RuntimeError if RAM has no allocator named 'LocalHost'.
    """
    # Save existing RAM instance and force a rebuild.
    self.orig_ram = RAM._RAM
    RAM._RAM = None
    RAM.configure('')

    self.user = getpass.getuser()
    self.node = platform.node()
    self.name = self.node.replace('.', '_')
    self.python = find_python()
    self.cluster = None

    # ssh-based tests are skipped on Windows and for users not in SSH_USERS.
    self.skip_ssh = (sys.platform == 'win32' or
                     self.user not in SSH_USERS)

    self.machines = [{'hostname': self.node, 'python': self.python}]

    # Ensure we aren't held up by local host load problems.
    for candidate in RAM.list_allocators():
        if candidate.name != 'LocalHost':
            continue
        self.local = candidate
        self.local.max_load = 10
        break
    else:
        raise RuntimeError('No LocalHost allocator!?')
def test_request(self):
    """
    Check RAM.max_request() and RAM.total_request() on a small assembly
    tree, then check that conflicting 'accounting_id' values are rejected
    by total_request() with a ValueError.
    """
    logging.debug('')
    logging.debug('test_request')

    # NOTE: 'assembly' must keep this name; it is evaluated by string
    # in the assert_raises() call at the end.
    assembly = Assembly()
    comp1 = assembly.add('comp1', ExtCode())
    comp2 = assembly.add('comp2', ExtCode())
    sub = assembly.add('sub', Assembly())
    comp3 = sub.add('comp3', ExtCode())

    comp1.resources = {
        'min_cpus': 10,
        'max_cpus': 10,
        'resource_limits': {'virtual_memory': 100, 'cpu_time': 120},
        'rerunnable': True,
        'accounting_id': 'frobozz',
        'queue_name': 'debug',
        'job_category': 'MPI',
    }
    comp2.resources = {
        'max_cpus': 2,
        'resource_limits': {'wallclock_time': 1000000},
    }
    comp3.resources = {
        'min_cpus': 200,
        'resource_limits': {'virtual_memory': 20,
                            'cpu_time': 1000,
                            'wallclock_time': 500},
        'rerunnable': True,
        'accounting_id': 'frobozz',
        'queue_name': 'debug',
        'job_category': 'MPI',
    }

    # Maximum request over all components.
    req = RAM.max_request(assembly)
    expected = {
        'min_cpus': 200,
        'max_cpus': 200,
        'resource_limits': {'virtual_memory': 100,
                            'cpu_time': 1000,
                            'wallclock_time': 1000000},
    }
    logging.debug('req: %r', req)
    logging.debug('exp: %r', expected)
    self.assertEqual(req, expected)

    # Total request over all components.
    req = RAM.total_request(assembly)
    expected = {
        'min_cpus': 200,
        'max_cpus': 200,
        'resource_limits': {'virtual_memory': 100,
                            'cpu_time': 1120,
                            'wallclock_time': 1000500},
        'rerunnable': True,
        'accounting_id': 'frobozz',
        'queue_name': 'debug',
        'job_category': 'MPI',
    }
    logging.debug('req: %r', req)
    logging.debug('exp: %r', expected)
    self.assertEqual(req, expected)

    # Mismatched accounting ids cannot be combined.
    comp3.resources['accounting_id'] = 'xyzzy'
    assert_raises(self, 'RAM.total_request(assembly)',
                  globals(), locals(), ValueError,
                  "Incompatible settings for 'accounting_id':"
                  " 'xyzzy' vs. 'frobozz'")
def setUp(self):
    """
    Prepare a NAS_Allocator test environment in `TestCase.directory`.

    Installs the fake 'ssh' and 'scp' commands, starts the RJE server on
    this host, configures a NAS_Allocator to use it and adds the allocator
    to RAM.  On any failure the original working directory is restored
    and the exception re-raised.
    """
    self.orig_dir = os.getcwd()
    os.chdir(TestCase.directory)
    try:
        # Force use of fake 'ssh' and 'scp'.
        ssh_cmd = ('python', os.path.join(_TST_ROOT, 'ssh.py'), _DMZ_ROOT)
        scp_cmd = ('python', os.path.join(_TST_ROOT, 'scp.py'), _DMZ_ROOT)
        self.orig_ssh = protocol.configure_ssh(ssh_cmd)
        self.orig_scp = protocol.configure_scp(scp_cmd)

        # Avoid lots of polling log entries.
        top_logger = logging.getLogger()
        if top_logger.getEffectiveLevel() < logging.DEBUG:
            top_logger.setLevel(logging.DEBUG)

        # Start RJE server.
        host = socket.gethostname()
        self.proc = start_server(host)

        # Create NAS_Allocator referring to server.
        logging.debug('create allocator')
        self.allocator = NAS_Allocator()
        section = self.allocator.name
        cfg = ConfigParser.ConfigParser()
        cfg.add_section(section)
        cfg.set(section, 'dmz_host', host)
        cfg.set(section, 'server_host', host)
        self.allocator.configure(cfg)

        # Add allocator to RAM.
        RAM.add_allocator(self.allocator)
    except Exception:
        os.chdir(self.orig_dir)
        raise
def test_case_eval(self):
    """
    Run a fake job in the style of CaseIteratorDriver on an allocated server.

    An `Echo` model is saved to an egg, the egg is transferred to the
    server, loaded and run there, and ``out_0`` is checked to equal the
    value set on ``inp_0``.

    The server is released even if egg creation, a remote step, or egg
    removal fails; the egg file is removed once it has been created.
    """
    logging.debug('')
    logging.debug('test_case_eval')

    # Run a fake job in style of CaseIteratorDriver.
    logging.debug('allocate server')
    server, server_info = RAM.allocate(dict(allocator=self.allocator.name))
    try:
        # Egg creation happens after allocation, so it lives inside the
        # outer try: a save_to_egg() failure must not leak the server.
        echo = set_as_top(Echo(1))
        egg_info = echo.save_to_egg('EchoTest', '1', need_requirements=False)
        egg_filename = egg_info[0]
        try:
            logging.debug('transfer egg')
            filexfer(None, egg_filename, server, egg_filename, 'b')

            logging.debug('load model')
            tlo = server.load_model(egg_filename)

            logging.debug('set input')
            tlo.set('inp_0', 42)

            logging.debug('run')
            tlo.run()

            logging.debug('get output')
            output = tlo.get('out_0')
            self.assertEqual(output, 42)
        finally:
            os.remove(egg_filename)
    finally:
        # Runs even when os.remove() above raises.
        logging.debug('release')
        RAM.release(server)
def config_ram(self, filename):
    """
    Configure the :class:`ResourceAllocationManager` instance from
    `filename`.  Used to define resources needed for model execution.

    filename: string
        Passed unchanged to ``ResourceAllocationManager.configure()``.
    """
    self._logger.debug('config_ram %r', filename)
    # Imported at call time, after the debug message has been logged.
    from openmdao.main.resource import ResourceAllocationManager as _RAM
    _RAM.configure(filename)
def test_hostnames(self):
    """
    get_hostnames() should list this node first for a one-CPU request
    and return None for a request naming an unknown resource.
    """
    logging.debug('')
    logging.debug('test_hostnames')

    satisfiable = dict(n_cpus=1)
    found = ResourceAllocationManager.get_hostnames(satisfiable)
    self.assertEqual(found[0], platform.node())

    unknown = dict(no_such_resource=1)
    found = ResourceAllocationManager.get_hostnames(unknown)
    self.assertEqual(found, None)
def test_hostnames(self):
    """
    get_hostnames() should list this node first for a one-CPU minimum and
    return None when 'LocalHost' is requested with localhost=False.
    """
    logging.debug('')
    logging.debug('test_hostnames')

    satisfiable = dict(min_cpus=1)
    found = RAM.get_hostnames(satisfiable)
    self.assertEqual(found[0], platform.node())

    contradictory = dict(allocator='LocalHost', localhost=False)
    found = RAM.get_hostnames(contradictory)
    self.assertEqual(found, None)
def setUp(self):
    """
    Create an MPI_Allocator named 'test' over twelve fake node names
    ('g-000' .. 'g-011') and add it to RAM.
    """
    node_names = ['g-0%02d' % index for index in range(12)]

    # Start the fake MPI_Allocator.
    self.cluster = MPI_Allocator(name='test', machines=node_names)

    # Add it to the RAM.
    RAM.add_allocator(self.cluster)
def _service_loop(self, name, resource_desc, credentials, reply_q):
    """
    Each server has an associated thread executing this.

    Allocates a server, publishes it in ``self._servers[name]`` and then
    services requests from a private queue until ``None`` is received.

    name: string
        Key into ``self._servers``; also the first element of every tuple
        put on `reply_q`.

    resource_desc: dict
        Passed to ``RAM.allocate()``.

    credentials:
        Passed to ``set_credentials()`` before anything else is done.

    reply_q:
        Queue receiving ``(name, result, exception)`` tuples.  Startup is
        acknowledged with ``(name, True, None)`` (or ``(name, False, None)``
        if allocation failed), each request is answered with its result
        and exception (``None`` if it succeeded), and shutdown is
        acknowledged with ``(name, True, None)``.
    """
    set_credentials(credentials)

    server, server_info = RAM.allocate(resource_desc)
    # Just being defensive, this should never happen.
    if server is None:  # pragma no cover
        self._logger.error('Server allocation for %r failed :-(', name)
        reply_q.put((name, False, None))
        return
    else:
        # Clear egg re-use indicator.
        server_info['egg_file'] = None
        self._logger.debug('%r using %r', name, server_info['name'])
        if self._logger.level == logging.NOTSET:
            # By default avoid lots of protocol messages.
            server.set_log_level(logging.DEBUG)
        else:
            server.set_log_level(self._logger.level)

    # Requests for this server arrive on this queue once it has been
    # stored in the server's entry below.
    request_q = Queue.Queue()

    try:
        with self._server_lock:
            sdata = self._servers[name]
            sdata.server = server
            sdata.info = server_info
            sdata.queue = request_q

        reply_q.put((name, True, None))  # ACK startup.

        while True:
            request = request_q.get()
            if request is None:
                # None is the shutdown request.
                break
            try:
                # A request is indexed as (callable, argument).
                result = request[0](request[1])
            except Exception as req_exc:
                self._logger.error('%r: %s caused %r', name,
                                   request[0], req_exc)
                result = None
            else:
                req_exc = None
            # The exception (if any) is handed back rather than raised here.
            reply_q.put((name, result, req_exc))
    except Exception as exc:  # pragma no cover
        # This can easily happen if we take a long time to allocate and
        # we get 'cleaned-up' before we get started.
        if self._server_lock is not None:
            self._logger.error('%r: %r', name, exc)
    finally:
        self._logger.debug('%r releasing server', name)
        RAM.release(server)
        reply_q.put((name, True, None))  # ACK shutdown.
def _execute_remote(self):
    """
    Allocate a server based on required resources, send inputs,
    run command, and retrieve results.

    Returns ``(return_code, error_msg)`` as reported by the server's
    ``execute_command()``.  The server is always released.
    """
    # Allocate server.
    self._server, server_info = RAM.allocate(self.resources)
    if self._server is None:
        self.raise_exception('Server allocation failed :-(', RuntimeError)

    return_code = -88888888
    error_msg = ''
    try:
        # Send inputs: every external file flagged as 'input'.
        input_paths = [meta.path for meta in self.external_files
                       if meta.get('input', False)]
        if not input_paths:
            self._logger.debug("No input metadata paths")
        else:
            self._send_inputs(input_paths)

        # Run command.
        self._logger.info("executing '%s'...", self.command)
        started = time.time()
        return_code, error_msg = self._server.execute_command(
            self.command, self.stdin, self.stdout, self.stderr,
            self.env_vars, self.poll_delay, self.timeout)
        et = time.time() - started
        if et >= 60:  #pragma no cover
            self._logger.info('elapsed time: %f sec.', et)

        # Retrieve results: every external file flagged as 'output'.
        output_paths = [meta.path for meta in self.external_files
                        if meta.get('output', False)]
        if not output_paths:
            self._logger.debug("No output metadata paths")
        else:
            self._retrieve_results(output_paths)
    finally:
        RAM.release(self._server)
        self._server = None

    return (return_code, error_msg)
def rundlcs(): """ run the whole process, including startup and shutdown to do: parse input create load cases create app assembly create dispatcher send cases and app to dispatcher run cases collect and save output """ options, arg = get_options() ctrl = parse_input(options) # ctrl will be just the input, but broken up into separate categories, e.g. # ctrl.cases, ctrl.app, ctrl.dispatch, ... # work in progress; running efficiently at NREL. if (options.cluster_allocator): from PeregrineClusterAllocator import ClusterAllocator cluster=ClusterAllocator() RAM.insert_allocator(0,cluster) ### using "factory" functions to create specific subclasses (e.g. distinguish between FAST and HAWC2) # Then we use these to create the cases... cases = create_run_cases(ctrl.cases, options) # and a turbine---never used this "stub" # turbine = create_turbine(ctrl.turbine) # and the appropriate wind code wrapper... aerocode = create_aerocode_wrapper(ctrl.aerocode, ctrl.output, options) # and the appropriate dispatcher... dispatcher = create_dlc_dispatcher(ctrl.dispatcher) ### After this point everything should be generic, all appropriate subclass object created # # # # # # # # # # # dispatcher.presetup_workflow(aerocode, cases) # just makes sure parts are there when configure() is called dispatcher.configure() # Now tell the dispatcher to (setup and ) run the cases using the aerocode on the turbine. # calling configure() is done inside run(). but now it is done already (above), too. # norun does not write directories, but it does set us up to process them if they already exist if (not options.norun): print "calling run" dispatcher.run() # TODO: more complexity will be needed for difference between "run now" and "run later" cases. dispatcher.collect_output(ctrl.output)
def rundlcs():
    """
    Run the whole process, including startup and shutdown.

    Steps:
      parse input
      create load cases
      create app assembly
      create dispatcher
      send cases and app to dispatcher
      run cases
      collect and save output
    """
    options, _args = get_options()
    # ctrl will be just the input, but broken up into separate categories,
    # e.g. ctrl.cases, ctrl.app, ctrl.dispatch, ...
    ctrl = parse_input(options.main_input, options)

    if options.cluster_allocator:
        RAM.insert_allocator(0, ClusterAllocator())

    ### using "factory" functions to create specific subclasses
    ### (e.g. distinguish between FAST and HAWC2)
    # First the cases...
    cases = create_load_cases(ctrl.cases, options)
    # ...a turbine...
    turbine = create_turbine(ctrl.turbine)
    # ...the appropriate wind code wrapper...
    aerocode = create_aerocode_wrapper(ctrl.aerocode, options)
    # ...and the appropriate dispatcher.
    dispatcher = create_dlc_dispatcher(ctrl.dispatcher)

    ### After this point everything should be generic, all appropriate
    ### subclass objects created.
    # presetup_workflow() just makes sure parts are there when configure()
    # is called.
    dispatcher.presetup_workflow(aerocode, turbine, cases)
    dispatcher.configure()

    # Now tell the dispatcher to (setup and) run the cases using the
    # aerocode on the turbine.  configure() is also called inside run().
    if not options.norun:
        dispatcher.run()

    # TODO: more complexity will be needed for difference between
    # "run now" and "run later" cases.
    dispatcher.collect_output(ctrl.output)

    # Final load calculation over the collected output.
    sampler_context = sampler.Context()
    load_field = 20  # = RootMyc1Std
    final_load_calc(sampler_context, "dlcproto.out",
                    not dispatcher.raw_cases, load_field)
def run_suite(resource_desc=None, name=None): """ Run suite of tests using `resource_desc` and resord under `name`. """ resource_desc = resource_desc or {} name = name or '' print '\n%s' % name initial = 0.01 limit = 20 results = {} max_servers = ResourceAllocationManager.max_servers(resource_desc) print 'max servers', max_servers model = CID() model.driver.reload_model = False model.driver.sequential = False # Save to an egg to avoid analysis overhead during run_test(). print '\nInitializing egg module analysis' template = Case(inputs=[('sleeper.delay', None, 0.01)]) model.driver.iterator = Iterator(template) model.driver.recorders = [Recorder(model.driver.iterator, 1000)] start = time.time() egg_filename, required_distributions, orphan_modules = \ model.save_to_egg('caseperf', '0') et = time.time() - start print ' done in %.2f' % et os.remove(egg_filename) print results = run_test(model, initial, limit, max_servers) record_results(results, name)
def setUp(self):
    """
    Record user/host details, build the machine list and relax the
    LocalHost allocator's load limit.

    Raises RuntimeError if there is no allocator named 'LocalHost'.
    """
    self.user = getpass.getuser()
    self.node = platform.node()
    self.name = self.node.replace('.', '_')
    self.python = find_python()
    self.cluster = None

    # ssh-based tests are skipped on Windows and for users not in SSH_USERS.
    self.skip_ssh = (sys.platform == 'win32' or
                     self.user not in SSH_USERS)

    if self.node.startswith('gxterm'):
        # User environment assumed OK on this GRC cluster front-end.
        self.machines = [{'hostname':'gx%02d' % num, 'python':self.python}
                         for num in range(1, 55)]
    else:
        self.machines = [{'hostname':self.node, 'python':self.python}]

    # Ensure we aren't held up by local host load problems.
    for candidate in ResourceAllocationManager.list_allocators():
        if candidate.name != 'LocalHost':
            continue
        self.local = candidate
        self.local.max_load = 10
        break
    else:
        raise RuntimeError('No LocalHost allocator!?')
def get_ram(self):
    """
    Returns the :class:`ResourceAllocationManager` instance.
    Used by :meth:`ResourceAllocationManager.add_remotes`.
    """
    # Imported at call time rather than at module level.
    from openmdao.main.resource import ResourceAllocationManager as _RAM
    instance = _RAM._get_instance()
    return instance
def setup_cluster(encrypted=True):
    """
    Use openmdao.testing.cluster.init_cluster, but fix 'max_load'.

    Every allocator named 'LocalHost' gets ``max_load`` set to 1.
    Returns whatever ``init_cluster()`` returned.
    """
    cluster_name = init_cluster(encrypted, allow_shell=True)
    local_hosts = [entry
                   for entry in ResourceAllocationManager.list_allocators()
                   if entry.name == 'LocalHost']
    for entry in local_hosts:
        entry.max_load = 1.
    return cluster_name
def get_ram(self):
    """
    Returns the :class:`ResourceAllocationManager` instance.
    Used by :meth:`ResourceAllocationManager.add_remotes`.
    """
    # Imported at call time rather than at module level.
    from openmdao.main.resource import ResourceAllocationManager as _RAM
    instance = _RAM.get_instance()
    return instance
def InitializeCluster(hostnames, pydir, identity_filename=None): print 'Connecting to cluster...' machines = [] for host in hostnames: machines.append(ClusterHost( hostname=host, python = pydir, tunnel_incoming=True, tunnel_outgoing=True, identity_filename=identity_filename)) _SSH.extend(['-o', 'StrictHostKeyChecking=no']) #somewhat dangerous, this automatically adds the host key to known_hosts cluster = ClusterAllocator('PCCCluster', machines, allow_shell=True) RAM.insert_allocator(0, cluster) print 'Servers connected on cluster:',cluster.max_servers({})[0] global UseCluster UseCluster = True
def tearDown(self):
    """
    Undo the fixture: remove the allocator from RAM, stop the server
    process (or invalidate the allocator if there is none), restore the
    'ssh'/'scp' configuration and delete the scratch directories.

    The original working directory is restored in all cases.
    """
    try:
        logging.debug("remove")
        RAM.remove_allocator(self.allocator.name)

        if self.proc is None:
            self.allocator.invalidate()
        else:
            logging.debug("shutdown")
            self.allocator.shutdown()
            self.proc.terminate()

        # Restore 'ssh' and 'scp' configuration.
        protocol.configure_ssh(self.orig_ssh)
        protocol.configure_scp(self.orig_scp)

        # Pause before deleting the scratch directories.
        time.sleep(2)
        for scratch_dir in (_RJE_ROOT, _DMZ_ROOT):
            if not os.path.exists(scratch_dir):
                continue
            shutil.rmtree(scratch_dir)
    finally:
        os.chdir(self.orig_dir)
def tearDown(self):
    """
    Undo the fixture: remove the allocator from RAM, stop the server
    process (or invalidate the allocator if there is none), restore the
    'ssh'/'scp' configuration and delete the scratch directories.

    The original working directory is restored in all cases.
    """
    try:
        logging.debug('remove')
        RAM.remove_allocator(self.allocator.name)

        if self.proc is None:
            self.allocator.invalidate()
        else:
            logging.debug('shutdown')
            self.allocator.shutdown()
            self.proc.terminate()

        # Restore 'ssh' and 'scp' configuration.
        protocol.configure_ssh(self.orig_ssh)
        protocol.configure_scp(self.orig_scp)

        # Pause before deleting the scratch directories.
        time.sleep(2)
        for scratch_dir in (_RJE_ROOT, _DMZ_ROOT):
            if not os.path.exists(scratch_dir):
                continue
            shutil.rmtree(scratch_dir)
    finally:
        os.chdir(self.orig_dir)
def test_allocator(self):
    """
    Check server counts reported through RAM and exercise a server
    deployed directly from the allocator.
    """
    logging.debug("")
    logging.debug("test_allocator")

    # Since we're faking it with a remote LocalHost, we should match.
    expected_count = RAM.max_servers({"allocator": "LocalHost"})
    actual_count = RAM.max_servers({"allocator": self.allocator.name})
    self.assertEqual(actual_count, expected_count)

    # Contradictory request, so no servers are expected.
    contradictory = {"allocator": self.allocator.name, "localhost": True}
    self.assertEqual(RAM.max_servers(contradictory), 0)

    server = self.allocator.deploy("test_server", {}, {})
    try:
        self.assertEqual(server.name, "NAS_Allocator/test_server")
        self.assertEqual(server.host, socket.gethostname())
        self.assertTrue(server.pid > 0)

        echo_args = (123, "twisty", "narrow", "passages")
        self.assertEqual(server.echo(*echo_args), echo_args)

        self.assertTrue(server.isdir("."))
        listing = sorted(server.listdir("."))
        self.assertEqual(listing, ["openmdao_log.txt", "stderr", "stdout"])
    finally:
        # Always hand the server back, even when an assertion fails.
        self.allocator.release(server)
def init_cluster(encrypted=True, clean_dir=True, allow_shell=False):
    """
    If not already done, initializes the ResourceAllocationManager and
    adds a cluster using encrypted or unencrypted communication.
    Returns the name of the configured cluster.

    encrypted: bool
        Selects the authorization key: 'PublicKey' if True, else 'AuthKey'.

    clean_dir: bool
        If True, ``Sim-*`` directories in the current directory are
        removed the first time this is called.

    allow_shell: bool
        Passed to the allocator that gets created.

    Returns the allocator name, or None when no machines are available
    and `encrypted` is True.
    """
    authkey = 'PublicKey' if encrypted else 'AuthKey'
    allocators = ResourceAllocationManager.list_allocators()

    if len(allocators) == 1:
        # Only one allocator is registered; it is treated as the local one.
        local = ResourceAllocationManager.get_allocator(0)
        if local.max_load < 10:  # First time we've been called.
            # Ensure we aren't held up by local host load problems.
            local.max_load = 10

            if clean_dir:
                # Remove any local allocator-created directories.
                for path in glob.glob('Sim-*'):
                    shutil.rmtree(path, onerror=onerror)

    # Cluster name is derived from the node name and the key type.
    node = platform.node()
    name = '%s_%s' % (node.replace('.', '_'), authkey)
    for allocator in allocators:
        if allocator.name == name:
            return name  # Don't add multiple copies.

    machines = []
    python = sys.executable

    if node.startswith('gxterm'):
        # User environment assumed OK on this GRC cluster front-end.
        # Using less than full machine (55 nodes) to allow multiple
        # cluster testing without hitting limit on open files (sockets).
        # NOTE(review): range(20) yields 'gx00'..'gx19'; confirm that
        # 'gx00' is a valid host name.
        for i in range(20):
            machines.append({'hostname': 'gx%02d' % i, 'python': python})
    elif local_ssh_available():
        machines.append({'hostname': node, 'python': python})

    if machines:
        cluster = ClusterAllocator(name, machines, authkey, allow_shell)
        ResourceAllocationManager.insert_allocator(0, cluster)
        return name
    elif not encrypted:
        # Create a LocalAllocator so we have *something*.
        name = 'LocalUnencrypted'
        for allocator in allocators:
            if allocator.name == name:
                return name  # Don't add multiple copies.
        local = LocalAllocator(name, authkey=authkey,
                               allow_shell=allow_shell)
        ResourceAllocationManager.insert_allocator(0, local)
        return name
    return None
def setup(self, replicate=True):
    """
    Setup to begin new run.

    replicate: bool
        If True, then replicate the model and save to an egg file
        first (for concurrent evaluation).
    """
    self._cleanup(remove_egg=replicate)

    if not self.sequential and (replicate or self._egg_file is None):
        # Save model to egg.
        # Must do this before creating any locks or queues.
        self._replicants += 1
        version = 'replicant.%d' % (self._replicants)

        # If only local host will be used, we can skip determining
        # distributions required by the egg.
        allocators = RAM.list_allocators()
        need_reqs = (not self.ignore_egg_requirements and
                     any(not isinstance(allocator, LocalAllocator)
                         for allocator in allocators))

        # Temporarily swap in a plain Driver which will execute the
        # workflow once; the real driver is put back afterwards.
        saved_driver = self.parent.driver
        self.parent.add('driver', Driver())
        self.parent.driver.workflow = self.workflow
        try:
            # FIXME: what name should we give to the egg?
            egg_info = self.parent.save_to_egg(self.name, version,
                                               need_requirements=need_reqs)
        finally:
            self.parent.driver = saved_driver

        self._egg_file = egg_info[0]
        self._egg_required_distributions = egg_info[1]
        self._egg_orphan_modules = [modname for modname, _path in egg_info[2]]

    self._iter = self.get_case_iterator()
    self._seqno = 0
def run_serial(self):
    """
    Run serial version of ADPAC.
    Runs on remote host if there's more than just the local allocator.
    """
    # A second allocator (index 1) existing is the signal that a
    # one-CPU resource request should be made.
    try:
        RAM.get_allocator(1)
    except IndexError:
        self.resources = {}
    else:
        self.resources = {'n_cpus': 1}

    self.command = [self.serial_adpac]
    for wanted, flag in ((not self.idissf, '-d'), (self.irevs, '-r')):
        if wanted:
            self.command.append(flag)

    casename = self.input.casename
    self.stdin = casename+'.input'
    self.stdout = casename+'.output'
    self.stderr = ExternalCode.STDOUT

    super(ADPAC, self).execute()
def setUp(self):
    """
    Record user/host details, build the machine list and relax the
    LocalHost allocator's load limit.

    Raises RuntimeError if there is no allocator named 'LocalHost'.
    """
    self.user = getpass.getuser()
    self.node = platform.node()
    self.name = self.node.replace('.', '_')
    self.python = find_python()
    self.cluster = None

    # ssh-based tests are skipped on Windows and for users not in SSH_USERS.
    self.skip_ssh = (sys.platform == 'win32' or
                     self.user not in SSH_USERS)

    self.machines = [{'hostname':self.node, 'python':self.python}]

    # Ensure we aren't held up by local host load problems.
    for candidate in ResourceAllocationManager.list_allocators():
        if candidate.name != 'LocalHost':
            continue
        self.local = candidate
        self.local.max_load = 10
        break
    else:
        raise RuntimeError('No LocalHost allocator!?')
def test_resources(self):
    """
    Each of these resource descriptions is expected to be unsatisfiable,
    so allocate() should return ``(None, None)`` for every one.
    """
    logging.debug('')
    logging.debug('test_resources')

    unsatisfiable = (
        {'localhost':False},
        {'exclude':[platform.node()]},
        {'n_cpus':1000000},
        {'orphan_modules':['xyzzy']},
        {'python_version':'xyzzy'},
        {'xyzzy':None},
    )
    for resource_desc in unsatisfiable:
        result = ResourceAllocationManager.allocate(resource_desc)
        self.assertEqual(result, (None, None))
def main():  #pragma no cover
    """
    OpenMDAO factory service process.

    Usage: python objserverfactory.py [--allow-public][--allow-shell][--hosts=filename][--types=filename][--users=filename][--address=address][--port=number][--prefix=name][--tunnel][--resources=filename][--log-host=hostname][--log-port=number][--log-prefix=string]

    --allow-public:
        Allows access by anyone from any allowed host. Use with care!

    --allow-shell:
        Allows access to :meth:`execute_command` and :meth:`load_model`.
        Use with care!

    --hosts: string
        Filename for allowed hosts specification. Default ``hosts.allow``.
        Ignored if '--users' is specified.
        The file should contain IPv4 host addresses, IPv4 domain addresses,
        or hostnames, one per line. Blank lines are ignored, and '#' marks the
        start of a comment which continues to the end of the line.
        For security reasons this file must be accessible only by the user
        running this server.

    --types: string
        Filename for allowed types specification.
        If not specified then allow types listed by
        :meth:`factorymanager.get_available_types`.
        The file should contain one type name per line.

    --users: string
        Filename for allowed users specification.
        Ignored if '--allow-public' is specified.
        Default is ``~/.ssh/authorized_keys``, other files should be of the
        same format: each line has ``key-type public-key-data user@host``,
        where `user` is the username on `host`. `host` will be translated to an
        IPv4 address and included in the allowed hosts list.
        Note that this ``user@host`` form is not necessarily enforced by
        programs which generate keys.
        For security reasons this file must be accessible only by the user
        running this server.

    --address: string
        IPv4 address, hostname, or pipe name.
        Default is the host's default IPv4 address.

    --port: int
        Server port (default of 0 implies next available port).
        Note that ports below 1024 typically require special privileges.
        If port is negative, then a local pipe is used for communication.

    --prefix: string
        Prefix for configuration and stdout/stderr files (default ``server``).

    --tunnel:
        Report host IP address but listen for connections from a local
        SSH tunnel.

    --resources: string
        Filename for resource configuration. If not specified then the
        default of ``~/.openmdao/resources.cfg`` will be used.

    --log-host: string
        Hostname to send remote log messages to.

    --log-port: int
        Port on `log-host` to send remote log messages to.

    --log-prefix: string
        Prefix to apply to remote log messages. Default is ``pid@host``.

    If ``prefix.key`` exists, it is read for an authorization key string.
    Otherwise public key authorization and encryption is used.

    Allowed hosts *must* be specified if `port` is >= 0. Only allowed hosts
    may connect to the server.

    Once initialized ``prefix.cfg`` is written with address, port, and
    public key information.
    """
    parser = optparse.OptionParser()
    parser.add_option('--address', action='store', type='str',
                      help='Network address to serve.')
    parser.add_option('--allow-public', action='store_true', default=False,
                      help='Allows access by any user, use with care!')
    parser.add_option('--allow-shell', action='store_true', default=False,
                      help='Allows potential shell access, use with care!')
    parser.add_option('--hosts', action='store', type='str',
                      default='hosts.allow',
                      help='Filename for allowed hosts')
    parser.add_option('--types', action='store', type='str',
                      help='Filename for allowed types')
    parser.add_option('--users', action='store', type='str',
                      default='~/.ssh/authorized_keys',
                      help='Filename for allowed users')
    parser.add_option('--port', action='store', type='int', default=0,
                      help='Server port (0 implies next available port)')
    parser.add_option('--prefix', action='store', default='server',
                      help='Prefix for config and stdout/stderr files')
    parser.add_option('--tunnel', action='store_true', default=False,
                      help='Report host IP address but listen for connections'
                           ' from a local SSH tunnel')
    parser.add_option('--resources', action='store', type='str',
                      default=None,
                      help='Filename for resource configuration')
    parser.add_option('--log-host', action='store', type='str',
                      default=None,
                      help='hostname for remote log messages')
    parser.add_option('--log-port', action='store', type='int',
                      default=None,
                      help='port for remote log messages')
    parser.add_option('--log-prefix', action='store', type='str',
                      default=None,
                      help='prefix for remote log messages')

    # Positional arguments are not accepted.
    options, arguments = parser.parse_args()
    if arguments:
        parser.print_help()
        sys.exit(1)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Remote logging needs both a host and a port.
    if options.log_host and options.log_port:
        install_remote_handler(options.log_host, int(options.log_port),
                               options.log_prefix)

    server_key = options.prefix+'.key'
    server_cfg = options.prefix+'.cfg'
    global _SERVER_CFG
    _SERVER_CFG = server_cfg

    # Get authkey.
    # Falls back to 'PublicKey' when the key file cannot be read; the key
    # file is removed after its first line has been read.
    authkey = 'PublicKey'
    try:
        with open(server_key, 'r') as inp:
            authkey = inp.readline().strip()
            os.remove(server_key)
    except IOError:
        pass

    if options.allow_shell:
        msg = 'Shell access is ALLOWED'
        logger.warning(msg)
        print msg

    allowed_users = None
    allowed_hosts = None

    # Get allowed_users.
    if options.allow_public:
        allowed_users = None
        msg = 'Public access is ALLOWED'
        logger.warning(msg)
        print msg

        if options.port >= 0:
            # Get allowed_hosts.
            if os.path.exists(options.hosts):
                try:
                    allowed_hosts = read_allowed_hosts(options.hosts)
                except Exception as exc:
                    msg = "Can't read allowed hosts file %r: %s" \
                          % (options.hosts, exc)
                    logger.error(msg)
                    print msg
                    sys.exit(1)
            else:
                msg = 'Allowed hosts file %r does not exist.' % options.hosts
                logger.error(msg)
                print msg
                sys.exit(1)

            if not allowed_hosts:
                msg = 'No hosts in allowed hosts file %r.' % options.hosts
                logger.error(msg)
                print msg
                sys.exit(1)
    else:
        if os.path.exists(options.users):
            try:
                allowed_users = read_authorized_keys(options.users, logger)
            except Exception as exc:
                msg = "Can't read allowed users file %r: %s" \
                      % (options.users, exc)
                logger.error(msg)
                print msg
                sys.exit(1)
        else:
            msg = 'Allowed users file %r does not exist.' % options.users
            logger.error(msg)
            print msg
            sys.exit(1)

        if not allowed_users:
            msg = 'No users in allowed users file %r.' % options.users
            logger.error(msg)
            print msg
            sys.exit(1)

    # Get allowed_types.
    # One type name per non-blank line; None means no file was given.
    allowed_types = None
    if options.types:
        if os.path.exists(options.types):
            allowed_types = []
            with open(options.types, 'r') as inp:
                line = inp.readline()
                while line:
                    line = line.strip()
                    if line:
                        allowed_types.append(line)
                    line = inp.readline()
        else:
            msg = 'Allowed types file %r does not exist.' % options.types
            logger.error(msg)
            print msg
            sys.exit(1)

    # Optionally configure resources.
    if options.resources:
        # Import here to avoid import loop.
        from openmdao.main.resource import ResourceAllocationManager as RAM
        RAM.configure(options.resources)

    # Get address and create manager.
    # Non-negative port: (host, port) tuple.  Negative port: pipe name
    # string, or None if no address was given.
    if options.port >= 0:
        if options.address:  # Specify IPv4/hostname.
            address = (options.address, options.port)
        else:
            address = (platform.node(), options.port)
    else:
        if options.address:  # Specify pipename.
            address = options.address
        else:
            address = None

    logger.info('Starting FactoryManager %s %r', address, keytype(authkey))
    current_process().authkey = authkey
    # With --tunnel the manager binds to the loopback address while
    # `address` is still what is handed to ObjServerFactory below.
    bind_address = ('127.0.0.1', options.port) if options.tunnel else address
    manager = _FactoryManager(bind_address, authkey, name='Factory',
                              allowed_hosts=allowed_hosts,
                              allowed_users=allowed_users,
                              allow_tunneling=options.tunnel)

    # Set defaults for created ObjServerFactories.
    # There isn't a good method to propagate these through the manager.
    ObjServerFactory._address = address
    ObjServerFactory._allow_shell = options.allow_shell
    ObjServerFactory._allowed_types = allowed_types
    ObjServerFactory._allow_tunneling = options.tunnel

    # Get server, retry if specified address is in use.
    # Up to 10 retries, 5 seconds apart; any other socket error is
    # re-raised.
    server = None
    retries = 0
    while server is None:
        try:
            server = manager.get_server()
        except socket.error as exc:
            if str(exc).find('Address already in use') >= 0:
                if retries < 10:
                    msg = 'Address %s in use, retrying...' % (address,)
                    logger.debug(msg)
                    print msg
                    time.sleep(5)
                    retries += 1
                else:
                    msg = 'Address %s in use, too many retries.' % (address,)
                    logger.error(msg)
                    print msg
                    sys.exit(1)
            else:
                raise

    # Record configuration.
    # NOTE(review): when `address` is a pipe name (a string), address[0]
    # is its first character -- confirm this is intended.
    real_ip = None if address is None else address[0]
    write_server_config(server, _SERVER_CFG, real_ip)
    msg = 'Serving on %s' % (server.address,)
    logger.info(msg)
    print msg
    sys.stdout.flush()

    # And away we go...
    signal.signal(signal.SIGTERM, _sigterm_handler)
    try:
        server.serve_forever()
    finally:
        _cleanup()
    sys.exit(0)
def _execute_remote(self):
    """
    Allocate a server based on required resources, send inputs,
    run command, and retrieve results.

    Returns ``(return_code, error_msg)`` as returned by the server's
    ``execute_command()``.

    Reports RuntimeError through ``raise_exception()`` if no server could
    be allocated, and ValueError if `stdin` is empty.  Once a server has
    been allocated it is always released, whatever happens afterwards.
    """
    def echo(path, stream, label):
        """ Copy retrieved file `path` to `stream`, then delete it. """
        if os.path.exists(path):
            with open(path, 'rU') as inp:
                stream.write(inp.read())
            os.remove(path)
        else:
            # The placeholder goes to the same stream the content would
            # have gone to, so stderr notices do not pollute stdout.
            stream.write('\n[No %s available]\n' % label)

    rdesc = self.resources.copy()

    # Allocate server.  The server info returned alongside it is unused.
    self._server, _ = RAM.allocate(rdesc)
    if self._server is None:
        self.raise_exception('Server allocation failed :-(', RuntimeError)

    return_code = -88888888
    error_msg = ''
    try:
        # Done inside the 'try' so that a failure while configuring the
        # server still releases it in the 'finally' clause.
        if self._logger.level == logging.NOTSET:
            # By default avoid lots of protocol messages.
            self._server.set_log_level(logging.DEBUG)
        else:
            self._server.set_log_level(self._logger.level)

        # Create resource description for command.
        rdesc['job_name'] = self.get_pathname()
        rdesc['remote_command'] = self.command[0]
        if len(self.command) > 1:
            rdesc['args'] = self.command[1:]
        if self.env_vars:
            rdesc['job_environment'] = self.env_vars
        if not self.stdin:
            self.raise_exception('Remote execution requires stdin of'
                                 ' DEV_NULL or filename, got %r'
                                 % self.stdin, ValueError)
        if self.stdin != self.DEV_NULL:
            rdesc['input_path'] = self.stdin
        if self.stdout:
            rdesc['output_path'] = self.stdout
        else:
            # Not redirected: capture to a scratch file echoed below.
            rdesc['output_path'] = '%s.stdout' % self.command[0]
        if self.stderr:
            if self.stderr == self.STDOUT:
                rdesc['join_files'] = True
            else:
                rdesc['error_path'] = self.stderr
        else:
            # Not redirected: capture to a scratch file echoed below.
            rdesc['error_path'] = '%s.stderr' % self.command[0]
        if self.timeout:
            # Copy so the limits held by self.resources are not modified.
            if 'resource_limits' in rdesc:
                limits = rdesc['resource_limits'].copy()
            else:
                limits = {}
            limits['wallclock_time'] = self.timeout
            rdesc['resource_limits'] = limits

        # Send inputs.
        patterns = []
        textfiles = []
        for metadata in self.external_files:
            if metadata.get('input', False):
                patterns.append(metadata.path)
                if not metadata.binary:
                    textfiles.append(metadata.path)
        for pathname, obj in self.items(iotype='in', recurse=True):
            if isinstance(obj, FileRef):
                local_path = self.get_metadata(pathname, 'local_path')
                if local_path:
                    patterns.append(local_path)
                    if not obj.binary:
                        textfiles.append(local_path)
        if self.stdin and self.stdin != self.DEV_NULL:
            patterns.append(self.stdin)
            textfiles.append(self.stdin)
        if patterns:
            self._send_inputs(patterns, textfiles)
        else:
            self._logger.debug('No input files')

        # Run command.
        self._logger.info('executing %s...', self.command)
        start_time = time.time()
        return_code, error_msg = self._server.execute_command(rdesc)
        et = time.time() - start_time
        if et >= 60:  #pragma no cover
            self._logger.info('elapsed time: %.1f sec.', et)

        # Retrieve results.
        patterns = []
        textfiles = []
        for metadata in self.external_files:
            if metadata.get('output', False):
                patterns.append(metadata.path)
                if not metadata.binary:
                    textfiles.append(metadata.path)
        for pathname, obj in self.items(iotype='out', recurse=True):
            if isinstance(obj, FileRef):
                patterns.append(obj.path)
                if not obj.binary:
                    textfiles.append(obj.path)
        patterns.append(rdesc['output_path'])
        textfiles.append(rdesc['output_path'])
        # No separate error file exists when stderr was joined to stdout.
        if self.stderr != self.STDOUT:
            patterns.append(rdesc['error_path'])
            textfiles.append(rdesc['error_path'])
        self._retrieve_results(patterns, textfiles)

        # Echo stdout if not redirected.
        if not self.stdout:
            echo(rdesc['output_path'], sys.stdout, 'stdout')

        # Echo stderr if not redirected.
        if not self.stderr:
            echo(rdesc['error_path'], sys.stderr, 'stderr')
    finally:
        RAM.release(self._server)
        self._server = None

    return (return_code, error_msg)
def _start(self):
    """
    Start evaluating cases concurrently.

    Asks RAM how many servers can satisfy the egg's requirements (plus
    any `extra_resources`), starts one daemon worker thread per initial
    case up to that limit, then services worker replies from the reply
    queue until no server is busy.  Finally ``None`` is put on every
    worker queue and a reply from each is awaited (60 seconds apiece).

    Reports RuntimeError through ``raise_exception()`` if RAM says no
    servers support the required resources.
    """
    # Need credentials in case we're using a PublicKey server.
    credentials = get_credentials()

    # Determine maximum number of servers available.
    # 'python_version' is the first three characters of sys.version.
    resources = {
        'required_distributions':self._egg_required_distributions,
        'orphan_modules':self._egg_orphan_modules,
        'python_version':sys.version[:3]}
    if self.extra_resources:
        resources.update(self.extra_resources)
    max_servers = RAM.max_servers(resources)
    self._logger.debug('max_servers %d', max_servers)
    if max_servers <= 0:
        msg = 'No servers supporting required resources %s' % resources
        self.raise_exception(msg, RuntimeError)

    # Kick off initial wave of cases.
    self._server_lock = threading.Lock()
    self._reply_q = Queue.Queue()
    self._generation += 1
    n_servers = 0
    while n_servers < max_servers:
        if not self._more_to_go():
            break

        # Get next case. Limits servers started if max_servers > cases.
        try:
            case = self._iter.next()
        except StopIteration:
            # NOTE(review): when reruns are pending this falls through
            # and the lines below use `case` from the previous pass
            # (unbound on the first pass) -- confirm this path is
            # unreachable or intended.
            if not self._rerun:
                self._iter = None
                self._seqno = 0
                break

        self._seqno += 1
        self._todo.append((case, self._seqno))

        # Start server worker thread.
        # Worker names combine driver name, generation and ordinal.
        n_servers += 1
        name = '%s_%d_%d' % (self.name, self._generation, n_servers)
        self._logger.debug('starting worker for %r', name)
        # Initialize per-worker bookkeeping before the thread runs.
        self._servers[name] = None
        self._in_use[name] = True
        self._server_cases[name] = None
        self._server_states[name] = _EMPTY
        self._load_failures[name] = 0
        server_thread = threading.Thread(target=self._service_loop,
                                         args=(name, resources,
                                               credentials, self._reply_q))
        server_thread.daemon = True
        try:
            server_thread.start()
        except thread.error:
            # Stop starting workers; those already started carry on.
            self._logger.warning('worker thread startup failed for %r',
                                 name)
            self._in_use[name] = False
            break

        if sys.platform != 'win32':
            # Process any pending events.
            # `name` is rebound here to whichever worker replied.
            while self._busy():
                try:
                    name, result, exc = self._reply_q.get(True, 0.01)
                except Queue.Empty:
                    break  # Timeout.
                else:
                    # Difficult to force startup failure.
                    if self._servers[name] is None:  #pragma nocover
                        self._logger.debug('server startup failed for %r',
                                           name)
                        self._in_use[name] = False
                    else:
                        self._in_use[name] = self._server_ready(name)

    if sys.platform == 'win32':  #pragma no cover
        # Don't start server processing until all servers are started,
        # otherwise we have egg removal issues.
        # The loop only counts replies: `name` is immediately rebound
        # to the worker that actually replied.
        for name in self._in_use.keys():
            name, result, exc = self._reply_q.get()
            if self._servers[name] is None:
                self._logger.debug('server startup failed for %r', name)
                self._in_use[name] = False

        # Kick-off started servers.
        for name in self._in_use.keys():
            if self._in_use[name]:
                self._in_use[name] = self._server_ready(name)

    # Continue until no servers are busy.
    while self._busy():
        if self._more_to_go():
            timeout = None
        else:
            # Don't wait indefinitely for a server we don't need.
            # This has happened with a server that got 'lost'
            # in RAM.allocate()
            timeout = 60
        try:
            name, result, exc = self._reply_q.get(timeout=timeout)
        # Hard to force worker to hang, which is handled here.
        except Queue.Empty:  #pragma no cover
            # Timed out: mark as unused every worker that is neither
            # loading nor executing, and report each one.
            msgs = []
            for name, in_use in self._in_use.items():
                if in_use:
                    try:
                        # These lookups only probe for a KeyError; the
                        # values are re-read when the message is built.
                        server = self._servers[name]
                        info = self._server_info[name]
                    except KeyError:
                        msgs.append('%r: no startup reply' % name)
                        self._in_use[name] = False
                    else:
                        state = self._server_states[name]
                        if state not in (_LOADING, _EXECUTING):
                            msgs.append('%r: %r %s %s'
                                        % (name, self._servers[name],
                                           state, self._server_info[name]))
                            self._in_use[name] = False
            if msgs:
                self._logger.error('Timeout waiting with nothing left to do:')
                for msg in msgs:
                    self._logger.error(' %s', msg)
        else:
            self._in_use[name] = self._server_ready(name)

    # Shut-down (started) servers.
    self._logger.debug('Shut-down (started) servers')
    # None on a worker's queue is the shut-down request.
    for queue in self._queues.values():
        queue.put(None)
    # Expect one reply per queue; the count is fixed before any
    # entries are deleted below.
    for i in range(len(self._queues)):
        try:
            name, status, exc = self._reply_q.get(True, 60)
        # Hard to force worker to hang, which is handled here.
        except Queue.Empty:  #pragma no cover
            pass
        else:
            if name in self._queues:  # 'Stale' worker can reply *late*.
                del self._queues[name]
    # Hard to force worker to hang, which is handled here.
    # Anything still listed never acknowledged the shut-down request.
    for name in self._queues.keys():  #pragma no cover
        self._logger.warning('Timeout waiting for %r to shut-down.', name)
def _setup(self):
    """
    Setup to begin new run.

    Unless running sequentially, a replicant of the parent model is
    saved to an egg and the egg's file name, required distributions and
    orphan module names are recorded.  In all cases the list of cases is
    then built from the ``case_inputs`` values of each parameter and the
    case iterator and abort state are reset.
    """
    if not self.sequential:
        # Save model to egg.
        # Must do this before creating any locks or queues.
        self._replicants += 1
        egg_version = 'replicant.%d' % (self._replicants)

        # If only local host will be used, we can skip determining
        # distributions required by the egg.
        allocators = RAM.list_allocators()
        need_reqs = (not self.ignore_egg_requirements and
                     any(not isinstance(allocator, LocalAllocator)
                         for allocator in allocators))

        # Replicate and mutate model to run our workflow once.
        # Originally this was done in-place, but that 'invalidated'
        # various workflow quantities.
        replicant = self.parent.copy()
        workflow = replicant.get(self.name+'.workflow')
        driver = replicant.add('driver', Driver())
        workflow.parent = driver
        workflow.scope = None
        replicant.driver.workflow = workflow
        egg_info = replicant.save_to_egg(self.name, egg_version,
                                         need_requirements=need_reqs)
        del replicant, workflow, driver  # Release objects.
        gc.collect()  # Collect/compact before possible fork.

        self._egg_file = egg_info[0]
        self._egg_required_distributions = egg_info[1]
        self._egg_orphan_modules = [modname for modname, _ in egg_info[2]]

    # One (target, values) pair per parameter target; every target of a
    # parameter shares that parameter's sequence of case values.
    targets = []
    columns = []
    for key, param in self.get_parameters().items():
        # Use first target of ParameterGroup.
        first = key[0] if isinstance(key, tuple) else key
        column = self.get('case_inputs.'+make_legal_path(first))
        for target in param.targets:
            targets.append(target)
            columns.append(column)

    outputs = self.get_responses().keys()
    extra_outputs = self.workflow._rec_outputs

    # The first column's length determines the number of cases.
    n_cases = len(columns[0]) if columns else 0
    cases = [_Case(index,
                   [(target, column[index])
                    for target, column in zip(targets, columns)],
                   outputs, extra_outputs, parent_uuid=self._case_uuid)
             for index in range(n_cases)]
    self.init_responses(n_cases)

    self._iter = iter(cases)
    self._abort_exc = None