def InitializeCluster(hostnames, pydir, identity_filename=None):
    """
    Connect to each host in `hostnames` over SSH and register the resulting
    machines as a cluster allocator named 'PCCCluster' at the front of the
    resource allocation manager's allocator list.

    hostnames: list of string
        Host names to connect to.

    pydir: string
        Path to the Python interpreter to run on each host.

    identity_filename: string or None
        Optional SSH identity (private key) file passed to each host
        connection.

    Side effects: appends to the module-level `_SSH` option list, inserts
    the new allocator into `RAM` at index 0, and sets the module-level
    `UseCluster` flag to True.
    """
    print 'Connecting to cluster...'
    machines = []
    for host in hostnames:
        machines.append(ClusterHost(
            hostname=host,
            python = pydir,
            # Tunnel both directions so hosts behind a firewall still work.
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))
    # NOTE(review): _SSH is module-level; calling this function more than
    # once will append duplicate '-o StrictHostKeyChecking=no' options —
    # confirm this is only called once per process.
    _SSH.extend(['-o', 'StrictHostKeyChecking=no']) #somewhat dangerous, this automatically adds the host key to known_hosts
    cluster = ClusterAllocator('PCCCluster', machines, allow_shell=True)
    # Insert at index 0 so the cluster is preferred over LocalHost.
    RAM.insert_allocator(0, cluster)
    print 'Servers connected on cluster:',cluster.max_servers({})[0]
    global UseCluster
    UseCluster = True
class TestCase(unittest.TestCase):
    """ Test resource allocation. """

    def setUp(self):
        """
        Build the list of machines to allocate from and locate the
        'LocalHost' allocator so its load limit can be relaxed.
        """
        self.user = getpass.getuser()
        self.node = platform.node()
        self.name = self.node.replace('.', '_')
        self.python = find_python()
        self.cluster = None
        # SSH-based tests only run for configured users on non-Windows hosts.
        if sys.platform == 'win32' or self.user not in SSH_USERS:
            self.skip_ssh = True
        else:
            self.skip_ssh = False
        self.machines = []
        if self.node.startswith('gxterm'):
            # User environment assumed OK on this GRC cluster front-end.
            for i in range(1, 55):
                self.machines.append({'hostname': 'gx%02d' % i,
                                      'python': self.python})
        else:
            self.machines.append({'hostname': self.node,
                                  'python': self.python})
        # Ensure we aren't held up by local host load problems.
        for allocator in ResourceAllocationManager.list_allocators():
            if allocator.name == 'LocalHost':
                self.local = allocator
                self.local.max_load = 10
                break
        else:
            raise RuntimeError('No LocalHost allocator!?')

    def tearDown(self):
        """ Remove leftover per-server temporary directories we own. """
        # shutdown() currently causes problems (except at exit).
        # if self.cluster is not None:
        #     self.cluster.shutdown()
        if self.skip_ssh or self.node.startswith('gxterm'):
            return
        # This cleanup *should* be OK, but it's not bulletproof.
        uid = os.getuid()
        tempdir = tempfile.gettempdir()
        for path in glob.glob(os.path.join(tempdir, 'omdao-*')):
            info = os.stat(path)
            if info.st_uid == uid:
                shutil.rmtree(path)

    def test_cluster(self):
        """ Allocate a cluster and verify host and server counts. """
        logging.debug('')
        logging.debug('test_cluster')
        if self.skip_ssh:
            logging.debug(' requires ssh, skipping')
            return

        self.cluster = ClusterAllocator(self.name, self.machines)
        if self.node.startswith('gxterm'):
            # GX isn't particularly reliable for some reason.
            self.assertTrue(len(self.cluster) >= len(self.machines)*3/4)
        else:
            self.assertEqual(len(self.cluster), len(self.machines))

        n_servers = self.cluster.max_servers({'python_version': sys.version[:3]})
        # FIX: cpu_count() raises NotImplementedError when the CPU count
        # cannot be determined; catching only AttributeError (missing
        # function on very old Pythons) did not cover that case.
        try:
            n_cpus = multiprocessing.cpu_count()
        except (AttributeError, NotImplementedError):
            n_cpus = 1
        if self.node.startswith('gxterm'):
            # GX front-end n_cpus doesn't imply node n_cpus.
            self.assertTrue(n_servers >= len(self.cluster))
        else:
            self.assertEqual(n_servers, len(self.cluster)*n_cpus)

        n_servers = self.cluster.max_servers({'python_version': 'bad-version'})
        self.assertEqual(n_servers, 0)

    def test_max_servers(self):
        """ Verify the LocalHost allocator's server-count estimate. """
        logging.debug('')
        logging.debug('test_max_servers')

        n_servers = self.local.max_servers({'python_version': sys.version[:3]})
        # FIX: also catch NotImplementedError (see test_cluster).
        try:
            n_cpus = multiprocessing.cpu_count()
        except (AttributeError, NotImplementedError):
            n_cpus = 1
        self.assertEqual(n_servers, self.local.max_load * n_cpus)

        n_servers = self.local.max_servers({'python_version': 'bad-version'})
        self.assertEqual(n_servers, 0)

    def test_hostnames(self):
        """ Hostname queries succeed for sane and fail for unknown keys. """
        logging.debug('')
        logging.debug('test_hostnames')

        hostnames = ResourceAllocationManager.get_hostnames({'n_cpus': 1})
        self.assertEqual(hostnames[0], platform.node())

        hostnames = ResourceAllocationManager.get_hostnames({'no_such_resource': 1})
        self.assertEqual(hostnames, None)

    def test_resources(self):
        """ Unsatisfiable resource requests yield (None, None). """
        logging.debug('')
        logging.debug('test_resources')

        result = ResourceAllocationManager.allocate({'localhost': False})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'exclude': [platform.node()]})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'n_cpus': 1000000})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'orphan_modules': ['xyzzy']})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'python_version': 'xyzzy'})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'xyzzy': None})
        self.assertEqual(result, (None, None))

    def test_bad_host(self):
        """ An unreachable host is dropped from the cluster. """
        logging.debug('')
        logging.debug('test_bad_host')
        if self.skip_ssh:
            logging.debug(' requires ssh, skipping')
            return

        self.machines.append({'hostname': 'xyzzy', 'python': self.python})
        self.cluster = ClusterAllocator(self.name, self.machines)
        if self.node.startswith('gxterm'):
            # GX isn't particularly reliable for some reason.
            self.assertTrue(len(self.cluster) >= len(self.machines)*3/4)
        else:
            self.assertEqual(len(self.cluster), len(self.machines)-1)

    def test_bad_python(self):
        """ A host with a bad Python path contributes no servers. """
        logging.debug('')
        logging.debug('test_bad_python')
        if self.skip_ssh:
            logging.debug(' requires ssh, skipping')
            return

        self.machines = [{'hostname': self.node, 'python': 'no-such-python'}]
        self.cluster = ClusterAllocator(self.name, self.machines)
        self.assertEqual(len(self.cluster), 0)
class TestCase(unittest.TestCase):
    """ Test resource allocation. """

    def setUp(self):
        """
        Record user/host info, decide whether SSH tests can run, and relax
        the 'LocalHost' allocator's load limit.
        """
        self.user = getpass.getuser()
        self.node = platform.node()
        self.name = self.node.replace('.', '_')
        self.python = find_python()
        self.cluster = None
        # SSH-based tests only run for configured users on non-Windows hosts.
        if sys.platform == 'win32' or self.user not in SSH_USERS:
            self.skip_ssh = True
        else:
            self.skip_ssh = False
        self.machines = []
        self.machines.append({'hostname': self.node,
                              'python': self.python})
        # Ensure we aren't held up by local host load problems.
        for allocator in ResourceAllocationManager.list_allocators():
            if allocator.name == 'LocalHost':
                self.local = allocator
                self.local.max_load = 10
                break
        else:
            raise RuntimeError('No LocalHost allocator!?')

    def tearDown(self):
        """ Remove leftover per-server temporary directories we own. """
        # shutdown() currently causes problems (except at exit).
        # if self.cluster is not None:
        #     self.cluster.shutdown()
        if self.skip_ssh:
            return
        # This cleanup *should* be OK, but it's not bulletproof.
        uid = os.getuid()
        tempdir = tempfile.gettempdir()
        for path in glob.glob(os.path.join(tempdir, 'omdao-*')):
            info = os.stat(path)
            if info.st_uid == uid:
                shutil.rmtree(path)

    def test_cluster(self):
        """ Allocate a cluster and verify host and server counts. """
        logging.debug('')
        logging.debug('test_cluster')
        if self.skip_ssh:
            logging.debug(' requires ssh, skipping')
            return

        self.cluster = ClusterAllocator(self.name, self.machines)
        self.assertEqual(len(self.cluster), len(self.machines))

        n_servers = self.cluster.max_servers(
            {'python_version': sys.version[:3]})
        try:
            n_cpus = multiprocessing.cpu_count()
        except (AttributeError, NotImplementedError):  # pragma no cover
            # Count unavailable on this platform; assume a single CPU.
            n_cpus = 1
        self.assertEqual(n_servers, len(self.cluster) * n_cpus)

        n_servers = self.cluster.max_servers({'python_version': 'bad-version'})
        self.assertEqual(n_servers, 0)

    def test_max_servers(self):
        """ Verify the LocalHost allocator's server-count estimate. """
        logging.debug('')
        logging.debug('test_max_servers')

        n_servers = self.local.max_servers({'python_version': sys.version[:3]})
        try:
            n_cpus = multiprocessing.cpu_count()
        except (AttributeError, NotImplementedError):
            # Count unavailable on this platform; assume a single CPU.
            n_cpus = 1
        self.assertEqual(n_servers, self.local.max_load * n_cpus)

        n_servers = \
            self.local.max_servers({'python_version': 'bad-version'})
        self.assertEqual(n_servers, 0)

    def test_hostnames(self):
        """ Hostname queries succeed for sane and fail for unknown keys. """
        logging.debug('')
        logging.debug('test_hostnames')

        hostnames = ResourceAllocationManager.get_hostnames({'n_cpus': 1})
        self.assertEqual(hostnames[0], platform.node())

        hostnames = ResourceAllocationManager.get_hostnames(
            {'no_such_resource': 1})
        self.assertEqual(hostnames, None)

    def test_resources(self):
        """ Unsatisfiable resource requests yield (None, None). """
        logging.debug('')
        logging.debug('test_resources')

        result = ResourceAllocationManager.allocate({'localhost': False})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate(
            {'exclude': [platform.node()]})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'n_cpus': 1000000})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate(
            {'orphan_modules': ['xyzzy']})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate(
            {'python_version': 'xyzzy'})
        self.assertEqual(result, (None, None))

        result = ResourceAllocationManager.allocate({'xyzzy': None})
        self.assertEqual(result, (None, None))

    def test_bad_host(self):
        """ An unreachable host is dropped from the cluster. """
        logging.debug('')
        logging.debug('test_bad_host')
        if self.skip_ssh:
            logging.debug(' requires ssh, skipping')
            return

        self.machines.append({'hostname': 'xyzzy', 'python': self.python})
        self.cluster = ClusterAllocator(self.name, self.machines)
        self.assertEqual(len(self.cluster), len(self.machines) - 1)

    def test_bad_python(self):
        """ A host with a bad Python path contributes no servers. """
        logging.debug('')
        logging.debug('test_bad_python')
        if self.skip_ssh:
            logging.debug(' requires ssh, skipping')
            return

        self.machines = [{'hostname': self.node, 'python': 'no-such-python'}]
        self.cluster = ClusterAllocator(self.name, self.machines)
        self.assertEqual(len(self.cluster), 0)
class TestCase(unittest.TestCase):
    """ Test resource allocation. """

    def setUp(self):
        """
        Replace the global RAM singleton with a freshly-configured one,
        record user/host info, and relax the 'LocalHost' load limit.
        """
        # Save existing RAM instance and force a rebuild.
        self.orig_ram = RAM._RAM
        RAM._RAM = None
        RAM.configure('')
        self.user = getpass.getuser()
        self.node = platform.node()
        self.name = self.node.replace('.', '_')
        self.python = find_python()
        self.cluster = None
        # SSH-based tests only run for configured users on non-Windows hosts.
        if sys.platform == 'win32' or self.user not in SSH_USERS:
            self.skip_ssh = True
        else:
            self.skip_ssh = False
        self.machines = []
        self.machines.append({'hostname': self.node,
                              'python': self.python})
        # Ensure we aren't held up by local host load problems.
        for allocator in RAM.list_allocators():
            if allocator.name == 'LocalHost':
                self.local = allocator
                self.local.max_load = 10
                break
        else:
            raise RuntimeError('No LocalHost allocator!?')

    def tearDown(self):
        """ Restore the saved RAM singleton and clean up temp directories. """
        # shutdown() currently causes problems (except at exit).
        # if self.cluster is not None:
        #     self.cluster.shutdown()
        # Restore RAM.
        RAM._RAM = self.orig_ram
        if self.skip_ssh:
            return
        # This cleanup *should* be OK, but it's not bulletproof.
        uid = os.getuid()
        tempdir = tempfile.gettempdir()
        for path in glob.glob(os.path.join(tempdir, 'omdao-*')):
            info = os.stat(path)
            if info.st_uid == uid:
                shutil.rmtree(path, onerror=onerror)

    def test_cluster(self):
        """ Allocate a cluster and verify host and server counts. """
        logging.debug('')
        logging.debug('test_cluster')
        if self.skip_ssh:
            logging.debug(' requires ssh, skipping')
            return

        self.cluster = ClusterAllocator(self.name, self.machines)
        self.assertEqual(len(self.cluster), len(self.machines))

        n_servers, criteria = \
            self.cluster.max_servers({'python_version': sys.version[:3]})
        try:
            n_cpus = multiprocessing.cpu_count()
        except (AttributeError, NotImplementedError):  # pragma no cover
            # Count unavailable on this platform; assume a single CPU.
            n_cpus = 1
        self.assertEqual(n_servers, len(self.cluster)*n_cpus)

        n_servers, criteria = \
            self.cluster.max_servers({'python_version': '2.999'})
        self.assertEqual(n_servers, 0)

    def test_max_servers(self):
        """ Verify the LocalHost allocator's server-count estimate. """
        logging.debug('')
        logging.debug('test_max_servers')

        n_servers, criteria = \
            self.local.max_servers({'python_version': sys.version[:3],
                                    'min_cpus': 1})
        try:
            n_cpus = multiprocessing.cpu_count()
        except (AttributeError, NotImplementedError):
            # Count unavailable on this platform; assume a single CPU.
            n_cpus = 1
        # Default server_limit limits to just n_cpus.
        self.assertEqual(n_servers, n_cpus)

        n_servers, criteria = \
            self.local.max_servers({'min_cpus': 1000})
        self.assertEqual(n_servers, 0)

        n_servers, criteria = \
            self.local.max_servers({'python_version': '2.999'})
        self.assertEqual(n_servers, 0)

    def test_hostnames(self):
        """ Hostname queries succeed for sane and fail for bad requests. """
        logging.debug('')
        logging.debug('test_hostnames')

        hostnames = RAM.get_hostnames({'min_cpus': 1})
        self.assertEqual(hostnames[0], platform.node())

        hostnames = RAM.get_hostnames({'allocator': 'LocalHost',
                                       'localhost': False})
        self.assertEqual(hostnames, None)

    def test_resources(self):
        """
        Unsatisfiable requests yield (None, None); validate_resources()
        accepts a full legal resource description and rejects each kind
        of malformed entry.
        """
        logging.debug('')
        logging.debug('test_resources')

        result = RAM.allocate({'localhost': False})
        self.assertEqual(result, (None, None))

        result = RAM.allocate({'exclude': [platform.node()]})
        self.assertEqual(result, (None, None))

        result = RAM.allocate({'min_cpus': 1000000})
        self.assertEqual(result, (None, None))

        result = RAM.allocate({'orphan_modules': ['xyzzy']})
        self.assertEqual(result, (None, None))

        result = RAM.allocate({'python_version': '2.999'})
        self.assertEqual(result, (None, None))

        # A fully-populated, legal resource description must validate.
        start_time = datetime.datetime(2012, 2, 8, 16, 42)
        resource_limits = {}
        for i, limit in enumerate(RESOURCE_LIMITS):
            resource_limits[limit] = i
        RAM.validate_resources(dict(remote_command='echo',
                                    args=['hello', 'world'],
                                    submit_as_hold=True,
                                    rerunnable=True,
                                    job_environment={'ENV_VAR': 'env_value'},
                                    working_directory='.',
                                    job_category='MPI',
                                    min_cpus=256,
                                    max_cpus=512,
                                    email=['user1@host1', 'user2@host2'],
                                    email_on_started=True,
                                    email_on_terminated=True,
                                    job_name='TestJob',
                                    input_path='echo.in',
                                    output_path='echo.out',
                                    error_path='echo.err',
                                    join_files=True,
                                    reservation_id='res-1234',
                                    queue_name='debug_q',
                                    priority=42,
                                    start_time=start_time,
                                    resource_limits=resource_limits,
                                    accounting_id='CFD-R-US',
                                    native_specification=('-ac', 'name=value')))

        code = "RAM.validate_resources(dict(max_cpus=2))"
        assert_raises(self, code, globals(), locals(), KeyError,
                      "'min_cpus required if max_cpus specified'")

        code = "RAM.validate_resources(dict(min_cpus=2, max_cpus=1))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "max_cpus 1 < min_cpus 2")

        # Must be positive.
        code = "RAM.validate_resources(dict(min_cpus=-2))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'min_cpus': -2")

        # Must be sequence.
        code = "RAM.validate_resources(dict(args='hello'))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'args': 'hello'")

        # Must be strings.
        code = "RAM.validate_resources(dict(args=['hello', 42]))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'args': ['hello', 42]")

        # Must be registered allocator.
        code = "RAM.validate_resources(dict(allocator='NoSuchAllocator'))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'allocator': 'NoSuchAllocator'")

        # Must be dict.
        code = "RAM.validate_resources(dict(job_environment='hello'))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'job_environment': 'hello'")

        # Key must be string.
        env = {}
        env[3] = 'hello'
        code = "RAM.validate_resources(dict(job_environment=env))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'job_environment':"
                      " {3: 'hello'}")

        # Key must not have whitespace.
        env = {}
        env['hello there'] = 'world'
        code = "RAM.validate_resources(dict(job_environment=env))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'job_environment':"
                      " {'hello there': 'world'}")

        # Value must be string.
        env = {}
        env['hello'] = 3
        code = "RAM.validate_resources(dict(job_environment=env))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'job_environment':"
                      " {'hello': 3}")

        # Must be dict.
        code = "RAM.validate_resources(dict(resource_limits='hello'))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'resource_limits': 'hello'")

        # Must be known key.
        limits = {}
        limits['no-such-resource'] = 1
        code = "RAM.validate_resources(dict(resource_limits=limits))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'resource_limits':"
                      " {'no-such-resource': 1}")

        # Value must be int.
        limits = {}
        limits['wallclock_time'] = 'infinite'
        code = "RAM.validate_resources(dict(resource_limits=limits))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'resource_limits':"
                      " {'wallclock_time': 'infinite'}")

        # Value must be >= 0.
        limits = {}
        limits['wallclock_time'] = -1
        code = "RAM.validate_resources(dict(resource_limits=limits))"
        assert_raises(self, code, globals(), locals(), ValueError,
                      "Invalid resource value for 'resource_limits':"
                      " {'wallclock_time': -1}")

        # Must be known resource.
        code = "RAM.validate_resources(dict(no_such_resource=2))"
        assert_raises(self, code, globals(), locals(), KeyError,
                      '"Invalid resource key \'no_such_resource\'"')

    def test_bad_host(self):
        """ An unreachable host is dropped from the cluster. """
        logging.debug('')
        logging.debug('test_bad_host')
        if self.skip_ssh:
            logging.debug(' requires ssh, skipping')
            return

        self.machines.append({'hostname': 'xyzzy', 'python': self.python})
        self.cluster = ClusterAllocator(self.name, self.machines)
        self.assertEqual(len(self.cluster), len(self.machines)-1)

    def test_bad_python(self):
        """ A host with a bad Python path contributes no servers. """
        logging.debug('')
        logging.debug('test_bad_python')
        if self.skip_ssh:
            logging.debug(' requires ssh, skipping')
            return

        self.machines = [{'hostname': self.node, 'python': 'no-such-python'}]
        self.cluster = ClusterAllocator(self.name, self.machines)
        self.assertEqual(len(self.cluster), 0)

    def test_remote(self):
        """
        Start an object-server factory in a scratch directory, add and then
        remove its allocator via RAM, exercising remote max_servers /
        time_estimate / allocate / release along the way.
        """
        logging.debug('')
        logging.debug('test_remote')

        # Start remote server.
        server_dir = 'Factory'
        if os.path.exists(server_dir):
            shutil.rmtree(server_dir, onerror=onerror)
        os.mkdir(server_dir)
        os.chdir(server_dir)
        try:
            server, server_cfg = start_server()
            cfg = read_server_config(server_cfg)
            factory = None
            try:
                factory = connect(cfg['address'], cfg['port'],
                                  pubkey=cfg['key'])
                prefix = RAM._make_prefix(factory.host)
                remote = '%s_LocalHost' % prefix

                # Show no remotes currently in RAM.
                allocator_names = \
                    [allocator.name for allocator in RAM.list_allocators()]
                logging.debug('%s', allocator_names)
                self.assertFalse(remote in allocator_names)

                # Add remote server's allocator.
                RAM.add_remotes(factory)
                allocator_names = \
                    [allocator.name for allocator in RAM.list_allocators()]
                logging.debug('%s', allocator_names)
                self.assertTrue(remote in allocator_names)
                # Remote allocator appends after the existing LocalHost.
                self.assertFalse(RAM.get_allocator(remote) is
                                 RAM.list_allocators()[0])
                self.assertTrue(RAM.get_allocator(remote) is
                                RAM.list_allocators()[1])

                # Max servers.
                max_servers = RAM.max_servers(dict(allocator=remote))
                self.assertTrue(max_servers >= 0)

                # Avoid host load issues.
                remote_alloc = RAM.get_allocator(remote)

                max_servers, info = \
                    remote_alloc.max_servers(dict(localhost=True))
                self.assertEqual(max_servers, 0)
                self.assertEqual(info, dict(localhost='requested local host'))

                max_servers, info = \
                    remote_alloc.max_servers(dict(allocator='LocalHost'))
                self.assertEqual(max_servers, 0)
                self.assertEqual(info, dict(allocator='wrong allocator'))

                estimate, info = \
                    remote_alloc.time_estimate(dict(allocator='LocalHost'))
                self.assertEqual(estimate, -2)
                self.assertEqual(info, dict(allocator='wrong allocator'))

                # Allocate, release.
                remote_server, info = RAM.allocate(dict(allocator=remote))
                RAM.release(remote_server)

                # Remove remote allocators.
                allocator_names = \
                    [allocator.name for allocator in RAM.list_allocators()]
                for name in allocator_names:
                    if name.startswith(prefix):
                        RAM.remove_allocator(name)
                allocator_names = \
                    [allocator.name for allocator in RAM.list_allocators()]
                logging.debug('%s', allocator_names)
                self.assertFalse(remote in allocator_names)
            finally:
                if factory is not None:
                    factory.cleanup()
                server.terminate(timeout=10)
        finally:
            os.chdir('..')
            shutil.rmtree(server_dir, onerror=onerror)

        # Access local RAM in manner it would be accessed in the server.
        self.assertEqual(RAM._get_instance().get_total_allocators(), 1)
        self.assertTrue(RAM._get_instance().get_allocator_proxy(0) is
                        RAM.list_allocators()[0])

    def test_configure(self):
        """
        Exercise RAM.configure(): reconfiguring an existing allocator,
        adding a new one from a config file, rejecting bad settings,
        and the add/insert/get/remove allocator API.
        """
        logging.debug('')
        logging.debug('test_configure')

        # Reconfigure.
        with open('resources.cfg', 'w') as out:
            out.write("""
[LocalHost]
max_load: 100
""")
        local = RAM.get_allocator('LocalHost')
        max_load = local.max_load
        try:
            self.assertTrue(max_load < 100)
            RAM.configure('resources.cfg')
            self.assertEqual(local.max_load, 100)
            local.max_load = max_load
        finally:
            os.remove('resources.cfg')

        # Add another local.
        with open('resources.cfg', 'w') as out:
            out.write("""
[Local2]
classname: openmdao.main.resource.LocalAllocator
authkey: PublicKey
allow_shell: False
total_cpus: 42
max_load: 200
""")
        try:
            RAM.configure('resources.cfg')
            local2 = RAM.get_allocator('Local2')
            self.assertEqual(local2.factory._authkey, 'PublicKey')
            self.assertEqual(local2.factory._allow_shell, False)
            self.assertEqual(local2.total_cpus, 42)
            self.assertEqual(local2.max_load, 200)
            self.assertEqual(local2.host, socket.gethostname())
            self.assertTrue(local2.pid > 0)
            RAM.remove_allocator('Local2')
        finally:
            os.remove('resources.cfg')

        # Bad local total_cpus.
        with open('resources.cfg', 'w') as out:
            out.write("""
[Local2]
classname: openmdao.main.resource.LocalAllocator
total_cpus: 0
""")
        try:
            assert_raises(self, "RAM.configure('resources.cfg')",
                          globals(), locals(), ValueError,
                          'Local2: total_cpus must be > 0, got 0')
        finally:
            os.remove('resources.cfg')

        # Bad local max_load.
        with open('resources.cfg', 'w') as out:
            out.write("""
[Local2]
classname: openmdao.main.resource.LocalAllocator
max_load: 0
""")
        try:
            assert_raises(self, "RAM.configure('resources.cfg')",
                          globals(), locals(), ValueError,
                          'Local2: max_load must be > 0, got 0')
        finally:
            os.remove('resources.cfg')

        # Bad module.
        with open('resources.cfg', 'w') as out:
            out.write("""
[BadModule]
classname: no-such-module.Allocator
max_load: 100
""")
        try:
            assert_raises(self, "RAM.configure('resources.cfg')",
                          globals(), locals(), RuntimeError,
                          "RAM configure BadModule: can't import"
                          " 'no-such-module'")
        finally:
            os.remove('resources.cfg')

        # Bad class.
        with open('resources.cfg', 'w') as out:
            out.write("""
[BadClass]
classname: openmdao.main.resource.NoSuchAllocator
max_load: 100
""")
        try:
            assert_raises(self, "RAM.configure('resources.cfg')",
                          globals(), locals(), RuntimeError,
                          "RAM configure BadClass: no class"
                          " 'NoSuchAllocator' in openmdao.main.resource")
        finally:
            os.remove('resources.cfg')

        # Add, insert, get, remove.
        local3 = LocalAllocator('Local3')
        local4 = LocalAllocator('Local4', total_cpus=4)
        RAM.add_allocator(local3)
        try:
            allocator_names = \
                [allocator.name for allocator in RAM.list_allocators()]
            self.assertEqual(allocator_names, ['LocalHost', 'Local3'])

            self.assertTrue(RAM.get_allocator('Local3') is local3)
            self.assertTrue(RAM.get_allocator(1) is local3)

            RAM.insert_allocator(0, local4)
            try:
                allocator_names = \
                    [allocator.name for allocator in RAM.list_allocators()]
                self.assertEqual(allocator_names,
                                 ['Local4', 'LocalHost', 'Local3'])
            finally:
                RAM.remove_allocator('Local4')
        finally:
            RAM.remove_allocator(1)

        assert_raises(self, "RAM.get_allocator('Local3')",
                      globals(), locals(), ValueError,
                      "allocator 'Local3' not found")

        assert_raises(self, "RAM.remove_allocator('Local3')",
                      globals(), locals(), ValueError,
                      "allocator 'Local3' not found")

        assert_raises(self, "LocalAllocator('BadLoad', max_load=-2)",
                      globals(), locals(), ValueError,
                      "BadLoad: max_load must be > 0, got -2")

    def test_base(self):
        """ Exercise the ResourceAllocator base class's default behavior. """
        logging.debug('')
        logging.debug('test_base')

        assert_raises(self, "ResourceAllocator('invalid-name')",
                      globals(), locals(), NameError,
                      "name 'invalid-name' is not alphanumeric")

        allocator = ResourceAllocator('dummy')
        self.assertEqual(allocator.name, 'dummy')

        # Just show they can be called.
        allocator.invalidate()
        allocator.configure('')

        # Unrecognized keys are reported, not fatal.
        retcode, info = \
            allocator.check_compatibility({'remote_command': 'echo',
                                           'no-such-key': True})
        self.assertEqual(retcode, 0)
        self.assertEqual(info, ['no-such-key'])

        # The abstract operations must raise NotImplementedError.
        assert_raises(self, "allocator.max_servers(dict())",
                      globals(), locals(), NotImplementedError, 'max_servers')

        assert_raises(self, "allocator.time_estimate(dict())",
                      globals(), locals(), NotImplementedError,
                      'time_estimate')

        assert_raises(self, "allocator.deploy('xyzzy', dict(), dict())",
                      globals(), locals(), NotImplementedError, 'deploy')

        assert_raises(self, "allocator.release(None)",
                      globals(), locals(), NotImplementedError, 'release')

    def test_request(self):
        """
        Verify max_request() (element-wise maxima) and total_request()
        (summed limits) over a nested assembly, and that conflicting
        accounting_id settings are rejected.
        """
        logging.debug('')
        logging.debug('test_request')

        assembly = Assembly()
        comp1 = assembly.add('comp1', ExtCode())
        comp2 = assembly.add('comp2', ExtCode())
        sub = assembly.add('sub', Assembly())
        comp3 = sub.add('comp3', ExtCode())

        comp1.resources = dict(min_cpus=10,
                               max_cpus=10,
                               resource_limits=dict(virtual_memory=100,
                                                    cpu_time=120),
                               rerunnable=True,
                               accounting_id='frobozz',
                               queue_name='debug',
                               job_category='MPI')

        comp2.resources = dict(max_cpus=2,
                               resource_limits=dict(wallclock_time=1000000))

        comp3.resources = dict(min_cpus=200,
                               resource_limits=dict(virtual_memory=20,
                                                    cpu_time=1000,
                                                    wallclock_time=500),
                               rerunnable=True,
                               accounting_id='frobozz',
                               queue_name='debug',
                               job_category='MPI')

        req = RAM.max_request(assembly)
        expected = dict(min_cpus=200,
                        max_cpus=200,
                        resource_limits=dict(virtual_memory=100,
                                             cpu_time=1000,
                                             wallclock_time=1000000))
        logging.debug('req: %r', req)
        logging.debug('exp: %r', expected)
        self.assertEqual(req, expected)

        req = RAM.total_request(assembly)
        expected = dict(min_cpus=200,
                        max_cpus=200,
                        resource_limits=dict(virtual_memory=100,
                                             cpu_time=1120,
                                             wallclock_time=1000500),
                        rerunnable=True,
                        accounting_id='frobozz',
                        queue_name='debug',
                        job_category='MPI')
        logging.debug('req: %r', req)
        logging.debug('exp: %r', expected)
        self.assertEqual(req, expected)

        comp3.resources['accounting_id'] = 'xyzzy'
        assert_raises(self, 'RAM.total_request(assembly)',
                      globals(), locals(), ValueError,
                      "Incompatible settings for 'accounting_id':"
                      " 'xyzzy' vs. 'frobozz'")
class TestCase(unittest.TestCase): """ Test resource allocation. """ def setUp(self): # Save existing RAM instance and force a rebuild. self.orig_ram = RAM._RAM RAM._RAM = None RAM.configure('') self.user = getpass.getuser() self.node = platform.node() self.name = self.node.replace('.', '_') self.python = find_python() self.cluster = None if sys.platform == 'win32' or self.user not in SSH_USERS: self.skip_ssh = True else: self.skip_ssh = False self.machines = [] self.machines.append({'hostname': self.node, 'python': self.python}) # Ensure we aren't held up by local host load problems. for allocator in RAM.list_allocators(): if allocator.name == 'LocalHost': self.local = allocator self.local.max_load = 10 break else: raise RuntimeError('No LocalHost allocator!?') def tearDown(self): # shutdown() currently causes problems (except at exit). # if self.cluster is not None: # self.cluster.shutdown() # Restore RAM. RAM._RAM = self.orig_ram if self.skip_ssh: return # This cleanup *should* be OK, but it's not bulletproof. 
uid = os.getuid() tempdir = tempfile.gettempdir() for path in glob.glob(os.path.join(tempdir, 'omdao-*')): info = os.stat(path) if info.st_uid == uid: shutil.rmtree(path, onerror=onerror) def test_cluster(self): logging.debug('') logging.debug('test_cluster') if self.skip_ssh: logging.debug(' requires ssh, skipping') return self.cluster = ClusterAllocator(self.name, self.machines) self.assertEqual(len(self.cluster), len(self.machines)) n_servers, criteria = \ self.cluster.max_servers({'python_version': sys.version[:3]}) try: n_cpus = multiprocessing.cpu_count() except (AttributeError, NotImplementedError): # pragma no cover n_cpus = 1 self.assertEqual(n_servers, len(self.cluster) * n_cpus) n_servers, criteria = \ self.cluster.max_servers({'python_version': '2.999'}) self.assertEqual(n_servers, 0) def test_max_servers(self): logging.debug('') logging.debug('test_max_servers') n_servers, criteria = \ self.local.max_servers({'python_version': sys.version[:3], 'min_cpus': 1}) try: n_cpus = multiprocessing.cpu_count() except (AttributeError, NotImplementedError): n_cpus = 1 # Default server_limit limits to just n_cpus. 
self.assertEqual(n_servers, n_cpus) n_servers, criteria = \ self.local.max_servers({'min_cpus': 1000}) self.assertEqual(n_servers, 0) n_servers, criteria = \ self.local.max_servers({'python_version': '2.999'}) self.assertEqual(n_servers, 0) def test_hostnames(self): logging.debug('') logging.debug('test_hostnames') hostnames = RAM.get_hostnames({'min_cpus': 1}) self.assertEqual(hostnames[0], platform.node()) hostnames = RAM.get_hostnames({ 'allocator': 'LocalHost', 'localhost': False }) self.assertEqual(hostnames, None) def test_resources(self): logging.debug('') logging.debug('test_resources') result = RAM.allocate({'localhost': False}) self.assertEqual(result, (None, None)) result = RAM.allocate({'exclude': [platform.node()]}) self.assertEqual(result, (None, None)) result = RAM.allocate({'min_cpus': 1000000}) self.assertEqual(result, (None, None)) result = RAM.allocate({'orphan_modules': ['xyzzy']}) self.assertEqual(result, (None, None)) result = RAM.allocate({'python_version': '2.999'}) self.assertEqual(result, (None, None)) start_time = datetime.datetime(2012, 2, 8, 16, 42) resource_limits = {} for i, limit in enumerate(RESOURCE_LIMITS): resource_limits[limit] = i RAM.validate_resources( dict(remote_command='echo', args=['hello', 'world'], submit_as_hold=True, rerunnable=True, job_environment={'ENV_VAR': 'env_value'}, working_directory='.', job_category='MPI', min_cpus=256, max_cpus=512, email=['user1@host1', 'user2@host2'], email_on_started=True, email_on_terminated=True, job_name='TestJob', input_path='echo.in', output_path='echo.out', error_path='echo.err', join_files=True, reservation_id='res-1234', queue_name='debug_q', priority=42, start_time=start_time, resource_limits=resource_limits, accounting_id='CFD-R-US', native_specification=('-ac', 'name=value'))) code = "RAM.validate_resources(dict(max_cpus=2))" assert_raises(self, code, globals(), locals(), KeyError, "'min_cpus required if max_cpus specified'") code = "RAM.validate_resources(dict(min_cpus=2, 
max_cpus=1))" assert_raises(self, code, globals(), locals(), ValueError, "max_cpus 1 < min_cpus 2") # Must be positive. code = "RAM.validate_resources(dict(min_cpus=-2))" assert_raises(self, code, globals(), locals(), ValueError, "Invalid resource value for 'min_cpus': -2") # Must be sequence. code = "RAM.validate_resources(dict(args='hello'))" assert_raises(self, code, globals(), locals(), ValueError, "Invalid resource value for 'args': 'hello'") # Must be strings. code = "RAM.validate_resources(dict(args=['hello', 42]))" assert_raises(self, code, globals(), locals(), ValueError, "Invalid resource value for 'args': ['hello', 42]") # Must be registered allocator. code = "RAM.validate_resources(dict(allocator='NoSuchAllocator'))" assert_raises( self, code, globals(), locals(), ValueError, "Invalid resource value for 'allocator': 'NoSuchAllocator'") # Must be dict. code = "RAM.validate_resources(dict(job_environment='hello'))" assert_raises(self, code, globals(), locals(), ValueError, "Invalid resource value for 'job_environment': 'hello'") # Key must be string. env = {} env[3] = 'hello' code = "RAM.validate_resources(dict(job_environment=env))" assert_raises( self, code, globals(), locals(), ValueError, "Invalid resource value for 'job_environment':" " {3: 'hello'}") # Key must not have whitespace. env = {} env['hello there'] = 'world' code = "RAM.validate_resources(dict(job_environment=env))" assert_raises( self, code, globals(), locals(), ValueError, "Invalid resource value for 'job_environment':" " {'hello there': 'world'}") # Value must be string. env = {} env['hello'] = 3 code = "RAM.validate_resources(dict(job_environment=env))" assert_raises( self, code, globals(), locals(), ValueError, "Invalid resource value for 'job_environment':" " {'hello': 3}") # Must be dict. 
code = "RAM.validate_resources(dict(resource_limits='hello'))" assert_raises(self, code, globals(), locals(), ValueError, "Invalid resource value for 'resource_limits': 'hello'") # Must be known key. limits = {} limits['no-such-resource'] = 1 code = "RAM.validate_resources(dict(resource_limits=limits))" assert_raises( self, code, globals(), locals(), ValueError, "Invalid resource value for 'resource_limits':" " {'no-such-resource': 1}") # Value must be int. limits = {} limits['wallclock_time'] = 'infinite' code = "RAM.validate_resources(dict(resource_limits=limits))" assert_raises( self, code, globals(), locals(), ValueError, "Invalid resource value for 'resource_limits':" " {'wallclock_time': 'infinite'}") # Value must be >= 0. limits = {} limits['wallclock_time'] = -1 code = "RAM.validate_resources(dict(resource_limits=limits))" assert_raises( self, code, globals(), locals(), ValueError, "Invalid resource value for 'resource_limits':" " {'wallclock_time': -1}") # Must be known resource. code = "RAM.validate_resources(dict(no_such_resource=2))" assert_raises(self, code, globals(), locals(), KeyError, '"Invalid resource key \'no_such_resource\'"') def test_bad_host(self): logging.debug('') logging.debug('test_bad_host') if self.skip_ssh: logging.debug(' requires ssh, skipping') return self.machines.append({'hostname': 'xyzzy', 'python': self.python}) self.cluster = ClusterAllocator(self.name, self.machines) self.assertEqual(len(self.cluster), len(self.machines) - 1) def test_bad_python(self): logging.debug('') logging.debug('test_bad_python') if self.skip_ssh: logging.debug(' requires ssh, skipping') return self.machines = [{'hostname': self.node, 'python': 'no-such-python'}] self.cluster = ClusterAllocator(self.name, self.machines) self.assertEqual(len(self.cluster), 0) def test_remote(self): logging.debug('') logging.debug('test_remote') # Start remote server. 
server_dir = 'Factory' if os.path.exists(server_dir): shutil.rmtree(server_dir, onerror=onerror) os.mkdir(server_dir) os.chdir(server_dir) try: server, server_cfg = start_server() cfg = read_server_config(server_cfg) factory = None try: factory = connect(cfg['address'], cfg['port'], pubkey=cfg['key']) prefix = RAM._make_prefix(factory.host) remote = '%s_LocalHost' % prefix # Show no remotes currently in RAM. allocator_names = \ [allocator.name for allocator in RAM.list_allocators()] logging.debug('%s', allocator_names) self.assertFalse(remote in allocator_names) # Add remote server's allocator. RAM.add_remotes(factory) allocator_names = \ [allocator.name for allocator in RAM.list_allocators()] logging.debug('%s', allocator_names) self.assertTrue(remote in allocator_names) self.assertFalse( RAM.get_allocator(remote) is RAM.list_allocators()[0]) self.assertTrue( RAM.get_allocator(remote) is RAM.list_allocators()[1]) # Max servers. max_servers = RAM.max_servers(dict(allocator=remote)) self.assertTrue(max_servers >= 0) # Avoid host load issues. remote_alloc = RAM.get_allocator(remote) max_servers, info = \ remote_alloc.max_servers(dict(localhost=True)) self.assertEqual(max_servers, 0) self.assertEqual(info, dict(localhost='requested local host')) max_servers, info = \ remote_alloc.max_servers(dict(allocator='LocalHost')) self.assertEqual(max_servers, 0) self.assertEqual(info, dict(allocator='wrong allocator')) estimate, info = \ remote_alloc.time_estimate(dict(allocator='LocalHost')) self.assertEqual(estimate, -2) self.assertEqual(info, dict(allocator='wrong allocator')) # Allocate, release. remote_server, info = RAM.allocate(dict(allocator=remote)) RAM.release(remote_server) # Remove remote allocators. 
allocator_names = \ [allocator.name for allocator in RAM.list_allocators()] for name in allocator_names: if name.startswith(prefix): RAM.remove_allocator(name) allocator_names = \ [allocator.name for allocator in RAM.list_allocators()] logging.debug('%s', allocator_names) self.assertFalse(remote in allocator_names) finally: if factory is not None: factory.cleanup() server.terminate(timeout=10) finally: os.chdir('..') shutil.rmtree(server_dir, onerror=onerror) # Access local RAM in manner it would be accessed in the server. self.assertEqual(RAM._get_instance().get_total_allocators(), 1) self.assertTrue(RAM._get_instance().get_allocator_proxy(0) is RAM.list_allocators()[0]) def test_configure(self): logging.debug('') logging.debug('test_configure') # Reconfigure. with open('resources.cfg', 'w') as out: out.write(""" [LocalHost] max_load: 100 """) local = RAM.get_allocator('LocalHost') max_load = local.max_load try: self.assertTrue(max_load < 100) RAM.configure('resources.cfg') self.assertEqual(local.max_load, 100) local.max_load = max_load finally: os.remove('resources.cfg') # Add another local. with open('resources.cfg', 'w') as out: out.write(""" [Local2] classname: openmdao.main.resource.LocalAllocator authkey: PublicKey allow_shell: False total_cpus: 42 max_load: 200 """) try: RAM.configure('resources.cfg') local2 = RAM.get_allocator('Local2') self.assertEqual(local2.factory._authkey, 'PublicKey') self.assertEqual(local2.factory._allow_shell, False) self.assertEqual(local2.total_cpus, 42) self.assertEqual(local2.max_load, 200) self.assertEqual(local2.host, socket.gethostname()) self.assertTrue(local2.pid > 0) RAM.remove_allocator('Local2') finally: os.remove('resources.cfg') # Bad local total_cpus. 
with open('resources.cfg', 'w') as out: out.write(""" [Local2] classname: openmdao.main.resource.LocalAllocator total_cpus: 0 """) try: assert_raises(self, "RAM.configure('resources.cfg')", globals(), locals(), ValueError, 'Local2: total_cpus must be > 0, got 0') finally: os.remove('resources.cfg') # Bad local max_load. with open('resources.cfg', 'w') as out: out.write(""" [Local2] classname: openmdao.main.resource.LocalAllocator max_load: 0 """) try: assert_raises(self, "RAM.configure('resources.cfg')", globals(), locals(), ValueError, 'Local2: max_load must be > 0, got 0') finally: os.remove('resources.cfg') # Bad module. with open('resources.cfg', 'w') as out: out.write(""" [BadModule] classname: no-such-module.Allocator max_load: 100 """) try: assert_raises( self, "RAM.configure('resources.cfg')", globals(), locals(), RuntimeError, "RAM configure BadModule: can't import" " 'no-such-module'") finally: os.remove('resources.cfg') # Bad class. with open('resources.cfg', 'w') as out: out.write(""" [BadClass] classname: openmdao.main.resource.NoSuchAllocator max_load: 100 """) try: assert_raises( self, "RAM.configure('resources.cfg')", globals(), locals(), RuntimeError, "RAM configure BadClass: no class" " 'NoSuchAllocator' in openmdao.main.resource") finally: os.remove('resources.cfg') # Add, insert, get, remove. 
local3 = LocalAllocator('Local3') local4 = LocalAllocator('Local4', total_cpus=4) RAM.add_allocator(local3) try: allocator_names = \ [allocator.name for allocator in RAM.list_allocators()] self.assertEqual(allocator_names, ['LocalHost', 'Local3']) self.assertTrue(RAM.get_allocator('Local3') is local3) self.assertTrue(RAM.get_allocator(1) is local3) RAM.insert_allocator(0, local4) try: allocator_names = \ [allocator.name for allocator in RAM.list_allocators()] self.assertEqual(allocator_names, ['Local4', 'LocalHost', 'Local3']) finally: RAM.remove_allocator('Local4') finally: RAM.remove_allocator(1) assert_raises(self, "RAM.get_allocator('Local3')", globals(), locals(), ValueError, "allocator 'Local3' not found") assert_raises(self, "RAM.remove_allocator('Local3')", globals(), locals(), ValueError, "allocator 'Local3' not found") assert_raises(self, "LocalAllocator('BadLoad', max_load=-2)", globals(), locals(), ValueError, "BadLoad: max_load must be > 0, got -2") def test_base(self): logging.debug('') logging.debug('test_base') assert_raises(self, "ResourceAllocator('invalid-name')", globals(), locals(), NameError, "name 'invalid-name' is not alphanumeric") allocator = ResourceAllocator('dummy') self.assertEqual(allocator.name, 'dummy') # Just show they can be called. 
allocator.invalidate() allocator.configure('') retcode, info = \ allocator.check_compatibility({'remote_command': 'echo', 'no-such-key': True}) self.assertEqual(retcode, 0) self.assertEqual(info, ['no-such-key']) assert_raises(self, "allocator.max_servers(dict())", globals(), locals(), NotImplementedError, 'max_servers') assert_raises(self, "allocator.time_estimate(dict())", globals(), locals(), NotImplementedError, 'time_estimate') assert_raises(self, "allocator.deploy('xyzzy', dict(), dict())", globals(), locals(), NotImplementedError, 'deploy') assert_raises(self, "allocator.release(None)", globals(), locals(), NotImplementedError, 'release') def test_request(self): logging.debug('') logging.debug('test_request') assembly = Assembly() comp1 = assembly.add('comp1', ExtCode()) comp2 = assembly.add('comp2', ExtCode()) sub = assembly.add('sub', Assembly()) comp3 = sub.add('comp3', ExtCode()) comp1.resources = dict(min_cpus=10, max_cpus=10, resource_limits=dict(virtual_memory=100, cpu_time=120), rerunnable=True, accounting_id='frobozz', queue_name='debug', job_category='MPI') comp2.resources = dict(max_cpus=2, resource_limits=dict(wallclock_time=1000000)) comp3.resources = dict(min_cpus=200, resource_limits=dict(virtual_memory=20, cpu_time=1000, wallclock_time=500), rerunnable=True, accounting_id='frobozz', queue_name='debug', job_category='MPI') req = RAM.max_request(assembly) expected = dict(min_cpus=200, max_cpus=200, resource_limits=dict(virtual_memory=100, cpu_time=1000, wallclock_time=1000000)) logging.debug('req: %r', req) logging.debug('exp: %r', expected) self.assertEqual(req, expected) req = RAM.total_request(assembly) expected = dict(min_cpus=200, max_cpus=200, resource_limits=dict(virtual_memory=100, cpu_time=1120, wallclock_time=1000500), rerunnable=True, accounting_id='frobozz', queue_name='debug', job_category='MPI') logging.debug('req: %r', req) logging.debug('exp: %r', expected) self.assertEqual(req, expected) comp3.resources['accounting_id'] = 
'xyzzy' assert_raises( self, 'RAM.total_request(assembly)', globals(), locals(), ValueError, "Incompatible settings for 'accounting_id':" " 'xyzzy' vs. 'frobozz'")