def test_pod_resources_add(self):
    """Check that ``+`` on two ``PodResourcesConfig`` objects merges them.

    Two partially specified configs are parsed with ``from_dict`` and
    added; the per-resource ``to_dict()`` output of the result is then
    compared against the expected combined values.
    """
    first_spec = {
        'cpu': {'requests': 0.8},
        'gpu': {'requests': 2},
        'memory': {'requests': 200, 'limits': 300},
    }
    second_spec = {
        'gpu': {'limits': 4},
        'memory': {'requests': 300, 'limits': 200},
    }

    combined = (
        PodResourcesConfig.from_dict(first_spec)
        + PodResourcesConfig.from_dict(second_spec)
    )

    # cpu is only present in the first operand; its missing limit stays None.
    expected_cpu = {'requests': 0.8, 'limits': None}
    assert combined.cpu.to_dict() == expected_cpu

    # memory is set on both sides, so requests and limits are each summed.
    expected_memory = {'requests': 500, 'limits': 500}
    assert combined.memory.to_dict() == expected_memory

    # gpu requests come from the first operand, gpu limits from the second.
    expected_gpu = {'requests': 2, 'limits': 4}
    assert combined.gpu.to_dict() == expected_gpu
def get_total_resources(cls, master_resources, environment, cluster, is_distributed):
    """Accumulate master, worker and ps resources into a single dict.

    Args:
        master_resources: resources of the master; skipped when falsy.
        environment: forwarded to ``cls.get_worker_resources`` and
            ``cls.get_ps_resources``.
        cluster: forwarded to the same two lookups.
        is_distributed: forwarded to the same two lookups.

    Returns:
        ``None`` when the master, worker and ps resources are all falsy;
        otherwise the ``to_dict()`` of a ``PodResourcesConfig`` onto which
        every available resources object has been added.
    """
    lookup_kwargs = dict(
        environment=environment,
        cluster=cluster,
        is_distributed=is_distributed,
    )
    per_worker = cls.get_worker_resources(**lookup_kwargs)
    per_ps = cls.get_ps_resources(**lookup_kwargs)

    # Nothing was requested anywhere, so there is no total to report.
    if not (master_resources or per_worker or per_ps):
        return None

    total = PodResourcesConfig()
    if master_resources:
        total += master_resources

    # Worker and ps resources are iterated by value (mapping-like objects);
    # a falsy group is treated as empty. Workers are added before ps.
    for group in (per_worker, per_ps):
        for pod_resources in six.itervalues(group or {}):
            total += pod_resources

    return total.to_dict()
def test_environment_config(self):
    """Check ``EnvironmentConfig`` round-trips and rejects two frameworks.

    Starting from a resources-only dict, a framework section is added and
    round-tripped. Each further framework section added while the previous
    one is still present must raise ``ValidationError``; once the previous
    section is removed the dict must round-trip again.
    """
    def check_round_trip():
        parsed = EnvironmentConfig.from_dict(payload)
        assert_equal_dict(payload, parsed.to_dict())

    base_resources = PodResourcesConfig(cpu=K8SResourcesConfig(0.5, 1))
    payload = {'resources': base_resources.to_dict()}
    check_round_trip()

    # Add tensorflow
    payload['tensorflow'] = {
        'n_workers': 10,
        'n_ps': 5,
    }
    check_round_trip()

    # Each following framework is first added next to the current one
    # (must raise), then the current one is dropped (must pass).
    current = 'tensorflow'
    next_sections = (
        ('mxnet', {'n_workers': 10, 'n_ps': 5}),
        ('horovod', {'n_workers': 5}),
        ('pytorch', {'n_workers': 5}),
    )
    for framework, section in next_sections:
        payload[framework] = section
        with self.assertRaises(ValidationError):
            EnvironmentConfig.from_dict(payload)

        del payload[current]
        check_round_trip()
        current = framework
def test_mxnet_config(self):
    """Check that ``MXNetConfig`` round-trips through from_dict/to_dict.

    The input dict is grown one section at a time (default worker/ps
    resources, then per-index worker/ps resources) and the round-trip
    equality is re-checked after every addition.
    """
    def check_round_trip():
        parsed = MXNetConfig.from_dict(payload)
        assert_equal_dict(payload, parsed.to_dict())

    payload = {'n_workers': 10, 'n_ps': 5}
    check_round_trip()

    # Add default worker resources
    worker_defaults = PodResourcesConfig(
        cpu=K8SResourcesConfig(0.5, 1),
        gpu=K8SResourcesConfig(2, 4),
    )
    payload['default_worker_resources'] = worker_defaults.to_dict()
    check_round_trip()

    # Add default ps resources
    ps_defaults = PodResourcesConfig(
        cpu=K8SResourcesConfig(0.5, 1),
        memory=K8SResourcesConfig(256, 400),
    )
    payload['default_ps_resources'] = ps_defaults.to_dict()
    check_round_trip()

    # Adding custom resources for worker 4
    worker_four = PodResourcesConfig(
        index=4,
        cpu=K8SResourcesConfig(0.5, 1),
        memory=K8SResourcesConfig(256, 400),
    )
    payload['worker_resources'] = [worker_four.to_dict()]
    check_round_trip()

    # Adding custom resources for ps 4
    ps_four = PodResourcesConfig(
        index=4,
        cpu=K8SResourcesConfig(0.5, 1),
        memory=K8SResourcesConfig(256, 400),
    )
    payload['ps_resources'] = [ps_four.to_dict()]
    check_round_trip()
def test_pod_resources_config(self):
    """Check that a fully specified ``PodResourcesConfig`` round-trips.

    A dict with requests and limits for cpu, gpu and memory is parsed with
    ``from_dict`` and must compare equal to the ``to_dict()`` output.
    """
    spec = {
        'cpu': {'requests': 0.8, 'limits': 1},
        'gpu': {'requests': 2, 'limits': 4},
        'memory': {'requests': 265, 'limits': 512},
    }
    parsed = PodResourcesConfig.from_dict(spec)
    serialized = parsed.to_dict()
    assert_equal_dict(spec, serialized)
def test_tensorflow_config(self):
    """Check that ``TensorflowConfig`` round-trips through from_dict/to_dict.

    The input dict is extended step by step (run config, default session
    configs and resources for workers and ps, then per-index overrides)
    and the round-trip equality is re-checked after every addition.
    """
    def check_round_trip():
        parsed = TensorflowConfig.from_dict(payload)
        assert_equal_dict(payload, parsed.to_dict())

    payload = {
        'n_workers': 10,
        'n_ps': 5,
        'delay_workers_by_global_step': False,
    }
    check_round_trip()

    # Add run config
    payload['run_config'] = RunConfig().to_dict()
    check_round_trip()

    # Add default worker session config
    worker_session = SessionConfig(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=3,
    )
    payload['default_worker_config'] = worker_session.to_dict()
    check_round_trip()

    # Add default worker resources
    worker_defaults = PodResourcesConfig(
        cpu=K8SResourcesConfig(0.5, 1),
        gpu=K8SResourcesConfig(2, 4),
    )
    payload['default_worker_resources'] = worker_defaults.to_dict()
    check_round_trip()

    # Add default ps session config
    ps_session = SessionConfig(
        intra_op_parallelism_threads=0,
        inter_op_parallelism_threads=2,
    )
    payload['default_ps_config'] = ps_session.to_dict()
    check_round_trip()

    # Add default ps resources
    ps_defaults = PodResourcesConfig(
        cpu=K8SResourcesConfig(0.5, 1),
        memory=K8SResourcesConfig(256, 400),
    )
    payload['default_ps_resources'] = ps_defaults.to_dict()
    check_round_trip()

    # Adding custom config for worker 3
    worker_three_session = SessionConfig(
        index=3,
        gpu_options=GPUOptionsConfig(gpu_memory_fraction=0.4),
        intra_op_parallelism_threads=8,
        inter_op_parallelism_threads=8,
    )
    payload['worker_configs'] = [worker_three_session.to_dict()]
    check_round_trip()

    # Adding custom resources for worker 4
    worker_four_resources = PodResourcesConfig(
        index=4,
        cpu=K8SResourcesConfig(0.5, 1),
        memory=K8SResourcesConfig(256, 400),
    )
    payload['worker_resources'] = [worker_four_resources.to_dict()]
    check_round_trip()

    # Adding custom config for ps 2
    ps_two_session = SessionConfig(
        index=2,
        gpu_options=GPUOptionsConfig(allow_growth=False),
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    payload['ps_configs'] = [ps_two_session.to_dict()]
    check_round_trip()

    # Adding custom resources for ps 4
    ps_four_resources = PodResourcesConfig(
        index=4,
        cpu=K8SResourcesConfig(0.5, 1),
        memory=K8SResourcesConfig(256, 400),
    )
    payload['ps_resources'] = [ps_four_resources.to_dict()]
    check_round_trip()