def cluster_def(self):
    """Return the (cluster definition, is_distributed) pair for this spec.

    Delegates to the specification class of whichever framework is
    configured in the environment; with no environment or no framework,
    falls back to a single-master standalone definition.
    """
    base_cluster = {
        TaskType.MASTER: 1,
    }
    env = self.environment
    # No environment at all -> standalone, non-distributed.
    if not env:
        return base_cluster, False

    if env.tensorflow:
        return TensorflowSpecification.get_cluster_def(
            cluster=base_cluster,
            tensorflow_config=env.tensorflow)
    if env.horovod:
        return HorovodSpecification.get_cluster_def(
            cluster=base_cluster,
            horovod_config=env.horovod)
    if env.mxnet:
        return MXNetSpecification.get_cluster_def(
            cluster=base_cluster,
            mxnet_config=env.mxnet)
    if env.pytorch:
        return PytorchSpecification.get_cluster_def(
            cluster=base_cluster,
            pytorch_config=env.pytorch)

    # No framework configured: default standalone mode.
    return base_cluster, False
def node_selectors(self):
    """Return per-task-type node selectors keyed by task index.

    Master always has a single entry at index 0; worker/ps selectors are
    resolved by the TensorFlow specification helpers.
    """
    cluster, is_distributed = self.spec.cluster_def
    # Same keyword arguments feed both specification helpers.
    common = dict(
        environment=self.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    return {
        TaskType.MASTER: {0: self.spec.master_node_selectors},
        TaskType.WORKER: TensorflowSpecification.get_worker_node_selectors(**common),
        TaskType.PS: TensorflowSpecification.get_ps_node_selectors(**common),
    }
def affinities(self):
    """Return per-task-type pod affinities keyed by task index.

    Master always has a single entry at index 0; worker/ps affinities are
    resolved by the TensorFlow specification helpers.
    """
    cluster, is_distributed = self.spec.cluster_def
    # Same keyword arguments feed both specification helpers.
    common = dict(
        environment=self.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    return {
        TaskType.MASTER: {0: self.spec.master_affinity},
        TaskType.WORKER: TensorflowSpecification.get_worker_affinities(**common),
        TaskType.PS: TensorflowSpecification.get_ps_affinities(**common),
    }
def _task_job_uuid(task):
    """Extract and parse the job uuid from a task's pod labels."""
    return uuid.UUID(task['pod']['metadata']['labels']['job_uuid'])


def handle_tensorflow_experiment(experiment, spawner, response):
    """Persist job records for every task a TensorFlow experiment started.

    Creates one job for the master, then one per worker and parameter
    server found in ``response``, attaching the per-index resources
    resolved from the spawner's spec.

    Args:
        experiment: the experiment the jobs belong to.
        spawner: spawner whose ``spec`` provides cluster/resource info.
        response: mapping of TaskType -> task payload(s) returned by the
            scheduler (master is a single payload; worker/ps are lists).
    """
    # Master: single job, uses the master-level resources.
    master = response[TaskType.MASTER]
    create_job(job_uuid=_task_job_uuid(master),
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources)

    cluster, is_distributed = spawner.spec.cluster_def
    worker_resources = TensorflowSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    ps_resources = TensorflowSpecification.get_ps_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)

    # Workers and parameter servers follow the same creation pattern;
    # only the role and the per-index resource map differ.
    for role, resources in ((TaskType.WORKER, worker_resources),
                            (TaskType.PS, ps_resources)):
        for i, task in enumerate(response[role]):
            create_job(job_uuid=_task_job_uuid(task),
                       experiment=experiment,
                       definition=get_job_definition(task),
                       role=role,
                       resources=resources.get(i))
def total_resources(self):
    """Return the aggregated resources for the whole experiment.

    With no environment returns None; with a configured framework,
    delegates aggregation to that framework's specification class;
    otherwise only the master's resources apply.
    """
    env = self.environment
    if not env:
        return None

    cluster, is_distributed = self.cluster_def
    # First configured framework wins; order mirrors the supported list.
    dispatch = (
        (env.tensorflow, TensorflowSpecification),
        (env.horovod, HorovodSpecification),
        (env.mxnet, MXNetSpecification),
        (env.pytorch, PytorchSpecification),
    )
    for framework_config, specification in dispatch:
        if framework_config:
            return specification.get_total_resources(
                master_resources=self.master_resources,
                environment=env,
                cluster=cluster,
                is_distributed=is_distributed)

    # No framework configured: only the master consumes resources.
    return self.master_resources
def test_advanced_file_passes(self):
    """Advanced fixture parses into a distributed TF spec with empty
    per-task config/resource maps."""
    plx_file = PolyaxonFile(
        os.path.abspath('tests/fixtures/advanced_file.yml'))
    spec = plx_file.specification

    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert isinstance(spec.settings, SettingsConfig)
    assert isinstance(spec.settings.logging, LoggingConfig)
    assert spec.settings.matrix is None
    assert spec.is_runnable
    assert isinstance(spec.environment, EnvironmentConfig)
    assert spec.framework == Frameworks.TENSORFLOW

    tensorflow = spec.environment.tensorflow
    assert tensorflow.n_workers == 5
    assert tensorflow.n_ps == 10
    assert tensorflow.delay_workers_by_global_step is True

    run_config = tensorflow.run_config
    assert isinstance(run_config, RunConfig)
    assert run_config.tf_random_seed == 100
    assert run_config.save_summary_steps == 100
    assert run_config.save_checkpoints_secs == 60

    session = run_config.session
    assert isinstance(session, SessionConfig)
    assert session.allow_soft_placement is True
    assert session.intra_op_parallelism_threads == 2
    assert session.inter_op_parallelism_threads == 2

    # This fixture declares no per-task configs or resources.
    assert tensorflow.worker_configs is None
    assert tensorflow.ps_configs is None
    assert tensorflow.worker_resources is None
    assert tensorflow.ps_resources is None

    cluster, is_distributed = spec.cluster_def
    common = dict(environment=spec.environment,
                  cluster=cluster,
                  is_distributed=is_distributed)
    assert TensorflowSpecification.get_worker_configs(**common) == {}
    assert TensorflowSpecification.get_ps_configs(**common) == {}
    assert TensorflowSpecification.get_worker_resources(**common) == {}
    assert TensorflowSpecification.get_ps_resources(**common) == {}

    assert spec.cluster_def == ({TaskType.MASTER: 1,
                                 TaskType.WORKER: 5,
                                 TaskType.PS: 10}, True)

    assert isinstance(spec.model, ClassifierConfig)
    assert isinstance(spec.model.loss, MeanSquaredErrorConfig)
    assert isinstance(spec.model.optimizer, AdamConfig)
    assert spec.model.optimizer.learning_rate == 0.21

    graph = spec.model.graph
    assert isinstance(graph, GraphConfig)
    assert len(graph.layers) == 7
    assert graph.input_layers == [['images', 0, 0]]
    assert len(graph.output_layers) == 3
    assert ['super_dense', 0, 0] in graph.output_layers

    assert isinstance(spec.train.data_pipeline, TFRecordImagePipelineConfig)
    assert len(spec.train.data_pipeline.feature_processors.feature_processors) == 1
    assert isinstance(spec.eval.data_pipeline, TFRecordImagePipelineConfig)
    assert spec.eval.data_pipeline.feature_processors is None
def test_distributed_tensorflow_passes(self):
    """Distributed TF fixture resolves per-task resources and totals."""
    plx_file = PolyaxonFile(
        os.path.abspath('tests/fixtures/distributed_tensorflow_file.yml'))
    spec = plx_file.specification

    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert isinstance(spec.settings, SettingsConfig)
    assert isinstance(spec.settings.logging, LoggingConfig)
    assert spec.settings.matrix is None
    assert isinstance(spec.environment, EnvironmentConfig)
    assert spec.is_runnable
    assert spec.framework == Frameworks.TENSORFLOW

    tensorflow = spec.environment.tensorflow
    assert tensorflow.n_workers == 5
    assert tensorflow.n_ps == 10

    # Master (environment-level) resources.
    assert isinstance(spec.environment.resources, PodResourcesConfig)
    assert isinstance(spec.environment.resources.cpu, K8SResourcesConfig)
    assert spec.environment.resources.cpu.requests == 1
    assert spec.environment.resources.cpu.limits == 2

    # Default worker resources.
    default_worker = tensorflow.default_worker_resources
    assert isinstance(default_worker, PodResourcesConfig)
    assert isinstance(default_worker.cpu, K8SResourcesConfig)
    assert default_worker.cpu.requests == 3
    assert default_worker.cpu.limits == 3
    assert isinstance(default_worker.memory, K8SResourcesConfig)
    assert default_worker.memory.requests == 256
    assert default_worker.memory.limits == 256

    # Per-index worker override.
    worker_override = tensorflow.worker_resources[0]
    assert isinstance(worker_override, PodResourcesConfig)
    assert isinstance(worker_override.memory, K8SResourcesConfig)
    assert worker_override.index == 3
    assert worker_override.memory.requests == 300
    assert worker_override.memory.limits == 300

    # Default ps resources.
    default_ps = tensorflow.default_ps_resources
    assert isinstance(default_ps, PodResourcesConfig)
    assert isinstance(default_ps.cpu, K8SResourcesConfig)
    assert default_ps.cpu.requests == 2
    assert default_ps.cpu.limits == 4

    # Per-index ps override.
    ps_override = tensorflow.ps_resources[0]
    assert isinstance(ps_override, PodResourcesConfig)
    assert isinstance(ps_override.memory, K8SResourcesConfig)
    assert ps_override.index == 9
    assert ps_override.memory.requests == 512
    assert ps_override.memory.limits == 1024

    # Properties returning per-index maps of resources.
    cluster, is_distributed = spec.cluster_def
    common = dict(environment=spec.environment,
                  cluster=cluster,
                  is_distributed=is_distributed)

    worker_resources = TensorflowSpecification.get_worker_resources(**common)
    assert len(worker_resources) == tensorflow.n_workers
    assert set(worker_resources.values()) == {default_worker, worker_override}

    ps_resources = TensorflowSpecification.get_ps_resources(**common)
    assert len(ps_resources) == tensorflow.n_ps
    assert set(ps_resources.values()) == {default_ps, ps_override}

    # Aggregated totals across master + workers + ps.
    assert spec.total_resources == {
        'cpu': {
            'requests': 1 + 3 * 4 + 2 * 9,
            'limits': 2 + 3 * 4 + 4 * 9
        },
        'memory': {
            'requests': 300 + 256 * 4 + 512,
            'limits': 300 + 256 * 4 + 1024
        },
        'gpu': None
    }

    assert spec.cluster_def == ({TaskType.MASTER: 1,
                                 TaskType.WORKER: 5,
                                 TaskType.PS: 10}, True)
def test_advanced_file_with_custom_configs_and_resources_passes(self):
    """Fixture with custom session configs and ps resources resolves
    per-task config/resource maps and totals."""
    plx_file = PolyaxonFile(
        os.path.abspath(
            'tests/fixtures/advanced_file_with_custom_configs_and_resources.yml'))
    spec = plx_file.specification

    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert isinstance(spec.settings, SettingsConfig)
    assert isinstance(spec.settings.logging, LoggingConfig)
    assert spec.settings.matrix is None
    assert isinstance(spec.environment, EnvironmentConfig)
    assert spec.is_runnable
    assert spec.framework == Frameworks.TENSORFLOW

    tensorflow = spec.environment.tensorflow
    assert tensorflow.n_workers == 5
    assert tensorflow.n_ps == 10
    assert tensorflow.delay_workers_by_global_step is True

    run_config = tensorflow.run_config
    assert isinstance(run_config, RunConfig)
    assert run_config.tf_random_seed == 100
    assert run_config.save_summary_steps == 100
    assert run_config.save_checkpoints_secs == 60

    assert isinstance(spec.environment.resources, PodResourcesConfig)
    assert isinstance(spec.environment.resources.cpu, K8SResourcesConfig)
    assert spec.environment.resources.cpu.requests == 1
    assert spec.environment.resources.cpu.limits == 2

    session = run_config.session
    assert isinstance(session, SessionConfig)
    assert session.allow_soft_placement is True
    assert session.intra_op_parallelism_threads == 2
    assert session.inter_op_parallelism_threads == 2

    # Default worker session config plus a per-index override.
    default_worker_config = tensorflow.default_worker_config
    assert isinstance(default_worker_config, SessionConfig)
    assert default_worker_config.allow_soft_placement is True
    assert default_worker_config.intra_op_parallelism_threads == 1
    assert default_worker_config.inter_op_parallelism_threads == 1

    worker_config_override = tensorflow.worker_configs[0]
    assert isinstance(worker_config_override, SessionConfig)
    assert worker_config_override.index == 3
    assert worker_config_override.allow_soft_placement is False
    assert worker_config_override.intra_op_parallelism_threads == 5
    assert worker_config_override.inter_op_parallelism_threads == 5

    # No ps configs or worker resources in this fixture.
    assert tensorflow.ps_configs is None
    assert tensorflow.worker_resources is None

    default_ps = tensorflow.default_ps_resources
    assert isinstance(default_ps, PodResourcesConfig)
    assert isinstance(default_ps.cpu, K8SResourcesConfig)
    assert default_ps.cpu.requests == 2
    assert default_ps.cpu.limits == 4

    ps_override = tensorflow.ps_resources[0]
    assert isinstance(ps_override, PodResourcesConfig)
    assert isinstance(ps_override.memory, K8SResourcesConfig)
    assert ps_override.index == 9
    assert ps_override.memory.requests == 512
    assert ps_override.memory.limits == 1024

    # Properties returning per-index maps of configs and resources.
    cluster, is_distributed = spec.cluster_def
    common = dict(environment=spec.environment,
                  cluster=cluster,
                  is_distributed=is_distributed)

    worker_configs = TensorflowSpecification.get_worker_configs(**common)
    assert len(worker_configs) == tensorflow.n_workers
    assert set(worker_configs.values()) == {default_worker_config,
                                            worker_config_override}

    assert TensorflowSpecification.get_ps_configs(**common) == {}
    assert TensorflowSpecification.get_worker_resources(**common) == {}

    ps_resources = TensorflowSpecification.get_ps_resources(**common)
    assert len(ps_resources) == tensorflow.n_ps
    assert set(ps_resources.values()) == {default_ps, ps_override}

    # Aggregated totals: master + 9 default ps + 1 overridden ps.
    assert spec.total_resources == {
        'cpu': {
            'requests': 1 + 2 * 9,
            'limits': 2 + 4 * 9
        },
        'memory': {
            'requests': 512,
            'limits': 1024
        },
        'gpu': None
    }

    assert spec.cluster_def == ({TaskType.MASTER: 1,
                                 TaskType.WORKER: 5,
                                 TaskType.PS: 10}, True)

    assert isinstance(spec.model, ClassifierConfig)
    assert isinstance(spec.model.loss, MeanSquaredErrorConfig)
    assert isinstance(spec.model.optimizer, AdamConfig)
    assert spec.model.optimizer.learning_rate == 0.21

    graph = spec.model.graph
    assert isinstance(graph, GraphConfig)
    assert len(graph.layers) == 7
    assert graph.input_layers == [['images', 0, 0]]
    assert len(graph.output_layers) == 3
    assert ['super_dense', 0, 0] in graph.output_layers

    assert isinstance(spec.train.data_pipeline, TFRecordImagePipelineConfig)
    assert len(spec.train.data_pipeline.feature_processors.feature_processors) == 1
    assert isinstance(spec.eval.data_pipeline, TFRecordImagePipelineConfig)
    assert spec.eval.data_pipeline.feature_processors is None