Example #1
    def cluster_def(self):
        cluster = {
            TaskType.MASTER: 1,
        }
        is_distributed = False
        environment = self.environment

        if not environment:
            return cluster, is_distributed

        if environment.tensorflow:
            return TensorflowSpecification.get_cluster_def(
                cluster=cluster, tensorflow_config=environment.tensorflow)
        if environment.horovod:
            return HorovodSpecification.get_cluster_def(
                cluster=cluster, horovod_config=environment.horovod)
        if environment.mxnet:
            return MXNetSpecification.get_cluster_def(
                cluster=cluster, mxnet_config=environment.mxnet)
        if environment.pytorch:
            return PytorchSpecification.get_cluster_def(
                cluster=cluster, pytorch_config=environment.pytorch)

        # No framework specified; return the default standalone-mode cluster definition.
        return cluster, is_distributed
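To make the two possible return shapes concrete, here is a minimal standalone sketch; the worker and ps counts are assumed values matching the test fixtures further below, and plain strings stand in for the TaskType constants.

# Minimal standalone sketch of the two shapes cluster_def can return:
# the standalone default and a distributed TensorFlow layout.
standalone = ({'master': 1}, False)
distributed = ({'master': 1, 'worker': 5, 'ps': 10}, True)

for cluster, is_distributed in (standalone, distributed):
    print('distributed={}, total tasks={}'.format(is_distributed, sum(cluster.values())))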
Example #2
    def node_selectors(self):
        cluster, is_distributed, = self.spec.cluster_def
        worker_node_selectors = TensorflowSpecification.get_worker_node_selectors(
            environment=self.spec.environment,
            cluster=cluster,
            is_distributed=is_distributed)
        ps_node_selectors = TensorflowSpecification.get_ps_node_selectors(
            environment=self.spec.environment,
            cluster=cluster,
            is_distributed=is_distributed)
        return {
            TaskType.MASTER: {
                0: self.spec.master_node_selectors
            },
            TaskType.WORKER: worker_node_selectors,
            TaskType.PS: ps_node_selectors,
        }
Example #3
    def affinities(self):
        cluster, is_distributed, = self.spec.cluster_def
        worker_affinities = TensorflowSpecification.get_worker_affinities(
            environment=self.spec.environment,
            cluster=cluster,
            is_distributed=is_distributed)
        ps_affinities = TensorflowSpecification.get_ps_affinities(
            environment=self.spec.environment,
            cluster=cluster,
            is_distributed=is_distributed)
        return {
            TaskType.MASTER: {
                0: self.spec.master_affinity
            },
            TaskType.WORKER: worker_affinities,
            TaskType.PS: ps_affinities,
        }
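Both node_selectors and affinities above return the same kind of mapping, one entry per task type keyed by task index. A minimal standalone sketch with assumed selector values and plain strings in place of the TaskType constants:

# Standalone sketch (assumed values) of the mapping shape returned by node_selectors.
node_selectors = {
    'master': {0: {'disktype': 'ssd'}},        # assumed master node selector
    'worker': {0: None, 1: {'gpu': 'true'}},   # assumed per-worker overrides
    'ps': {},                                  # empty when there are no ps tasks
}
for task_type, per_task in node_selectors.items():
    print(task_type, sorted(per_task))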
Example #4
def handle_tensorflow_experiment(experiment, spawner, response):
    # The response lists the jobs this experiment started; create the master job first.
    master = response[TaskType.MASTER]
    job_uuid = master['pod']['metadata']['labels']['job_uuid']
    job_uuid = uuid.UUID(job_uuid)

    create_job(job_uuid=job_uuid,
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources)

    cluster, is_distributed, = spawner.spec.cluster_def
    worker_resources = TensorflowSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed
    )

    ps_resources = TensorflowSpecification.get_ps_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed
    )

    for i, worker in enumerate(response[TaskType.WORKER]):
        job_uuid = worker['pod']['metadata']['labels']['job_uuid']
        job_uuid = uuid.UUID(job_uuid)
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_job_definition(worker),
                   role=TaskType.WORKER,
                   resources=worker_resources.get(i))

    for i, ps in enumerate(response[TaskType.PS]):
        job_uuid = ps['pod']['metadata']['labels']['job_uuid']
        job_uuid = uuid.UUID(job_uuid)
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_job_definition(ps),
                   role=TaskType.PS,
                   resources=ps_resources.get(i))
Example #5
def handle_tensorflow_experiment(experiment, spawner, response):
    # The response lists the jobs this experiment started; create the master job first.
    master = response[TaskType.MASTER]
    job_uuid = master['pod']['metadata']['labels']['job_uuid']
    job_uuid = uuid.UUID(job_uuid)

    create_job(job_uuid=job_uuid,
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources)

    cluster, is_distributed, = spawner.spec.cluster_def
    worker_resources = TensorflowSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed
    )

    ps_resources = TensorflowSpecification.get_ps_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed
    )

    for i, worker in enumerate(response[TaskType.WORKER]):
        job_uuid = worker['pod']['metadata']['labels']['job_uuid']
        job_uuid = uuid.UUID(job_uuid)
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_job_definition(worker),
                   role=TaskType.WORKER,
                   resources=worker_resources.get(i))

    for i, ps in enumerate(response[TaskType.PS]):
        job_uuid = ps['pod']['metadata']['labels']['job_uuid']
        job_uuid = uuid.UUID(job_uuid)
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_job_definition(ps),
                   role=TaskType.PS,
                   resources=ps_resources.get(i))
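A minimal standalone sketch of the response structure this handler reads: one master entry plus lists of worker and ps entries, each carrying a job_uuid label in its pod metadata. The layout is inferred from the lookups above and the values are assumptions; plain strings stand in for the TaskType keys.

import uuid

# Build one response entry with the pod metadata shape the handler expects.
def make_entry():
    return {'pod': {'metadata': {'labels': {'job_uuid': uuid.uuid4().hex}}}}

response = {
    'master': make_entry(),
    'worker': [make_entry() for _ in range(2)],
    'ps': [make_entry()],
}

# The handler parses each label back into a UUID before creating the job record.
master_uuid = uuid.UUID(response['master']['pod']['metadata']['labels']['job_uuid'])
print(master_uuid, len(response['worker']), len(response['ps']))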
Example #6
    def total_resources(self):
        environment = self.environment

        if not environment:
            return None

        cluster, is_distributed = self.cluster_def

        # Check if any framework is defined
        if environment.tensorflow:
            return TensorflowSpecification.get_total_resources(
                master_resources=self.master_resources,
                environment=environment,
                cluster=cluster,
                is_distributed=is_distributed)

        if environment.horovod:
            return HorovodSpecification.get_total_resources(
                master_resources=self.master_resources,
                environment=environment,
                cluster=cluster,
                is_distributed=is_distributed)

        if environment.mxnet:
            return MXNetSpecification.get_total_resources(
                master_resources=self.master_resources,
                environment=environment,
                cluster=cluster,
                is_distributed=is_distributed)

        if environment.pytorch:
            return PytorchSpecification.get_total_resources(
                master_resources=self.master_resources,
                environment=environment,
                cluster=cluster,
                is_distributed=is_distributed)

        # No framework specified; default to the master resources.
        return self.master_resources
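As a rough illustration of what total_resources adds up, here is a standalone sketch of the CPU arithmetic for the distributed fixture used in the test further below; all numbers come from that fixture, and the worker and ps that override only memory contribute nothing to the CPU totals.

# Standalone sketch of the CPU totals for the distributed fixture below:
# master cpu 1/2, default worker cpu 3/3 (4 of 5 workers use it),
# default ps cpu 2/4 (9 of 10 ps use it).
master = {'requests': 1, 'limits': 2}
default_worker = {'requests': 3, 'limits': 3}
default_ps = {'requests': 2, 'limits': 4}

total_cpu = {
    key: master[key] + default_worker[key] * 4 + default_ps[key] * 9
    for key in ('requests', 'limits')
}
print(total_cpu)  # {'requests': 31, 'limits': 50}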
Example #7
    def test_advanced_file_passes(self):
        plxfile = PolyaxonFile(
            os.path.abspath('tests/fixtures/advanced_file.yml'))
        spec = plxfile.specification
        assert spec.version == 1
        assert spec.project.name == 'project1'
        assert isinstance(spec.settings, SettingsConfig)
        assert isinstance(spec.settings.logging, LoggingConfig)
        assert spec.settings.matrix is None
        assert spec.is_runnable
        assert isinstance(spec.environment, EnvironmentConfig)
        assert spec.framework == Frameworks.TENSORFLOW
        assert spec.environment.tensorflow.n_workers == 5
        assert spec.environment.tensorflow.n_ps == 10
        assert spec.environment.tensorflow.delay_workers_by_global_step is True
        assert isinstance(spec.environment.tensorflow.run_config, RunConfig)
        assert spec.environment.tensorflow.run_config.tf_random_seed == 100
        assert spec.environment.tensorflow.run_config.save_summary_steps == 100
        assert spec.environment.tensorflow.run_config.save_checkpoints_secs == 60
        assert isinstance(spec.environment.tensorflow.run_config.session,
                          SessionConfig)
        assert spec.environment.tensorflow.run_config.session.allow_soft_placement is True
        assert spec.environment.tensorflow.run_config.session.intra_op_parallelism_threads == 2
        assert spec.environment.tensorflow.run_config.session.inter_op_parallelism_threads == 2

        # Check the properties returning worker configs and resources.
        assert spec.environment.tensorflow.worker_configs is None
        assert spec.environment.tensorflow.ps_configs is None
        assert spec.environment.tensorflow.worker_resources is None
        assert spec.environment.tensorflow.ps_resources is None

        cluster, is_distributed = spec.cluster_def

        assert TensorflowSpecification.get_worker_configs(
            environment=spec.environment,
            cluster=cluster,
            is_distributed=is_distributed) == {}
        assert TensorflowSpecification.get_ps_configs(
            environment=spec.environment,
            cluster=cluster,
            is_distributed=is_distributed) == {}
        assert TensorflowSpecification.get_worker_resources(
            environment=spec.environment,
            cluster=cluster,
            is_distributed=is_distributed) == {}
        assert TensorflowSpecification.get_ps_resources(
            environment=spec.environment,
            cluster=cluster,
            is_distributed=is_distributed) == {}

        assert spec.cluster_def == ({
            TaskType.MASTER: 1,
            TaskType.WORKER: 5,
            TaskType.PS: 10
        }, True)

        assert isinstance(spec.model, ClassifierConfig)
        assert isinstance(spec.model.loss, MeanSquaredErrorConfig)
        assert isinstance(spec.model.optimizer, AdamConfig)
        assert spec.model.optimizer.learning_rate == 0.21
        assert isinstance(spec.model.graph, GraphConfig)
        assert len(spec.model.graph.layers) == 7
        assert spec.model.graph.input_layers == [['images', 0, 0]]
        assert len(spec.model.graph.output_layers) == 3
        assert ['super_dense', 0, 0] in spec.model.graph.output_layers
        assert isinstance(spec.train.data_pipeline,
                          TFRecordImagePipelineConfig)
        assert len(spec.train.data_pipeline.feature_processors.
                   feature_processors) == 1
        assert isinstance(spec.eval.data_pipeline, TFRecordImagePipelineConfig)
        assert spec.eval.data_pipeline.feature_processors is None
Example #8
    def test_distributed_tensorflow_passes(self):
        plxfile = PolyaxonFile(
            os.path.abspath('tests/fixtures/distributed_tensorflow_file.yml'))
        spec = plxfile.specification
        assert spec.version == 1
        assert spec.project.name == 'project1'
        assert isinstance(spec.settings, SettingsConfig)
        assert isinstance(spec.settings.logging, LoggingConfig)
        assert spec.settings.matrix is None
        assert isinstance(spec.environment, EnvironmentConfig)
        assert spec.is_runnable
        assert spec.framework == Frameworks.TENSORFLOW
        assert spec.environment.tensorflow.n_workers == 5
        assert spec.environment.tensorflow.n_ps == 10

        assert isinstance(spec.environment.resources, PodResourcesConfig)
        assert isinstance(spec.environment.resources.cpu, K8SResourcesConfig)
        assert spec.environment.resources.cpu.requests == 1
        assert spec.environment.resources.cpu.limits == 2

        assert isinstance(spec.environment.tensorflow.default_worker_resources,
                          PodResourcesConfig)
        assert isinstance(
            spec.environment.tensorflow.default_worker_resources.cpu,
            K8SResourcesConfig)
        assert spec.environment.tensorflow.default_worker_resources.cpu.requests == 3
        assert spec.environment.tensorflow.default_worker_resources.cpu.limits == 3
        assert isinstance(
            spec.environment.tensorflow.default_worker_resources.memory,
            K8SResourcesConfig)
        assert spec.environment.tensorflow.default_worker_resources.memory.requests == 256
        assert spec.environment.tensorflow.default_worker_resources.memory.limits == 256

        assert isinstance(spec.environment.tensorflow.worker_resources[0],
                          PodResourcesConfig)
        assert isinstance(
            spec.environment.tensorflow.worker_resources[0].memory,
            K8SResourcesConfig)
        assert spec.environment.tensorflow.worker_resources[0].index == 3
        assert spec.environment.tensorflow.worker_resources[
            0].memory.requests == 300
        assert spec.environment.tensorflow.worker_resources[
            0].memory.limits == 300

        assert isinstance(spec.environment.tensorflow.default_ps_resources,
                          PodResourcesConfig)
        assert isinstance(spec.environment.tensorflow.default_ps_resources.cpu,
                          K8SResourcesConfig)
        assert spec.environment.tensorflow.default_ps_resources.cpu.requests == 2
        assert spec.environment.tensorflow.default_ps_resources.cpu.limits == 4

        assert isinstance(spec.environment.tensorflow.ps_resources[0],
                          PodResourcesConfig)
        assert isinstance(spec.environment.tensorflow.ps_resources[0].memory,
                          K8SResourcesConfig)
        assert spec.environment.tensorflow.ps_resources[0].index == 9
        assert spec.environment.tensorflow.ps_resources[
            0].memory.requests == 512
        assert spec.environment.tensorflow.ps_resources[
            0].memory.limits == 1024

        # Check that the properties returning the lists of configs and resources work.
        cluster, is_distributed = spec.cluster_def
        worker_resources = TensorflowSpecification.get_worker_resources(
            environment=spec.environment,
            cluster=cluster,
            is_distributed=is_distributed)
        assert len(worker_resources) == spec.environment.tensorflow.n_workers
        assert set(worker_resources.values()) == {
            spec.environment.tensorflow.default_worker_resources,
            spec.environment.tensorflow.worker_resources[0]
        }

        ps_resources = TensorflowSpecification.get_ps_resources(
            environment=spec.environment,
            cluster=cluster,
            is_distributed=is_distributed)
        assert len(ps_resources) == spec.environment.tensorflow.n_ps
        assert set(ps_resources.values()) == {
            spec.environment.tensorflow.default_ps_resources,
            spec.environment.tensorflow.ps_resources[0]
        }

        # Check total resources
        assert spec.total_resources == {
            'cpu': {
                'requests': 1 + 3 * 4 + 2 * 9,
                'limits': 2 + 3 * 4 + 4 * 9
            },
            'memory': {
                'requests': 300 + 256 * 4 + 512,
                'limits': 300 + 256 * 4 + 1024
            },
            'gpu': None
        }

        assert spec.cluster_def == ({
            TaskType.MASTER: 1,
            TaskType.WORKER: 5,
            TaskType.PS: 10
        }, True)
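The per-index mapping checked above can be pictured with a small standalone sketch: every worker gets the default resources except the customized one at index 3. The plain dictionaries are assumed stand-ins for the PodResourcesConfig objects.

# Standalone sketch (assumed values) of the mapping built by get_worker_resources.
default_worker = {'cpu': {'requests': 3, 'limits': 3},
                  'memory': {'requests': 256, 'limits': 256}}
custom_worker = {'memory': {'requests': 300, 'limits': 300}}

worker_resources = {
    i: custom_worker if i == 3 else default_worker
    for i in range(5)
}
assert len(worker_resources) == 5
print(worker_resources[3])  # the overridden worker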
Example #9
    def test_advanced_file_with_custom_configs_and_resources_passes(self):
        plxfile = PolyaxonFile(
            os.path.abspath(
                'tests/fixtures/advanced_file_with_custom_configs_and_resources.yml'
            ))
        spec = plxfile.specification
        assert spec.version == 1
        assert spec.project.name == 'project1'
        assert isinstance(spec.settings, SettingsConfig)
        assert isinstance(spec.settings.logging, LoggingConfig)
        assert spec.settings.matrix is None
        assert isinstance(spec.environment, EnvironmentConfig)
        assert spec.is_runnable
        assert spec.framework == Frameworks.TENSORFLOW
        assert spec.environment.tensorflow.n_workers == 5
        assert spec.environment.tensorflow.n_ps == 10
        assert spec.environment.tensorflow.delay_workers_by_global_step is True
        assert isinstance(spec.environment.tensorflow.run_config, RunConfig)
        assert spec.environment.tensorflow.run_config.tf_random_seed == 100
        assert spec.environment.tensorflow.run_config.save_summary_steps == 100
        assert spec.environment.tensorflow.run_config.save_checkpoints_secs == 60

        assert isinstance(spec.environment.resources, PodResourcesConfig)
        assert isinstance(spec.environment.resources.cpu, K8SResourcesConfig)
        assert spec.environment.resources.cpu.requests == 1
        assert spec.environment.resources.cpu.limits == 2

        assert isinstance(spec.environment.tensorflow.run_config.session,
                          SessionConfig)
        assert spec.environment.tensorflow.run_config.session.allow_soft_placement is True
        assert spec.environment.tensorflow.run_config.session.intra_op_parallelism_threads == 2
        assert spec.environment.tensorflow.run_config.session.inter_op_parallelism_threads == 2

        assert isinstance(spec.environment.tensorflow.default_worker_config,
                          SessionConfig)
        assert spec.environment.tensorflow.default_worker_config.allow_soft_placement is True
        assert spec.environment.tensorflow.default_worker_config.intra_op_parallelism_threads == 1
        assert spec.environment.tensorflow.default_worker_config.inter_op_parallelism_threads == 1

        assert isinstance(spec.environment.tensorflow.worker_configs[0],
                          SessionConfig)
        assert spec.environment.tensorflow.worker_configs[0].index == 3
        assert spec.environment.tensorflow.worker_configs[
            0].allow_soft_placement is False
        assert spec.environment.tensorflow.worker_configs[
            0].intra_op_parallelism_threads == 5
        assert spec.environment.tensorflow.worker_configs[
            0].inter_op_parallelism_threads == 5

        assert spec.environment.tensorflow.ps_configs is None

        assert spec.environment.tensorflow.worker_resources is None

        assert isinstance(spec.environment.tensorflow.default_ps_resources,
                          PodResourcesConfig)
        assert isinstance(spec.environment.tensorflow.default_ps_resources.cpu,
                          K8SResourcesConfig)
        assert spec.environment.tensorflow.default_ps_resources.cpu.requests == 2
        assert spec.environment.tensorflow.default_ps_resources.cpu.limits == 4

        assert isinstance(spec.environment.tensorflow.ps_resources[0],
                          PodResourcesConfig)
        assert isinstance(spec.environment.tensorflow.ps_resources[0].memory,
                          K8SResourcesConfig)
        assert spec.environment.tensorflow.ps_resources[0].index == 9
        assert spec.environment.tensorflow.ps_resources[
            0].memory.requests == 512
        assert spec.environment.tensorflow.ps_resources[
            0].memory.limits == 1024

        # Check that the properties returning the lists of configs and resources work.
        cluster, is_distributed = spec.cluster_def
        worker_configs = TensorflowSpecification.get_worker_configs(
            environment=spec.environment,
            cluster=cluster,
            is_distributed=is_distributed)
        assert len(worker_configs) == spec.environment.tensorflow.n_workers
        assert set(worker_configs.values()) == {
            spec.environment.tensorflow.default_worker_config,
            spec.environment.tensorflow.worker_configs[0]
        }
        assert TensorflowSpecification.get_ps_configs(
            environment=spec.environment,
            cluster=cluster,
            is_distributed=is_distributed) == {}

        assert TensorflowSpecification.get_worker_resources(
            environment=spec.environment,
            cluster=cluster,
            is_distributed=is_distributed) == {}
        ps_resources = TensorflowSpecification.get_ps_resources(
            environment=spec.environment,
            cluster=cluster,
            is_distributed=is_distributed)
        assert len(ps_resources) == spec.environment.tensorflow.n_ps
        assert set(ps_resources.values()) == {
            spec.environment.tensorflow.default_ps_resources,
            spec.environment.tensorflow.ps_resources[0]
        }

        # Check total resources
        assert spec.total_resources == {
            'cpu': {
                'requests': 1 + 2 * 9,
                'limits': 2 + 4 * 9
            },
            'memory': {
                'requests': 512,
                'limits': 1024
            },
            'gpu': None
        }

        assert spec.cluster_def == ({
            TaskType.MASTER: 1,
            TaskType.WORKER: 5,
            TaskType.PS: 10
        }, True)

        assert isinstance(spec.model, ClassifierConfig)
        assert isinstance(spec.model.loss, MeanSquaredErrorConfig)
        assert isinstance(spec.model.optimizer, AdamConfig)
        assert spec.model.optimizer.learning_rate == 0.21
        assert isinstance(spec.model.graph, GraphConfig)
        assert len(spec.model.graph.layers) == 7
        assert spec.model.graph.input_layers == [['images', 0, 0]]
        assert len(spec.model.graph.output_layers) == 3
        assert ['super_dense', 0, 0] in spec.model.graph.output_layers
        assert isinstance(spec.train.data_pipeline,
                          TFRecordImagePipelineConfig)
        assert len(spec.train.data_pipeline.feature_processors.
                   feature_processors) == 1
        assert isinstance(spec.eval.data_pipeline, TFRecordImagePipelineConfig)
        assert spec.eval.data_pipeline.feature_processors is None