Esempio n. 1
0
def handle_horovod_experiment(experiment, spawner, response):
    """Record one job for the master pod and one job per worker pod.

    Args:
        experiment: experiment forwarded to every ``create_job`` call.
        spawner: object whose ``spec`` provides ``master_resources``,
            ``environment`` and ``cluster_def``.
        response: mapping indexed by ``TaskType``; the ``MASTER`` entry is a
            single pod description and the ``WORKER`` entry is an iterable
            of pod descriptions.
    """

    def _job_uuid_of(entry):
        # The job uuid is carried as a label on the pod metadata.
        return uuid.UUID(entry['pod']['metadata']['labels']['job_uuid'])

    master_entry = response[TaskType.MASTER]
    create_job(
        job_uuid=_job_uuid_of(master_entry),
        experiment=experiment,
        definition=get_job_definition(master_entry),
        resources=spawner.spec.master_resources,
    )

    cluster_layout, distributed = spawner.spec.cluster_def
    resources_by_index = HorovodSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster_layout,
        is_distributed=distributed,
    )

    # Each worker's resources are looked up by its position in the response.
    for position, worker_entry in enumerate(response[TaskType.WORKER]):
        create_job(
            job_uuid=_job_uuid_of(worker_entry),
            experiment=experiment,
            definition=get_job_definition(worker_entry),
            role=TaskType.WORKER,
            resources=resources_by_index.get(position),
        )
def handle_horovod_experiment(experiment, spawner, response):
    """Create the job records for a Horovod experiment's master and workers.

    Args:
        experiment: experiment forwarded to every ``create_job`` call.
        spawner: object whose ``spec`` provides ``master_resources``,
            ``environment`` and ``cluster_def``.
        response: mapping indexed by ``TaskType``; the ``MASTER`` entry is a
            single pod description and the ``WORKER`` entry is an iterable
            of pod descriptions.
    """
    # Master: a single pod whose labels carry the job uuid as a string.
    master_pod = response[TaskType.MASTER]
    master_labels = master_pod['pod']['metadata']['labels']
    create_job(job_uuid=uuid.UUID(master_labels['job_uuid']),
               experiment=experiment,
               definition=get_job_definition(master_pod),
               resources=spawner.spec.master_resources)

    # Workers: resources are resolved once, then picked per worker index.
    cluster_spec, distributed_flag = spawner.spec.cluster_def
    per_worker_resources = HorovodSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster_spec,
        is_distributed=distributed_flag)

    for worker_idx, worker_pod in enumerate(response[TaskType.WORKER]):
        worker_labels = worker_pod['pod']['metadata']['labels']
        create_job(job_uuid=uuid.UUID(worker_labels['job_uuid']),
                   experiment=experiment,
                   definition=get_job_definition(worker_pod),
                   role=TaskType.WORKER,
                   resources=per_worker_resources.get(worker_idx))
Esempio n. 3
0
    def cluster_def(self):
        """Return the ``(cluster, is_distributed)`` pair for this specification.

        The starting cluster holds a single master. When the environment
        declares a framework section (checked in the order tensorflow,
        horovod, mxnet, pytorch), the matching specification class computes
        the pair from that section; otherwise the standalone default
        ``({TaskType.MASTER: 1}, False)`` is returned.
        """
        default_cluster = {TaskType.MASTER: 1}
        env = self.environment

        if env:
            # First declared framework wins; each one takes its own config.
            if env.tensorflow:
                return TensorflowSpecification.get_cluster_def(
                    cluster=default_cluster,
                    tensorflow_config=env.tensorflow)
            elif env.horovod:
                return HorovodSpecification.get_cluster_def(
                    cluster=default_cluster,
                    horovod_config=env.horovod)
            elif env.mxnet:
                return MXNetSpecification.get_cluster_def(
                    cluster=default_cluster,
                    mxnet_config=env.mxnet)
            elif env.pytorch:
                return PytorchSpecification.get_cluster_def(
                    cluster=default_cluster,
                    pytorch_config=env.pytorch)

        # No environment or no framework section: standalone, not distributed.
        return default_cluster, False
Esempio n. 4
0
 def node_selectors(self):
     """Return the node selectors of every task, grouped by task type.

     The master selectors are stored under index 0; the worker entry is the
     value returned by ``HorovodSpecification.get_worker_node_selectors``.
     """
     cluster_layout, distributed = self.spec.cluster_def
     per_worker = HorovodSpecification.get_worker_node_selectors(
         environment=self.spec.environment,
         cluster=cluster_layout,
         is_distributed=distributed)

     master_selectors = {0: self.spec.master_node_selectors}
     return {TaskType.MASTER: master_selectors, TaskType.WORKER: per_worker}
Esempio n. 5
0
 def node_selectors(self):
     """Build the task-type -> node selectors mapping for this experiment.

     Workers get whatever ``HorovodSpecification.get_worker_node_selectors``
     returns for the spec's environment and cluster definition; the single
     master is registered under index 0.
     """
     cluster_spec, distributed_flag = self.spec.cluster_def
     selectors = {
         TaskType.WORKER: HorovodSpecification.get_worker_node_selectors(
             environment=self.spec.environment,
             cluster=cluster_spec,
             is_distributed=distributed_flag,
         ),
     }
     selectors[TaskType.MASTER] = {0: self.spec.master_node_selectors}
     return selectors
Esempio n. 6
0
 def tolerations(self):
     """Return the tolerations of every task, grouped by task type.

     The master tolerations are stored under index 0; the worker entry is
     the value returned by ``HorovodSpecification.get_worker_tolerations``
     for the spec's environment and cluster definition.

     Returns:
         dict with a ``TaskType.MASTER`` and a ``TaskType.WORKER`` entry.
     """
     cluster, is_distributed = self.spec.cluster_def
     worker_tolerations = HorovodSpecification.get_worker_tolerations(
         environment=self.spec.environment,
         cluster=cluster,
         is_distributed=is_distributed)
     return {
         # The master must receive tolerations here, not the affinity
         # config: the two have different shapes and the affinity value
         # would be applied to the pod as if it were a toleration list.
         # NOTE(review): assumes the spec exposes ``master_tolerations``
         # next to ``master_node_selectors`` / ``master_affinity``.
         TaskType.MASTER: {
             0: self.spec.master_tolerations
         },
         TaskType.WORKER: worker_tolerations,
     }
Esempio n. 7
0
    def total_resources(self):
        """Return the total resources requested by this specification.

        Returns ``None`` when there is no environment. When a framework
        section is declared (checked in the order tensorflow, horovod, mxnet,
        pytorch), the matching specification class computes the total from
        the master resources, the environment and the cluster definition.
        Without a framework the master resources are the total.
        """
        env = self.environment
        if not env:
            return None

        cluster_layout, distributed = self.cluster_def

        # Pick the specification class of the first declared framework.
        if env.tensorflow:
            framework_spec = TensorflowSpecification
        elif env.horovod:
            framework_spec = HorovodSpecification
        elif env.mxnet:
            framework_spec = MXNetSpecification
        elif env.pytorch:
            framework_spec = PytorchSpecification
        else:
            # No framework declared: only the master consumes resources.
            return self.master_resources

        return framework_spec.get_total_resources(
            master_resources=self.master_resources,
            environment=env,
            cluster=cluster_layout,
            is_distributed=distributed,
        )
Esempio n. 8
0
    def test_distributed_horovod_passes(self):
        """Parse the distributed Horovod fixture and check its specification.

        Covers the top-level sections, the master resources, the default and
        per-index worker resources, the resolved per-worker resources, the
        total resources and the cluster definition.
        """
        fixture_path = os.path.abspath(
            'tests/fixtures/distributed_horovod_file.yml')
        spec = PolyaxonFile(fixture_path).specification

        # Top-level sections of the specification.
        assert spec.version == 1
        assert spec.project.name == 'project1'
        assert isinstance(spec.settings, SettingsConfig)
        assert isinstance(spec.settings.logging, LoggingConfig)
        assert spec.settings.matrix is None
        assert isinstance(spec.environment, EnvironmentConfig)
        assert spec.is_runnable
        assert spec.framework == Frameworks.HOROVOD

        horovod = spec.environment.horovod
        assert horovod.n_workers == 5

        # Resources declared directly on the environment.
        env_resources = spec.environment.resources
        assert isinstance(env_resources, PodResourcesConfig)
        assert isinstance(env_resources.cpu, K8SResourcesConfig)
        assert env_resources.cpu.requests == 1
        assert env_resources.cpu.limits == 2

        # Default resources for workers.
        default_worker = horovod.default_worker_resources
        assert isinstance(default_worker, PodResourcesConfig)
        assert isinstance(default_worker.cpu, K8SResourcesConfig)
        assert default_worker.cpu.requests == 3
        assert default_worker.cpu.limits == 3
        assert isinstance(default_worker.memory, K8SResourcesConfig)
        assert default_worker.memory.requests == 256
        assert default_worker.memory.limits == 256

        # The single explicit worker entry, declared for index 3.
        custom_worker = horovod.worker_resources[0]
        assert isinstance(custom_worker, PodResourcesConfig)
        assert isinstance(custom_worker.memory, K8SResourcesConfig)
        assert custom_worker.index == 3
        assert custom_worker.memory.requests == 300
        assert custom_worker.memory.limits == 300

        # Resolved per-worker resources: one entry per worker, drawn only
        # from the default config and the explicit one.
        cluster, is_distributed = spec.cluster_def
        worker_resources = HorovodSpecification.get_worker_resources(
            environment=spec.environment,
            cluster=cluster,
            is_distributed=is_distributed)
        assert len(worker_resources) == horovod.n_workers
        assert set(worker_resources.values()) == {default_worker, custom_worker}

        # Total resources; presumably the environment resources plus four
        # default workers plus the memory-only explicit worker.
        expected_total = {
            'cpu': {
                'requests': 1 + 3 * 4,
                'limits': 2 + 3 * 4
            },
            'memory': {
                'requests': 300 + 256 * 4,
                'limits': 300 + 256 * 4
            },
            'gpu': None
        }
        assert spec.total_resources == expected_total

        expected_cluster = {TaskType.MASTER: 1, TaskType.WORKER: 5}
        assert spec.cluster_def == (expected_cluster, True)