Beispiel #1
0
 def resources(self):
     """Map every task type of this experiment to its resources.

     The master entry is a one-item dict holding ``self.spec.master_resources``
     under index ``0``. The worker and server entries are whatever
     ``MXNetSpecification.get_worker_resources`` and
     ``MXNetSpecification.get_ps_resources`` return for the spec's
     environment and cluster definition.
     """
     spec = self.spec
     cluster, is_distributed = spec.cluster_def

     # Both lookups take exactly the same inputs, so build them once.
     lookup_kwargs = {
         'environment': spec.environment,
         'cluster': cluster,
         'is_distributed': is_distributed,
     }
     per_worker = MXNetSpecification.get_worker_resources(**lookup_kwargs)
     per_server = MXNetSpecification.get_ps_resources(**lookup_kwargs)

     return {
         TaskType.MASTER: {0: spec.master_resources},
         TaskType.WORKER: per_worker,
         TaskType.SERVER: per_server,
     }
Beispiel #2
0
 def resources(self):
     """Return the resources of this experiment grouped by task type.

     Returns:
         A dict with three entries: ``TaskType.MASTER`` maps index ``0`` to
         ``self.spec.master_resources``; ``TaskType.WORKER`` and
         ``TaskType.SERVER`` hold the results of the corresponding
         ``MXNetSpecification`` resource lookups.
     """
     cluster_def = self.spec.cluster_def
     cluster, is_distributed = cluster_def

     by_task_type = {}
     workers = MXNetSpecification.get_worker_resources(
         environment=self.spec.environment,
         cluster=cluster,
         is_distributed=is_distributed)
     servers = MXNetSpecification.get_ps_resources(
         environment=self.spec.environment,
         cluster=cluster,
         is_distributed=is_distributed)

     # The master is a single task, so it only ever occupies index 0.
     by_task_type[TaskType.MASTER] = {0: self.spec.master_resources}
     by_task_type[TaskType.WORKER] = workers
     by_task_type[TaskType.SERVER] = servers
     return by_task_type
def handle_mxnet_experiment(experiment, spawner, response):
    """Create a job record for each pod listed in an MXNet spawner response.

    One job is created for the master entry first, then one per worker entry
    and finally one per server entry.

    Args:
        experiment: forwarded unchanged to ``create_job`` for every job.
        spawner: object whose ``spec`` supplies ``master_resources``,
            ``cluster_def`` and ``environment``.
        response: mapping indexed by ``TaskType``. The master value is a
            single pod description; the worker and server values are
            iterables of pod descriptions. Every description carries its job
            uuid under ``['pod']['metadata']['labels']['job_uuid']``.
    """
    def read_job_uuid(task_response):
        # The uuid is stored as a string label on the pod metadata.
        labels = task_response['pod']['metadata']['labels']
        return uuid.UUID(labels['job_uuid'])

    master_response = response[TaskType.MASTER]
    create_job(job_uuid=read_job_uuid(master_response),
               experiment=experiment,
               definition=get_job_definition(master_response),
               resources=spawner.spec.master_resources)

    cluster, is_distributed = spawner.spec.cluster_def
    worker_resources = MXNetSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    server_resources = MXNetSpecification.get_ps_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)

    # Workers are registered before servers; each role has its own
    # index -> resources lookup, and a missing index yields None.
    resources_by_role = (
        (TaskType.WORKER, worker_resources),
        (TaskType.SERVER, server_resources),
    )
    for role, indexed_resources in resources_by_role:
        for position, task_response in enumerate(response[role]):
            create_job(job_uuid=read_job_uuid(task_response),
                       experiment=experiment,
                       definition=get_job_definition(task_response),
                       role=role,
                       resources=indexed_resources.get(position))
Beispiel #4
0
def handle_mxnet_experiment(experiment, spawner, response):
    """Register the master, worker and server jobs of an MXNet experiment.

    For each pod description found in ``response`` the job uuid is read from
    the pod labels and a job is created through ``create_job``. The master
    job receives ``spawner.spec.master_resources``; worker and server jobs
    receive the resources stored for their position in the respective
    ``MXNetSpecification`` lookup (``None`` when no entry exists).

    Args:
        experiment: the experiment every created job is attached to.
        spawner: provides ``spec`` (master resources, cluster definition and
            environment).
        response: ``TaskType``-indexed mapping of pod descriptions, a single
            one for the master and an iterable each for workers and servers.
    """
    # --- master -----------------------------------------------------------
    master_pod = response[TaskType.MASTER]
    master_label = master_pod['pod']['metadata']['labels']['job_uuid']
    create_job(
        job_uuid=uuid.UUID(master_label),
        experiment=experiment,
        definition=get_job_definition(master_pod),
        resources=spawner.spec.master_resources,
    )

    # --- per-index resources for the replicated roles ---------------------
    cluster, is_distributed = spawner.spec.cluster_def
    resources_for_workers = MXNetSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed,
    )
    resources_for_servers = MXNetSpecification.get_ps_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed,
    )

    # --- workers ----------------------------------------------------------
    for index, worker_pod in enumerate(response[TaskType.WORKER]):
        worker_label = worker_pod['pod']['metadata']['labels']['job_uuid']
        create_job(
            job_uuid=uuid.UUID(worker_label),
            experiment=experiment,
            definition=get_job_definition(worker_pod),
            role=TaskType.WORKER,
            resources=resources_for_workers.get(index),
        )

    # --- servers ----------------------------------------------------------
    for index, server_pod in enumerate(response[TaskType.SERVER]):
        server_label = server_pod['pod']['metadata']['labels']['job_uuid']
        create_job(
            job_uuid=uuid.UUID(server_label),
            experiment=experiment,
            definition=get_job_definition(server_pod),
            role=TaskType.SERVER,
            resources=resources_for_servers.get(index),
        )
Beispiel #5
0
    def test_distributed_mxnet_passes(self):
        """Check the specification parsed from the distributed MXNet fixture.

        Covers the top-level sections, the master/worker/server resource
        configs, the per-task resource lookups, the aggregated totals and the
        cluster definition.
        """
        fixture_path = os.path.abspath(
            'tests/fixtures/distributed_mxnet_file.yml')
        spec = PolyaxonFile(fixture_path).specification

        # Top-level sections of the specification.
        assert spec.version == 1
        assert spec.project.name == 'project1'
        assert isinstance(spec.settings, SettingsConfig)
        assert isinstance(spec.settings.logging, LoggingConfig)
        assert spec.settings.matrix is None
        assert isinstance(spec.environment, EnvironmentConfig)
        assert spec.is_runnable
        assert spec.framework == Frameworks.MXNET

        # Only alias the nested configs once the environment type is checked.
        environment = spec.environment
        mxnet = environment.mxnet
        assert mxnet.n_workers == 5
        assert mxnet.n_ps == 10

        # Resources declared directly on the environment.
        env_resources = environment.resources
        assert isinstance(env_resources, PodResourcesConfig)
        assert isinstance(env_resources.cpu, K8SResourcesConfig)
        assert env_resources.cpu.requests == 1
        assert env_resources.cpu.limits == 2

        # Default worker resources.
        default_worker = mxnet.default_worker_resources
        assert isinstance(default_worker, PodResourcesConfig)
        assert isinstance(default_worker.cpu, K8SResourcesConfig)
        assert default_worker.cpu.requests == 3
        assert default_worker.cpu.limits == 3
        assert isinstance(default_worker.memory, K8SResourcesConfig)
        assert default_worker.memory.requests == 256
        assert default_worker.memory.limits == 256

        # Worker-specific override.
        worker_override = mxnet.worker_resources[0]
        assert isinstance(worker_override, PodResourcesConfig)
        assert isinstance(worker_override.memory, K8SResourcesConfig)
        assert worker_override.index == 3
        assert worker_override.memory.requests == 300
        assert worker_override.memory.limits == 300

        # Default parameter-server resources.
        default_ps = mxnet.default_ps_resources
        assert isinstance(default_ps, PodResourcesConfig)
        assert isinstance(default_ps.cpu, K8SResourcesConfig)
        assert default_ps.cpu.requests == 2
        assert default_ps.cpu.limits == 4

        # Parameter-server-specific override.
        ps_override = mxnet.ps_resources[0]
        assert isinstance(ps_override, PodResourcesConfig)
        assert isinstance(ps_override.memory, K8SResourcesConfig)
        assert ps_override.index == 9
        assert ps_override.memory.requests == 512
        assert ps_override.memory.limits == 1024

        # The per-task lookups must yield one entry per task, drawn from the
        # default config and the single override.
        cluster, is_distributed = spec.cluster_def
        lookup_kwargs = {
            'environment': spec.environment,
            'cluster': cluster,
            'is_distributed': is_distributed,
        }
        worker_resources = MXNetSpecification.get_worker_resources(
            **lookup_kwargs)
        assert len(worker_resources) == mxnet.n_workers
        assert set(worker_resources.values()) == {
            default_worker,
            worker_override,
        }

        ps_resources = MXNetSpecification.get_ps_resources(**lookup_kwargs)
        assert len(ps_resources) == mxnet.n_ps
        assert set(ps_resources.values()) == {default_ps, ps_override}

        # Aggregated totals. NOTE(review): the terms presumably stand for the
        # master, the 4 default workers and the 9 default servers, with the
        # two overrides contributing memory only -- confirm against the
        # fixture file.
        expected_cpu = {
            'requests': 1 + 3 * 4 + 2 * 9,
            'limits': 2 + 3 * 4 + 4 * 9,
        }
        expected_memory = {
            'requests': 300 + 256 * 4 + 512,
            'limits': 300 + 256 * 4 + 1024,
        }
        assert spec.total_resources == {
            'cpu': expected_cpu,
            'memory': expected_memory,
            'gpu': None,
        }

        expected_cluster = {
            TaskType.MASTER: 1,
            TaskType.WORKER: 5,
            TaskType.SERVER: 10,
        }
        assert spec.cluster_def == (expected_cluster, True)