Esempio n. 1
0
 def resources(self):
     """Return the resources of every task, grouped by task type.

     The result is a dict with three keys:

     * ``TaskType.MASTER``: a dict holding ``self.spec.master_resources``
       under index ``0``.
     * ``TaskType.WORKER``: whatever
       ``TensorflowSpecification.get_worker_resources`` returns.
     * ``TaskType.PS``: whatever
       ``TensorflowSpecification.get_ps_resources`` returns.
     """
     cluster, is_distributed = self.spec.cluster_def

     def lookup(getter):
         # Both specification getters take the same three keyword arguments.
         return getter(environment=self.spec.config.tensorflow,
                       cluster=cluster,
                       is_distributed=is_distributed)

     workers = lookup(TensorflowSpecification.get_worker_resources)
     parameter_servers = lookup(TensorflowSpecification.get_ps_resources)

     return {
         TaskType.MASTER: {0: self.spec.master_resources},
         TaskType.WORKER: workers,
         TaskType.PS: parameter_servers,
     }
def handle_tensorflow_experiment(experiment, spawner, response):
    """Create one job per task found in a spawner response.

    ``create_job`` is called first for the master entry, then once for each
    entry of ``response[TaskType.WORKER]`` and finally once for each entry of
    ``response[TaskType.PS]``. Worker and ps jobs receive their position in
    the response as ``sequence``; that same position is used to pick their
    resources, node selector, affinity and tolerations from the values
    returned by the matching ``TensorflowSpecification`` getters.

    Args:
        experiment: Passed through unchanged to every ``create_job`` call.
        spawner: Object whose ``spec`` supplies the master settings, the
            cluster definition and the environment.
        response: Mapping indexed by ``TaskType``. The master value and every
            worker/ps item must expose the job uuid string at
            ``['pod']['metadata']['labels']['job_uuid']``.
    """
    def uuid_of(task):
        # The job uuid is stored as a string label on the task's pod.
        return uuid.UUID(task['pod']['metadata']['labels']['job_uuid'])

    master_task = response[TaskType.MASTER]
    create_job(
        job_uuid=uuid_of(master_task),
        experiment=experiment,
        definition=get_job_definition(master_task),
        resources=spawner.spec.master_resources,
        node_selector=spawner.spec.master_node_selector,
        affinity=spawner.spec.master_affinity,
        tolerations=spawner.spec.master_tolerations,
    )

    cluster, is_distributed = spawner.spec.cluster_def

    def lookup(getter):
        # Every specification getter takes the same three keyword arguments.
        return getter(environment=spawner.spec.environment,
                      cluster=cluster,
                      is_distributed=is_distributed)

    # Workers: resolve the per-index settings, then register each one.
    w_resources = lookup(TensorflowSpecification.get_worker_resources)
    w_selectors = lookup(TensorflowSpecification.get_worker_node_selectors)
    w_affinities = lookup(TensorflowSpecification.get_worker_affinities)
    w_tolerations = lookup(TensorflowSpecification.get_worker_tolerations)

    for index, worker_task in enumerate(response[TaskType.WORKER]):
        create_job(
            job_uuid=uuid_of(worker_task),
            experiment=experiment,
            definition=get_job_definition(worker_task),
            role=TaskType.WORKER,
            sequence=index,
            resources=w_resources.get(index),
            node_selector=w_selectors.get(index),
            affinity=w_affinities.get(index),
            tolerations=w_tolerations.get(index),
        )

    # Parameter servers: same procedure with the ps getters.
    p_resources = lookup(TensorflowSpecification.get_ps_resources)
    p_selectors = lookup(TensorflowSpecification.get_ps_node_selectors)
    p_affinities = lookup(TensorflowSpecification.get_ps_affinities)
    p_tolerations = lookup(TensorflowSpecification.get_ps_tolerations)

    for index, ps_task in enumerate(response[TaskType.PS]):
        create_job(
            job_uuid=uuid_of(ps_task),
            experiment=experiment,
            definition=get_job_definition(ps_task),
            role=TaskType.PS,
            sequence=index,
            resources=p_resources.get(index),
            node_selector=p_selectors.get(index),
            affinity=p_affinities.get(index),
            tolerations=p_tolerations.get(index),
        )
def create_tensorflow_experiment_jobs(experiment, spawner):
    """Create the master, worker and ps jobs of an experiment.

    Job uuids are taken from ``spawner.job_uuids``: the first entry under
    ``TaskType.MASTER`` for the master job, then every entry under
    ``TaskType.WORKER`` and ``TaskType.PS`` in order. Worker and ps jobs
    receive their position as ``sequence``; that same position is used to
    pick their resources, node selector, affinity and tolerations from the
    values returned by the matching ``TensorflowSpecification`` getters.

    Args:
        experiment: Passed through unchanged to every ``create_job`` call.
        spawner: Object providing ``job_uuids`` (indexed by ``TaskType``) and
            a ``spec`` with the master settings, the cluster definition and
            the environment.
    """
    create_job(
        job_uuid=spawner.job_uuids[TaskType.MASTER][0],
        experiment=experiment,
        resources=spawner.spec.master_resources,
        node_selector=spawner.spec.master_node_selector,
        affinity=spawner.spec.master_affinity,
        tolerations=spawner.spec.master_tolerations,
    )

    cluster, is_distributed = spawner.spec.cluster_def

    def lookup(getter):
        # Every specification getter takes the same three keyword arguments.
        return getter(environment=spawner.spec.environment,
                      cluster=cluster,
                      is_distributed=is_distributed)

    # Workers: resolve the per-index settings, then register each one.
    w_resources = lookup(TensorflowSpecification.get_worker_resources)
    w_selectors = lookup(TensorflowSpecification.get_worker_node_selectors)
    w_affinities = lookup(TensorflowSpecification.get_worker_affinities)
    w_tolerations = lookup(TensorflowSpecification.get_worker_tolerations)

    for index, worker_uuid in enumerate(spawner.job_uuids[TaskType.WORKER]):
        create_job(
            job_uuid=worker_uuid,
            experiment=experiment,
            role=TaskType.WORKER,
            sequence=index,
            resources=w_resources.get(index),
            node_selector=w_selectors.get(index),
            affinity=w_affinities.get(index),
            tolerations=w_tolerations.get(index),
        )

    # Parameter servers: same procedure with the ps getters.
    p_resources = lookup(TensorflowSpecification.get_ps_resources)
    p_selectors = lookup(TensorflowSpecification.get_ps_node_selectors)
    p_affinities = lookup(TensorflowSpecification.get_ps_affinities)
    p_tolerations = lookup(TensorflowSpecification.get_ps_tolerations)

    for index, ps_uuid in enumerate(spawner.job_uuids[TaskType.PS]):
        create_job(
            job_uuid=ps_uuid,
            experiment=experiment,
            role=TaskType.PS,
            sequence=index,
            resources=p_resources.get(index),
            node_selector=p_selectors.get(index),
            affinity=p_affinities.get(index),
            tolerations=p_tolerations.get(index),
        )