Esempio n. 1
0
 def tolerations(self):
     """Return the worker tolerations keyed by task type.

     Unpacks ``self.spec.cluster_def`` into the cluster definition and the
     distributed flag, passes both together with ``self.spec.config.mpi``
     to ``MPISpecification.get_worker_tolerations``, and wraps the result
     in a dict whose only key is ``TaskType.WORKER``.
     """
     # cluster_def must unpack into exactly two items.
     cluster_def = self.spec.cluster_def
     cluster, is_distributed = cluster_def
     tolerations_for_workers = MPISpecification.get_worker_tolerations(
         environment=self.spec.config.mpi,
         cluster=cluster,
         is_distributed=is_distributed,
     )
     return {TaskType.WORKER: tolerations_for_workers}
Esempio n. 2
0
def create_mpi_experiment_jobs(experiment, spawner):
    """Call ``create_job`` once for every worker job uuid of the spawner.

    The uuids are read from ``spawner.job_uuids[TaskType.WORKER]``; every
    job is created with the ``TaskType.WORKER`` role.

    * The first uuid (index 0) is created with the ``master_*`` settings
      of ``spawner.spec`` and without an explicit ``sequence`` argument.
    * Every later uuid is created with ``sequence`` set to its index and
      with the per-index settings looked up (via ``.get(index)``) in the
      values returned by the ``MPISpecification.get_worker_*`` helpers.

    Args:
        experiment: Forwarded unchanged to ``create_job``.
        spawner: Object exposing ``spec`` (with ``cluster_def``,
            ``config.mpi`` and the ``master_*`` attributes) and
            ``job_uuids``.
    """
    cluster, is_distributed = spawner.spec.cluster_def
    # All four lookups take the same keyword arguments.
    lookup_kwargs = {
        'environment': spawner.spec.config.mpi,
        'cluster': cluster,
        'is_distributed': is_distributed,
    }
    resources_by_index = MPISpecification.get_worker_resources(**lookup_kwargs)
    node_selectors_by_index = MPISpecification.get_worker_node_selectors(**lookup_kwargs)
    affinities_by_index = MPISpecification.get_worker_affinities(**lookup_kwargs)
    tolerations_by_index = MPISpecification.get_worker_tolerations(**lookup_kwargs)

    worker_uuids = spawner.job_uuids[TaskType.WORKER]
    for index, job_uuid in enumerate(worker_uuids):
        if index != 0:
            # Regular worker: per-index settings and an explicit sequence.
            create_job(
                job_uuid=job_uuid,
                experiment=experiment,
                role=TaskType.WORKER,
                sequence=index,
                resources=resources_by_index.get(index),
                node_selector=node_selectors_by_index.get(index),
                affinity=affinities_by_index.get(index),
                tolerations=tolerations_by_index.get(index),
            )
            continue

        # First worker: uses the spec's master settings, no sequence passed.
        create_job(
            job_uuid=job_uuid,
            experiment=experiment,
            role=TaskType.WORKER,
            resources=spawner.spec.master_resources,
            node_selector=spawner.spec.master_node_selector,
            affinity=spawner.spec.master_affinity,
            tolerations=spawner.spec.master_tolerations,
        )