def schedule_fold_model_fit(self, model_base, fold_ctx, kwargs):
    """Schedule an asynchronous model fit for one CV fold on the Ray cluster.

    Puts the arguments into the object store, reserves a 2-CPU bundle via a
    STRICT_SPREAD placement group, launches ``model_fit_task_ray`` inside it,
    and records the pending job on ``self.jobs`` for later collection.

    Args:
        model_base: Model template passed through to the remote fit task.
        fold_ctx: Per-fold context passed through to the remote fit task.
        kwargs: Extra keyword arguments forwarded to the remote fit task.
    """
    args = [model_base, fold_ctx, kwargs]
    # Put each argument into the object store once so the remote task
    # receives references instead of re-serializing large payloads.
    args_refs = [ray.put(arg) for arg in args]
    print('...model_fit')
    pg = placement_group([{"CPU": 2}], strategy="STRICT_SPREAD")
    # Block until the bundle is actually reserved before scheduling onto it.
    ray.get(pg.ready())
    print(placement_group_table(pg))
    # NOTE(review): pg is never released with remove_placement_group — the
    # 2-CPU reservation lives until the job/driver exits; confirm intentional.
    results_ref = model_fit_task_ray.options(placement_group=pg).remote(
        *args_refs)
    # NOTE(review): time_start_fold and on_fit_end_fn are not parameters and
    # not defined in this scope — presumably module-level/outer-scope names
    # set by the caller; verify, otherwise this raises NameError.
    self.jobs.append((results_ref, time_start_fold, on_fit_end_fn))
def placement_group_factory():
    """Create a PACK-strategy placement group for a head plus worker actors.

    Builds one 1-CPU head bundle followed by ``num_actors`` identical child
    bundles, each combining the per-actor CPU/GPU counts with any extra
    custom resources. Reads ``cpus_per_actor``, ``gpus_per_actor``,
    ``resources_per_actor`` and ``num_actors`` from the enclosing scope.
    """
    extra_resources = resources_per_actor if resources_per_actor is not None else {}
    bundles = [{"CPU": 1}]  # head bundle
    for _ in range(num_actors):
        worker_bundle = {"CPU": cpus_per_actor, "GPU": gpus_per_actor}
        worker_bundle.update(extra_resources)
        bundles.append(worker_bundle)
    return placement_group(bundles, strategy="PACK")
def get_remote_worker_options(
        num_workers: int,
        num_cpus_per_worker: int,
        num_gpus_per_worker: int,
        num_workers_per_host: Optional[int],
        timeout_s: Optional[int],
) -> "Tuple[Dict[str, Any], Any]":
    """Returns the option for remote workers.

    Args:
        num_workers: Number of training workers to include in world.
        num_cpus_per_worker: Number of CPU resources to reserve
            per training worker.
        num_gpus_per_worker: Number of GPU resources to reserve
            per training worker.
        num_workers_per_host: Optional[int]: Number of workers to
            colocate per host.
        timeout_s: Seconds before the torch process group
            times out. Useful when machines are unreliable. Defaults
            to 60 seconds. This value is also reused for triggering
            placement timeouts if forcing colocation.

    Returns:
        type: option that contains CPU/GPU count of
            the remote worker and the placement group information.
        pg: return a reference to the placement group
    """
    # NOTE: return annotation is a forward reference (not evaluated at
    # runtime); the original annotated `(Dict, placement_group)`, which is a
    # tuple of objects, not a valid type.
    pg = None
    options = dict(num_cpus=num_cpus_per_worker, num_gpus=num_gpus_per_worker)
    if num_workers_per_host:
        # Floor division instead of int(a / b): the float round-trip can
        # misround for very large counts, and // states the intent directly.
        # Any remainder workers are dropped from the colocation bundles,
        # matching the original truncating behavior.
        num_hosts = num_workers // num_workers_per_host
        cpus_per_node = num_cpus_per_worker * num_workers_per_host
        gpus_per_node = num_gpus_per_worker * num_workers_per_host
        bundle = {"CPU": cpus_per_node, "GPU": gpus_per_node}
        # One identical bundle per host; STRICT_SPREAD forces each bundle
        # onto a distinct node, giving true per-host colocation.
        all_bundles = [bundle] * num_hosts
        pg = placement_group(all_bundles, strategy="STRICT_SPREAD")
        logger.debug("Waiting for placement_group to start.")
        ray.get(pg.ready(), timeout=timeout_s)
        logger.debug("Placement_group started.")
        options["placement_group"] = pg
    return options, pg
def test_mpi_with_pg(ray_cluster):
    """Run a 2-process MPI job pinned to an explicit placement group and
    check that both workers report the job id.

    Args:
        ray_cluster: pytest fixture providing a running Ray cluster.
    """
    pg = placement_group(bundles=[{"CPU": 2}], strategy="STRICT_SPREAD")
    try:
        with create_mpi_job(job_name="test",
                            world_size=2,
                            num_cpus_per_process=1,
                            num_processes_per_node=2,
                            timeout=5,
                            mpi_type="mpich",
                            placement_group=pg,
                            placement_group_bundle_indexes=[0]) as job:

            def func(context: WorkerContext):
                return context.job_id

            results = job.run(func)
            assert len(results) == 2
            assert results[0] == results[1] == "test"
    finally:
        # Release the reservation even when job.run or an assertion fails;
        # the original only removed the group on success, leaking 2 CPUs
        # into every later test sharing this cluster.
        remove_placement_group(pg)
def placement_group_factory():
    """Build a placement group (default strategy) with one 4-CPU driver
    bundle followed by two single-unit "custom" resource bundles."""
    driver_bundle = {"CPU": 4, "GPU": 0, "custom": 0}
    worker_bundles = [{"custom": 1} for _ in range(2)]
    return placement_group([driver_bundle] + worker_bundles)