import numpy as np
import ray
from ray.util.queue import Queue


class WorkQueue:
    def __init__(self, max_depth: int = 8):
        self._queue = Queue(maxsize=max_depth)

    def get_queue(self):
        """
        :return: Ray Queue actor, needed by the consumers.
        """
        return self._queue

    def empty(self):
        """
        :return: True if the queue is empty, False otherwise.
        """
        return self._queue.empty()

    def group(self, labels_all: np.ndarray, probs_all: np.ndarray,
              filename: str, original_shape: tuple,
              inference_time_sec: float, page_number: int) -> dict:
        return {
            "labels_all": labels_all,
            "probs_all": probs_all,
            "filename": filename,
            "original_shape": original_shape,
            "inference_time_sec": inference_time_sec,
            "page_number": page_number
        }

    def ungroup(self, dictionary):
        """
        Use this like:
        labels_all, probs_all, filename, original_shape, \
            inference_time_sec, page_number = ungroup(d)
        :param dictionary: a dictionary created with the group() method.
        :return: the grouped values, in group() argument order.
        """
        return (dictionary["labels_all"], dictionary["probs_all"],
                dictionary["filename"], dictionary["original_shape"],
                dictionary["inference_time_sec"], dictionary["page_number"])

    def push(self, dictionary):
        """
        Push a dictionary of params to post-process. Blocks if the queue is
        full for flow control and proceeds when the queue has enough space.
        :param dictionary: a dictionary created with the group() method.
        :return: None
        """
        # Put the payload in the object store...
        ref = ray.put(dictionary)
        # ...and put the ref in the queue.
        self._queue.put(ref)
        return None

    def pop(self):
        """
        :return: a dictionary created with the group() method; use ungroup()
        to unpack, or look up keys individually.
        """
        return self._queue.get()
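# Usage sketch (illustrative, not part of the original module). It shows a
# producer pushing one grouped inference result and a consumer popping and
# unpacking it. The filename, shapes, and arrays below are made-up example
# data, not values from the original code.
import time

ray.init(ignore_reinit_error=True)

work_queue = WorkQueue(max_depth=4)

# Producer side: group() packs the outputs; push() blocks when the queue is
# full, which throttles the producer for flow control.
start = time.time()
labels = np.zeros((256, 256), dtype=np.int32)         # placeholder labels
probs = np.random.rand(256, 256).astype(np.float32)   # placeholder probabilities
work_queue.push(work_queue.group(labels, probs, "scan_001.tif",
                                 (4096, 4096), time.time() - start,
                                 page_number=0))

# Consumer side: pop() blocks until an item is available; ungroup() unpacks it.
(labels_all, probs_all, filename, original_shape,
 inference_time_sec, page_number) = work_queue.ungroup(work_queue.pop())
print(filename, original_shape, page_number)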
from typing import Any, Callable, Dict, Optional, Tuple
import logging

import ray
import xgboost as xgb
from ray.util.queue import Queue

# Note: the `_`-prefixed helpers (_assert_ray_support, _create_actor, etc.)
# and RayDMatrix are defined elsewhere in the module.
logger = logging.getLogger(__name__)


def _train(params: Dict,
           dtrain: RayDMatrix,
           *args,
           evals=(),
           num_actors: int = 4,
           cpus_per_actor: int = 0,
           gpus_per_actor: int = -1,
           resources_per_actor: Optional[Dict] = None,
           checkpoint_prefix: Optional[str] = None,
           checkpoint_path: str = "/tmp",
           checkpoint_frequency: int = 5,
           **kwargs) -> Tuple[xgb.Booster, Dict, Dict]:
    _assert_ray_support()

    if not ray.is_initialized():
        ray.init()

    if gpus_per_actor == -1:
        gpus_per_actor = 0
        if "tree_method" in params and params["tree_method"].startswith("gpu"):
            gpus_per_actor = 1

    if cpus_per_actor <= 0:
        cluster_cpus = _ray_get_cluster_cpus() or 1
        cpus_per_actor = min(
            int(_get_max_node_cpus() or 1),
            int(cluster_cpus // num_actors))

    if "nthread" in params:
        if params["nthread"] > cpus_per_actor:
            raise ValueError(
                "Specified number of threads greater than number of CPUs. "
                "\nFIX THIS by passing a lower value for the `nthread` "
                "parameter or a higher number for `cpus_per_actor`.")
    else:
        params["nthread"] = cpus_per_actor

    # Create queue for communication from worker to caller.
    # Always create queue.
    queue = Queue()

    # Create remote actors
    actors = [
        _create_actor(i, num_actors, cpus_per_actor, gpus_per_actor,
                      resources_per_actor, queue, checkpoint_prefix,
                      checkpoint_path, checkpoint_frequency)
        for i in range(num_actors)
    ]
    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers
    wait_load = []
    for _, actor in enumerate(actors):
        wait_load.extend(_trigger_data_load(actor, dtrain, evals))

    try:
        ray.get(wait_load)
    except Exception:
        _shutdown(actors, queue, force=True)
        raise

    logger.info("[RayXGBoost] Starting XGBoost training.")

    # Start tracker
    env = _start_rabit_tracker(num_actors)
    rabit_args = [("%s=%s" % item).encode() for item in env.items()]

    # Train
    fut = [
        actor.train.remote(rabit_args, params, dtrain, evals, *args, **kwargs)
        for actor in actors
    ]

    callback_returns = [list() for _ in range(len(actors))]
    try:
        not_ready = fut
        while not_ready:
            if queue:
                while not queue.empty():
                    (actor_rank, item) = queue.get()
                    if isinstance(item, Callable):
                        item()
                    else:
                        callback_returns[actor_rank].append(item)
            ready, not_ready = ray.wait(not_ready, timeout=0)
            logger.debug("[RayXGBoost] Waiting for results...")
            ray.get(ready)
        # Once everything is ready
        ray.get(fut)
    # The inner loop should catch all exceptions
    except Exception:
        _shutdown(remote_workers=actors, queue=queue, force=True)
        raise

    # All results should be the same because of Rabit tracking. So we just
    # return the first one.
    res: Dict[str, Any] = ray.get(fut[0])
    bst = res["bst"]
    evals_result = res["evals_result"]

    additional_results = {}
    if callback_returns:
        additional_results["callback_returns"] = callback_returns

    all_res = ray.get(fut)
    total_n = sum(res["train_n"] or 0 for res in all_res)

    logger.info(f"[RayXGBoost] Finished XGBoost training on training data "
                f"with total N={total_n:,}.")

    if checkpoint_prefix:
        _cleanup(checkpoint_prefix, checkpoint_path, num_actors)

    _shutdown(remote_workers=actors, queue=queue, force=False)

    return bst, evals_result, additional_results
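# Usage sketch (illustrative). _train() is a private helper, so this only
# shows the call shape; real callers would normally go through the package's
# public train() wrapper. It assumes RayDMatrix accepts an in-memory
# (data, label) pair and that extra keyword arguments such as
# num_boost_round are forwarded to xgb.train via **kwargs, as the actor
# call above suggests. The dataset below is synthetic example data.
import numpy as np

X = np.random.rand(1000, 8)
y = (X[:, 0] > 0.5).astype(int)
dtrain = RayDMatrix(X, y)

bst, evals_result, additional_results = _train(
    {"objective": "binary:logistic", "eval_metric": "logloss"},
    dtrain,
    evals=[(dtrain, "train")],
    num_actors=2,
    num_boost_round=10)

print(evals_result["train"]["logloss"][-1])
bst.save_model("model.xgb")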