def _update_avail_resources(self, num_retries=5):
    """Refresh the cached total resources reported by the Ray cluster.

    Queries ``ray.cluster_resources()`` up to ``num_retries`` times, sleeping
    0.5 seconds before every attempt after the first, and stops at the first
    non-empty result. If the query raises, the resources resolved from a
    local ``ResourceSpec`` are used for that attempt instead.

    When nothing could be detected, CPU and GPU default to 0 and a warning
    is logged. When the GPU count is 0, the CPU count is reused as the GPU
    count and ``self._fake_gpus`` is set to True (otherwise False).

    On completion ``self._avail_resources``, ``self._last_resource_refresh``
    and ``self._resources_initialized`` are updated.

    Args:
        num_retries: Maximum number of attempts to query the cluster.
    """
    resources = None
    for i in range(num_retries):
        if i > 0:
            logger.warning(
                "Cluster resources not detected or are 0. Attempt #"
                "%s...", i + 1)
            time.sleep(0.5)
        try:
            resources = ray.cluster_resources()
        except Exception:
            # TODO(rliaw): Remove this when local mode is fixed.
            # https://github.com/ray-project/ray/issues/4147
            logger.debug("Using resources for local machine.")
            resources = ResourceSpec().resolve(True).to_resource_dict()
        if resources:
            break

    if not resources:
        # ``resources`` is still None when the loop never ran
        # (num_retries <= 0) or the query returned None; normalise it so the
        # zero-resource fallback below works instead of raising
        # AttributeError.
        if resources is None:
            resources = {}
        # NOTE: This hides the possibility that Ray may be waiting for
        # clients to connect.
        resources.setdefault("CPU", 0)
        resources.setdefault("GPU", 0)
        logger.warning("Cluster resources cannot be detected or are 0. "
                       "You can resume this experiment by passing in "
                       "`resume=True` to `run`.")

    # Work on a copy so the pops below do not mutate the mapping we were
    # handed; whatever remains after the pops is treated as custom resources.
    resources = resources.copy()
    num_cpus = resources.pop("CPU", 0)
    num_gpus = resources.pop("GPU", 0)
    memory = ray_constants.from_memory_units(resources.pop("memory", 0))
    object_store_memory = ray_constants.from_memory_units(
        resources.pop("object_store_memory", 0))
    custom_resources = resources

    if num_gpus == 0:
        warnings.warn(
            "No GPU resources found, assuming local test, using CPU resources instead"
        )
        # local test
        num_gpus = num_cpus
        self._fake_gpus = True
    else:
        self._fake_gpus = False

    avail_resources = Resources(
        int(num_cpus),
        int(num_gpus),
        memory=int(memory),
        object_store_memory=int(object_store_memory),
        custom_resources=custom_resources,
    )
    # NOTE(review): this check runs before ``self._avail_resources`` is
    # replaced, so it presumably validates against the previous snapshot --
    # confirm that is the intent.
    assert (self.idle_resources.is_nonnegative()
            ), "Cluster removed resources from running trials!"
    self._avail_resources = avail_resources
    self._last_resource_refresh = time.time()
    self._resources_initialized = True
def _update_avail_resources(self, num_retries=5):
    """Refresh the cached total resources reported by the Ray cluster.

    Returns immediately when less than ``self._refresh_period`` seconds have
    elapsed since ``self._last_resource_refresh``. Otherwise queries
    ``ray.cluster_resources()`` up to ``num_retries`` times, sleeping 0.5
    seconds before every attempt after the first, and stops at the first
    non-empty result. If the query raises, the resources resolved from a
    local ``ResourceSpec`` are used for that attempt instead.

    When nothing could be detected, CPU and GPU default to 0 and a warning
    is logged. On completion ``self._avail_resources``,
    ``self._last_resource_refresh`` and ``self._resources_initialized`` are
    updated.

    Args:
        num_retries: Maximum number of attempts to query the cluster.
    """
    if time.time() - self._last_resource_refresh < self._refresh_period:
        return
    logger.debug("Checking Ray cluster resources.")
    resources = None
    for i in range(num_retries):
        if i > 0:
            logger.warning(
                "Cluster resources not detected or are 0. Attempt #"
                "%s...", i + 1)
            time.sleep(0.5)
        try:
            resources = ray.cluster_resources()
        except Exception as exc:
            # TODO(rliaw): Remove this when local mode is fixed.
            # https://github.com/ray-project/ray/issues/4147
            # Lazy %-formatting: the message is only built if DEBUG is on.
            logger.debug("%s: Using resources for local machine.", exc)
            resources = ResourceSpec().resolve(True).to_resource_dict()
        if resources:
            break

    if not resources:
        # ``resources`` is still None when the loop never ran
        # (num_retries <= 0) or the query returned None; normalise it so the
        # zero-resource fallback below works instead of raising
        # AttributeError.
        if resources is None:
            resources = {}
        # NOTE: This hides the possibility that Ray may be waiting for
        # clients to connect.
        resources.setdefault("CPU", 0)
        resources.setdefault("GPU", 0)
        logger.warning("Cluster resources cannot be detected or are 0. "
                       "You can resume this experiment by passing in "
                       "`resume=True` to `run`.")

    # Work on a copy so the pops below do not mutate the mapping we were
    # handed; whatever remains after the pops is treated as custom resources.
    resources = resources.copy()
    num_cpus = resources.pop("CPU", 0)
    num_gpus = resources.pop("GPU", 0)
    memory = ray_constants.from_memory_units(resources.pop("memory", 0))
    object_store_memory = ray_constants.from_memory_units(
        resources.pop("object_store_memory", 0))
    custom_resources = resources

    self._avail_resources = Resources(
        int(num_cpus),
        int(num_gpus),
        memory=int(memory),
        object_store_memory=int(object_store_memory),
        custom_resources=custom_resources)
    self._last_resource_refresh = time.time()
    self._resources_initialized = True