Example #1
    def _update_avail_resources(self, num_retries=5):
        resources = None
        for i in range(num_retries):
            if i > 0:
                logger.warning(
                    "Cluster resources not detected or are 0. Attempt #"
                    "%s...", i + 1)
                time.sleep(0.5)
            try:
                resources = ray.cluster_resources()
            except Exception:
                # TODO(rliaw): Remove this when local mode is fixed.
                # https://github.com/ray-project/ray/issues/4147
                logger.debug("Using resources for local machine.")
                resources = ResourceSpec().resolve(True).to_resource_dict()
            if resources:
                break

        if not resources:
            # NOTE: This hides the possibility that Ray may be waiting for
            # clients to connect.
            resources = resources or {}  # Guard: may still be None if no retry ran.
            resources.setdefault("CPU", 0)
            resources.setdefault("GPU", 0)
            logger.warning("Cluster resources cannot be detected or are 0. "
                           "You can resume this experiment by passing in "
                           "`resume=True` to `run`.")

        resources = resources.copy()
        num_cpus = resources.pop("CPU", 0)
        num_gpus = resources.pop("GPU", 0)
        memory = ray_constants.from_memory_units(resources.pop("memory", 0))
        object_store_memory = ray_constants.from_memory_units(
            resources.pop("object_store_memory", 0))
        custom_resources = resources

        if num_gpus == 0:
            warnings.warn(
                "No GPU resources found; assuming this is a local test and "
                "using CPU resources as fake GPUs instead.")
            # Local test: report one fake GPU per available CPU.
            num_gpus = num_cpus
            self._fake_gpus = True
        else:
            self._fake_gpus = False

        avail_resources = Resources(
            int(num_cpus),
            int(num_gpus),
            memory=int(memory),
            object_store_memory=int(object_store_memory),
            custom_resources=custom_resources,
        )

        assert self.idle_resources.is_nonnegative(), (
            "Cluster removed resources from running trials!")

        self._avail_resources = avail_resources
        self._last_resource_refresh = time.time()
        self._resources_initialized = True
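
The method above boils down to: poll ray.cluster_resources() with retries, then pop the reserved keys (CPU, GPU, memory, object_store_memory) off the dict and treat everything left over as custom resources. A minimal standalone sketch of that split, with a hypothetical sample dict standing in for a live cluster:

# Hypothetical resource dict standing in for ray.cluster_resources().
cluster = {"CPU": 8.0, "GPU": 2.0, "memory": 64, "accelerator_type:V100": 2.0}

resources = cluster.copy()
num_cpus = resources.pop("CPU", 0)
num_gpus = resources.pop("GPU", 0)
memory = resources.pop("memory", 0)
object_store_memory = resources.pop("object_store_memory", 0)
custom_resources = resources  # Whatever was not popped is custom.

print(num_cpus, num_gpus, memory, object_store_memory, custom_resources)
# 8.0 2.0 64 0 {'accelerator_type:V100': 2.0}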
Example #2
    def get_resource_spec(self):
        """Resolve and return the current resource spec for the node."""
        def merge_resources(env_dict, params_dict):
            """Merge two dictionaries, picking from the second in the event of a conflict.
            Also emit a warning on every conflict.
            """
            result = params_dict.copy()
            result.update(env_dict)

            for key in set(env_dict.keys()).intersection(
                    set(params_dict.keys())):
                logger.warning("Autoscaler is overriding your resource:"
                               "{}: {} with {}.".format(
                                   key, params_dict[key], env_dict[key]))
            return result

        env_resources = {}
        env_string = os.getenv(ray_constants.RESOURCES_ENVIRONMENT_VARIABLE)
        if env_string:
            env_resources = json.loads(env_string)

        if not self._resource_spec:
            resources = merge_resources(env_resources,
                                        self._ray_params.resources)
            self._resource_spec = ResourceSpec(
                self._ray_params.num_cpus, self._ray_params.num_gpus,
                self._ray_params.memory, self._ray_params.object_store_memory,
                resources, self._ray_params.redis_max_memory).resolve(
                    is_head=self.head, node_ip_address=self.node_ip_address)
        return self._resource_spec
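
Since merge_resources copies params_dict first and then updates with env_dict, the environment dictionary wins every conflict; that is exactly what the override warning reports. A small standalone sketch of that precedence, where the input dicts are hypothetical sample data and print stands in for logger.warning:

def merge_resources(env_dict, params_dict):
    # Env values overwrite param values on conflict, as in the method above.
    result = params_dict.copy()
    result.update(env_dict)
    for key in set(env_dict) & set(params_dict):
        print("Autoscaler is overriding your resource: "
              "{}: {} with {}.".format(key, params_dict[key], env_dict[key]))
    return result

env_resources = {"CPU": 4}              # e.g. parsed from the JSON env variable
param_resources = {"CPU": 8, "GPU": 1}  # e.g. ray_params.resources
print(merge_resources(env_resources, param_resources))
# Autoscaler is overriding your resource: CPU: 8 with 4.
# {'CPU': 4, 'GPU': 1}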