Example #1
    def resolve(self, is_head):
        """Returns a copy with values filled out with system defaults."""

        resources = (self.resources or {}).copy()
        assert "CPU" not in resources, resources
        assert "GPU" not in resources, resources
        assert "memory" not in resources, resources
        assert "object_store_memory" not in resources, resources

        num_cpus = self.num_cpus
        if num_cpus is None:
            num_cpus = multiprocessing.cpu_count()

        num_gpus = self.num_gpus
        gpu_ids = ray.utils.get_cuda_visible_devices()
        # Check that the number of GPUs that the raylet wants doesn't
        # exceed the amount allowed by CUDA_VISIBLE_DEVICES.
        if (num_gpus is not None and gpu_ids is not None
                and num_gpus > len(gpu_ids)):
            raise ValueError("Attempting to start raylet with {} GPUs, "
                             "but CUDA_VISIBLE_DEVICES contains {}.".format(
                                 num_gpus, gpu_ids))
        if num_gpus is None:
            # Try to automatically detect the number of GPUs.
            num_gpus = _autodetect_num_gpus()
            # Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES.
            if gpu_ids is not None:
                num_gpus = min(num_gpus, len(gpu_ids))

        # Choose a default object store size.
        system_memory = ray.utils.get_system_memory()
        avail_memory = ray.utils.estimate_available_memory()
        object_store_memory = self.object_store_memory
        if object_store_memory is None:
            object_store_memory = int(avail_memory * 0.3)
            # Cap memory to avoid memory waste and perf issues on large nodes
            if (object_store_memory >
                    ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES):
                logger.warning(
                    "Warning: Capping object memory store to {}GB. ".format(
                        ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES //
                        1e9) +
                    "To increase this further, specify `object_store_memory` "
                    "when calling ray.init() or ray start.")
                object_store_memory = (
                    ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES)

        redis_max_memory = self.redis_max_memory
        if redis_max_memory is None:
            redis_max_memory = min(
                ray_constants.DEFAULT_REDIS_MAX_MEMORY_BYTES,
                max(int(avail_memory * 0.1),
                    ray_constants.REDIS_MINIMUM_MEMORY_BYTES))
        if redis_max_memory < ray_constants.REDIS_MINIMUM_MEMORY_BYTES:
            raise ValueError(
                "Attempting to cap Redis memory usage at {} bytes, "
                "but the minimum allowed is {} bytes.".format(
                    redis_max_memory,
                    ray_constants.REDIS_MINIMUM_MEMORY_BYTES))

        memory = self.memory
        if memory is None:
            memory = (avail_memory - object_store_memory -
                      (redis_max_memory if is_head else 0))
            if memory < 100e6 and memory < 0.05 * system_memory:
                raise ValueError(
                    "After taking into account object store and redis memory "
                    "usage, the amount of memory on this node available for "
                    "tasks and actors ({} GB) is less than {}% of total. "
                    "You can adjust these settings with "
                    "ray.init(memory=<bytes>, "
                    "object_store_memory=<bytes>).".format(
                        round(memory / 1e9, 2),
                        int(100 * (memory / system_memory))))

        logger.info(
            "Starting Ray with {} GiB memory available for workers and up to "
            "{} GiB for objects. You can adjust these settings "
            "with ray.init(memory=<bytes>, "
            "object_store_memory=<bytes>).".format(
                round(
                    ray_constants.round_to_memory_units(
                        memory, round_up=False) / (1024**3), 2),
                round(object_store_memory / (1024**3), 2)))

        spec = ResourceSpec(num_cpus, num_gpus, memory, object_store_memory,
                            resources, redis_max_memory)
        assert spec.resolved()
        return spec
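
Below is a minimal, hypothetical usage sketch for this method. It assumes ResourceSpec is a namedtuple-style class whose fields follow the positional constructor call at the end of resolve() (num_cpus, num_gpus, memory, object_store_memory, resources, redis_max_memory) and that it can be imported from ray.resource_spec (ray._private.resource_spec in newer Ray versions); the import path and keyword defaults are assumptions, not confirmed by the snippet.

# Hypothetical sketch; field names and import path are assumptions.
from ray.resource_spec import ResourceSpec  # may be ray._private.resource_spec

unresolved = ResourceSpec(
    num_cpus=None,             # resolve() falls back to multiprocessing.cpu_count()
    num_gpus=None,             # resolve() auto-detects, capped by CUDA_VISIBLE_DEVICES
    memory=None,               # resolve() derives this from available memory
    object_store_memory=None,  # resolve() defaults to ~30% of available memory
    resources={"custom_resource": 1.0},
    redis_max_memory=None,     # resolve() picks a bounded default
)

# On the head node the Redis cap is subtracted from the worker memory budget.
resolved = unresolved.resolve(is_head=True)
print(resolved.num_cpus, resolved.num_gpus, resolved.object_store_memory)
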
Example #2
    def resolve(self, is_head, node_ip_address=None):
        """Returns a copy with values filled out with system defaults.

        Args:
            is_head (bool): Whether this is the head node.
            node_ip_address (str): The IP address of the node that we are on.
                This is used to automatically create a node id resource.
        """

        resources = (self.resources or {}).copy()
        assert "CPU" not in resources, resources
        assert "GPU" not in resources, resources
        assert "memory" not in resources, resources
        assert "object_store_memory" not in resources, resources

        if node_ip_address is None:
            node_ip_address = ray.services.get_node_ip_address()

        # Automatically create a node id resource on each node. This is
        # queryable with ray.state.node_ids() and ray.state.current_node_id().
        resources[NODE_ID_PREFIX + node_ip_address] = 1.0

        num_cpus = self.num_cpus
        if num_cpus is None:
            num_cpus = multiprocessing.cpu_count()

        num_gpus = self.num_gpus
        gpu_ids = ray.utils.get_cuda_visible_devices()
        # Check that the number of GPUs that the raylet wants doesn't
        # exceed the amount allowed by CUDA_VISIBLE_DEVICES.
        if (num_gpus is not None and gpu_ids is not None
                and num_gpus > len(gpu_ids)):
            raise ValueError("Attempting to start raylet with {} GPUs, "
                             "but CUDA_VISIBLE_DEVICES contains {}.".format(
                                 num_gpus, gpu_ids))
        if num_gpus is None:
            # Try to automatically detect the number of GPUs.
            num_gpus = _autodetect_num_gpus()
            # Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES.
            if gpu_ids is not None:
                num_gpus = min(num_gpus, len(gpu_ids))

        try:
            info_string = _get_gpu_info_string()
            gpu_types = _constraints_from_gpu_info(info_string)
            resources.update(gpu_types)
        except Exception:
            logger.exception("Could not parse gpu information.")

        # Choose a default object store size.
        system_memory = ray.utils.get_system_memory()
        avail_memory = ray.utils.estimate_available_memory()
        object_store_memory = self.object_store_memory
        if object_store_memory is None:
            object_store_memory = int(avail_memory * 0.3)
            # Cap memory to avoid memory waste and perf issues on large nodes
            if (object_store_memory >
                    ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES):
                logger.debug(
                    "Warning: Capping object memory store to {}GB. ".format(
                        ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES //
                        1e9) +
                    "To increase this further, specify `object_store_memory` "
                    "when calling ray.init() or ray start.")
                object_store_memory = (
                    ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES)

        redis_max_memory = self.redis_max_memory
        if redis_max_memory is None:
            redis_max_memory = min(
                ray_constants.DEFAULT_REDIS_MAX_MEMORY_BYTES,
                max(int(avail_memory * 0.1),
                    ray_constants.REDIS_MINIMUM_MEMORY_BYTES))
        if redis_max_memory < ray_constants.REDIS_MINIMUM_MEMORY_BYTES:
            raise ValueError(
                "Attempting to cap Redis memory usage at {} bytes, "
                "but the minimum allowed is {} bytes.".format(
                    redis_max_memory,
                    ray_constants.REDIS_MINIMUM_MEMORY_BYTES))

        memory = self.memory
        if memory is None:
            memory = (avail_memory - object_store_memory -
                      (redis_max_memory if is_head else 0))
            if memory < 100e6 and memory < 0.05 * system_memory:
                raise ValueError(
                    "After taking into account object store and redis memory "
                    "usage, the amount of memory on this node available for "
                    "tasks and actors ({} GB) is less than {}% of total. "
                    "You can adjust these settings with "
                    "ray.init(memory=<bytes>, "
                    "object_store_memory=<bytes>).".format(
                        round(memory / 1e9, 2),
                        int(100 * (memory / system_memory))))

        rounded_memory = ray_constants.round_to_memory_units(memory,
                                                             round_up=False)
        worker_ram = round(rounded_memory / (1024**3), 2)
        object_ram = round(object_store_memory / (1024**3), 2)

        # TODO(maximsmol): this behavior is strange since we do not have a
        # good grasp on when this will get called
        # (you have to study node.py to make a guess)
        with cli_logger.group("Available RAM"):
            cli_logger.labeled_value("Workers", "{} GiB", str(worker_ram))
            cli_logger.labeled_value("Objects", "{} GiB", str(object_ram))
            cli_logger.newline()
            cli_logger.print("To adjust these values, use")
            with cf.with_style("monokai") as c:
                cli_logger.print(
                    "  ray{0}init(memory{1}{2}, "
                    "object_store_memory{1}{2})", c.magenta("."),
                    c.magenta("="), c.purple("<bytes>"))

        cli_logger.old_info(
            logger,
            "Starting Ray with {} GiB memory available for workers and up to "
            "{} GiB for objects. You can adjust these settings "
            "with ray.init(memory=<bytes>, "
            "object_store_memory=<bytes>).", worker_ram, object_ram)

        spec = ResourceSpec(num_cpus, num_gpus, memory, object_store_memory,
                            resources, redis_max_memory)
        assert spec.resolved()
        return spec
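
A similar hedged sketch for this variant, again assuming the ResourceSpec constructor and import path described above (both are assumptions). The new piece is node_ip_address: resolve() registers a custom node-id resource named NODE_ID_PREFIX + node_ip_address with a value of 1.0, and also tries to add GPU-type constraints parsed from the GPU info string.

# Hypothetical sketch; constructor fields, import path, and the "node:" value
# assumed for NODE_ID_PREFIX are assumptions, not confirmed by the snippet.
from ray.resource_spec import ResourceSpec  # may be ray._private.resource_spec

spec = ResourceSpec(None, None, None, None, None, None)

# Passing node_ip_address explicitly skips the ray.services.get_node_ip_address()
# lookup and makes the auto-created node-id resource deterministic.
resolved = spec.resolve(is_head=False, node_ip_address="10.0.0.5")

# resolved.resources now contains the node-id entry (e.g. "node:10.0.0.5": 1.0)
# plus any detected GPU-type constraints and user-specified custom resources.
print(resolved.resources)
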