def resolve(self, is_head): """Returns a copy with values filled out with system defaults.""" resources = (self.resources or {}).copy() assert "CPU" not in resources, resources assert "GPU" not in resources, resources assert "memory" not in resources, resources assert "object_store_memory" not in resources, resources num_cpus = self.num_cpus if num_cpus is None: num_cpus = multiprocessing.cpu_count() num_gpus = self.num_gpus gpu_ids = ray.utils.get_cuda_visible_devices() # Check that the number of GPUs that the raylet wants doesn't # excede the amount allowed by CUDA_VISIBLE_DEVICES. if (num_gpus is not None and gpu_ids is not None and num_gpus > len(gpu_ids)): raise Exception("Attempting to start raylet with {} GPUs, " "but CUDA_VISIBLE_DEVICES contains {}.".format( num_gpus, gpu_ids)) if num_gpus is None: # Try to automatically detect the number of GPUs. num_gpus = _autodetect_num_gpus() # Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES. if gpu_ids is not None: num_gpus = min(num_gpus, len(gpu_ids)) # Choose a default object store size. system_memory = ray.utils.get_system_memory() avail_memory = ray.utils.estimate_available_memory() object_store_memory = self.object_store_memory if object_store_memory is None: object_store_memory = int(avail_memory * 0.3) # Cap memory to avoid memory waste and perf issues on large nodes if (object_store_memory > ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES): logger.warning( "Warning: Capping object memory store to {}GB. ".format( ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES // 1e9) + "To increase this further, specify `object_store_memory` " "when calling ray.init() or ray start.") object_store_memory = ( ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES) redis_max_memory = self.redis_max_memory if redis_max_memory is None: redis_max_memory = min( ray_constants.DEFAULT_REDIS_MAX_MEMORY_BYTES, max(int(avail_memory * 0.1), ray_constants.REDIS_MINIMUM_MEMORY_BYTES)) if redis_max_memory < ray_constants.REDIS_MINIMUM_MEMORY_BYTES: raise ValueError( "Attempting to cap Redis memory usage at {} bytes, " "but the minimum allowed is {} bytes.".format( redis_max_memory, ray_constants.REDIS_MINIMUM_MEMORY_BYTES)) memory = self.memory if memory is None: memory = (avail_memory - object_store_memory - (redis_max_memory if is_head else 0)) if memory < 100e6 and memory < 0.05 * system_memory: raise ValueError( "After taking into account object store and redis memory " "usage, the amount of memory on this node available for " "tasks and actors ({} GB) is less than {}% of total. " "You can adjust these settings with " "ray.init(memory=<bytes>, " "object_store_memory=<bytes>).".format( round(memory / 1e9, 2), int(100 * (memory / system_memory)))) logger.info( "Starting Ray with {} GiB memory available for workers and up to " "{} GiB for objects. You can adjust these settings " "with ray.init(memory=<bytes>, " "object_store_memory=<bytes>).".format( round( ray_constants.round_to_memory_units( memory, round_up=False) / (1024**3), 2), round(object_store_memory / (1024**3), 2))) spec = ResourceSpec(num_cpus, num_gpus, memory, object_store_memory, resources, redis_max_memory) assert spec.resolved() return spec
def resolve(self, is_head, node_ip_address=None):
    """Returns a copy with values filled out with system defaults.

    Args:
        is_head (bool): Whether this is the head node.
        node_ip_address (str): The IP address of the node that we are on.
            This is used to automatically create a node id resource.
    """
    resources = (self.resources or {}).copy()
    assert "CPU" not in resources, resources
    assert "GPU" not in resources, resources
    assert "memory" not in resources, resources
    assert "object_store_memory" not in resources, resources

    if node_ip_address is None:
        node_ip_address = ray.services.get_node_ip_address()

    # Automatically create a node id resource on each node. This is
    # queryable with ray.state.node_ids() and ray.state.current_node_id().
    resources[NODE_ID_PREFIX + node_ip_address] = 1.0

    num_cpus = self.num_cpus
    if num_cpus is None:
        num_cpus = multiprocessing.cpu_count()

    num_gpus = self.num_gpus
    gpu_ids = ray.utils.get_cuda_visible_devices()
    # Check that the number of GPUs that the raylet wants doesn't
    # exceed the amount allowed by CUDA_VISIBLE_DEVICES.
    if (num_gpus is not None and gpu_ids is not None
            and num_gpus > len(gpu_ids)):
        raise ValueError("Attempting to start raylet with {} GPUs, "
                         "but CUDA_VISIBLE_DEVICES contains {}.".format(
                             num_gpus, gpu_ids))
    if num_gpus is None:
        # Try to automatically detect the number of GPUs.
        num_gpus = _autodetect_num_gpus()
        # Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES.
        if gpu_ids is not None:
            num_gpus = min(num_gpus, len(gpu_ids))

    try:
        info_string = _get_gpu_info_string()
        gpu_types = _constraints_from_gpu_info(info_string)
        resources.update(gpu_types)
    except Exception:
        logger.exception("Could not parse gpu information.")

    # Choose a default object store size.
    system_memory = ray.utils.get_system_memory()
    avail_memory = ray.utils.estimate_available_memory()
    object_store_memory = self.object_store_memory
    if object_store_memory is None:
        object_store_memory = int(avail_memory * 0.3)
        # Cap memory to avoid memory waste and perf issues on large nodes.
        if (object_store_memory >
                ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES):
            logger.debug(
                "Warning: Capping object memory store to {}GB. ".format(
                    ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES //
                    1e9) +
                "To increase this further, specify `object_store_memory` "
                "when calling ray.init() or ray start.")
            object_store_memory = (
                ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES)

    redis_max_memory = self.redis_max_memory
    if redis_max_memory is None:
        redis_max_memory = min(
            ray_constants.DEFAULT_REDIS_MAX_MEMORY_BYTES,
            max(
                int(avail_memory * 0.1),
                ray_constants.REDIS_MINIMUM_MEMORY_BYTES))
    if redis_max_memory < ray_constants.REDIS_MINIMUM_MEMORY_BYTES:
        raise ValueError(
            "Attempting to cap Redis memory usage at {} bytes, "
            "but the minimum allowed is {} bytes.".format(
                redis_max_memory,
                ray_constants.REDIS_MINIMUM_MEMORY_BYTES))

    memory = self.memory
    if memory is None:
        memory = (avail_memory - object_store_memory -
                  (redis_max_memory if is_head else 0))
        if memory < 100e6 and memory < 0.05 * system_memory:
            raise ValueError(
                "After taking into account object store and redis memory "
                "usage, the amount of memory on this node available for "
                "tasks and actors ({} GB) is less than {}% of total. "
                "You can adjust these settings with "
                "ray.init(memory=<bytes>, "
                "object_store_memory=<bytes>).".format(
                    round(memory / 1e9, 2),
                    int(100 * (memory / system_memory))))

    rounded_memory = ray_constants.round_to_memory_units(
        memory, round_up=False)
    worker_ram = round(rounded_memory / (1024**3), 2)
    object_ram = round(object_store_memory / (1024**3), 2)

    # TODO(maximsmol): this behavior is strange since we do not have a
    # good grasp on when this will get called
    # (you have to study node.py to make a guess)
    with cli_logger.group("Available RAM"):
        cli_logger.labeled_value("Workers", "{} GiB", str(worker_ram))
        cli_logger.labeled_value("Objects", "{} GiB", str(object_ram))
    cli_logger.newline()
    cli_logger.print("To adjust these values, use")
    with cf.with_style("monokai") as c:
        cli_logger.print(
            "  ray{0}init(memory{1}{2}, "
            "object_store_memory{1}{2})", c.magenta("."), c.magenta("="),
            c.purple("<bytes>"))
    cli_logger.old_info(
        logger,
        "Starting Ray with {} GiB memory available for workers and up to "
        "{} GiB for objects. You can adjust these settings "
        "with ray.init(memory=<bytes>, "
        "object_store_memory=<bytes>).", worker_ram, object_ram)

    spec = ResourceSpec(num_cpus, num_gpus, memory, object_store_memory,
                        resources, redis_max_memory)
    assert spec.resolved()
    return spec