def request_resources(num_cpus: Optional[int] = None,
                      bundles: Optional[List[dict]] = None) -> None:
    """Ask the autoscaler to make some CPU or GPU resources available.

    Meant to be called, e.g. on a node, ahead of submitting a batch of
    ``ray.remote`` calls so that resources become available rapidly.

    The request is written to the internal KV store with ``overwrite=True``,
    so each call replaces the previous request.

    Args:
        num_cpus (int): Scale the cluster to ensure this number of CPUs are
            available. This request is persistent until another call to
            request_resources() is made. Expanded into ``num_cpus`` bundles
            of ``{"CPU": 1}``.
        bundles (List[ResourceDict]): Scale the cluster to ensure this set of
            resource shapes can fit. This request is persistent until another
            call to request_resources() is made. Appended after the CPU
            bundles.

    Raises:
        RuntimeError: If Ray has not been initialized.
    """
    if not ray.is_initialized():
        raise RuntimeError("Ray is not initialized yet")

    # Falsy arguments (None, 0, empty list) contribute nothing.
    requested = []
    if num_cpus:
        requested.extend([{"CPU": 1}] * num_cpus)
    if bundles:
        requested.extend(bundles)

    payload = json.dumps(requested)
    _internal_kv_put(
        AUTOSCALER_RESOURCE_REQUEST_CHANNEL, payload, overwrite=True)
def _handle_failure(self, error):
    """Report a monitor-loop failure and optionally tear down workers.

    Logs the active exception, kills autoscaler workers when fate-sharing
    is enabled through ``RAY_AUTOSCALER_FATESHARE_WORKERS=1``, records the
    error message in the internal KV store (when it is initialized) and
    publishes it to drivers through a ``GcsPublisher``.

    Args:
        error: The failure to report; it is interpolated into the message.
    """
    logger.exception("Error in monitor loop")

    fateshare_enabled = (
        os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1"
    )
    if self.autoscaler is not None and fateshare_enabled:
        self.autoscaler.kill_workers()
        # Take down autoscaler workers if necessary.
        self.destroy_autoscaler_workers()

    # Something went wrong, so push an error to all current and future
    # drivers.
    message = f"The autoscaler failed with the following error:\n{error}"
    if _internal_kv_initialized():
        _internal_kv_put(
            ray_constants.DEBUG_AUTOSCALING_ERROR, message, overwrite=True
        )

    # NOTE(review): `args` is not a parameter or attribute here; presumably
    # a module-level argparse namespace -- confirm it is always defined.
    publisher = GcsPublisher(address=args.gcs_address)

    from ray._private.utils import publish_error_to_driver

    publish_error_to_driver(
        ray_constants.MONITOR_DIED_ERROR,
        message,
        gcs_publisher=publisher,
    )
def register_actor(name, actor_handle):
    """Register a named actor under a string key.

    Args:
        name: The name of the named actor.
        actor_handle: The actor object to be associated with this name

    Raises:
        TypeError: If ``name`` is not a string or ``actor_handle`` is not an
            ``ActorHandle``.
        ValueError: If ``get_actor(name)`` succeeds, i.e. the name is taken.
    """
    if not isinstance(name, str):
        raise TypeError("The name argument must be a string.")
    if not isinstance(actor_handle, ray.actor.ActorHandle):
        raise TypeError("The actor_handle argument must be an ActorHandle "
                        "object.")
    storage_key = _calculate_key(name)

    # A ValueError from the lookup is treated as "not registered yet".
    try:
        get_actor(name)
    except ValueError:
        pass
    else:
        raise ValueError("An actor with name={} already exists".format(name))

    # Add the actor to Redis if it does not already exist.
    _internal_kv_put(storage_key, pickle.dumps(actor_handle))
def _run(self):
    """Run the monitor loop.

    Each iteration refreshes the raylet map, load metrics and resource
    requests, runs an autoscaler update when an autoscaler is configured,
    publishes a JSON status report to the internal KV store, and then
    sleeps for ``AUTOSCALER_UPDATE_INTERVAL_S``. The loop never returns.
    """
    while True:
        self.update_raylet_map()
        self.update_load_metrics()
        self.update_resource_requests()

        report = {}
        report["load_metrics_report"] = self.load_metrics.summary()._asdict()

        # Process autoscaling actions
        if self.autoscaler:
            # Only used to update the load metrics for the autoscaler.
            self.autoscaler.update()
            autoscaler_summary = self.autoscaler.summary()
            report["autoscaler_report"] = autoscaler_summary._asdict()

        # Serialize before the initialization check so serialization errors
        # surface regardless of KV availability.
        serialized = json.dumps(report)
        if _internal_kv_initialized():
            _internal_kv_put(
                DEBUG_AUTOSCALING_STATUS, serialized, overwrite=True)

        # Wait for a autoscaler update interval before processing the next
        # round of messages.
        time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
def connect_ray_pdb(host=None, port=None, patch_stdstreams=False, quiet=None):
    """Open a remote PDB and advertise it in the internal KV store.

    Arguments left as ``None`` fall back to environment variables:
    ``REMOTE_PDB_HOST`` (default ``"127.0.0.1"``), ``REMOTE_PDB_PORT``
    (default ``0``) and ``REMOTE_PDB_QUIET`` (any non-empty value means
    quiet).

    Breakpoint metadata is stored under ``RAY_PDB_<uuid>`` while
    ``rdb.listen()`` runs. The entry is removed afterwards, also when
    ``listen()`` raises, so a failed session leaves no stale entry behind.

    Args:
        host: Host passed to ``RemotePdb``.
        port: Port passed to ``RemotePdb``.
        patch_stdstreams: Forwarded unchanged to ``RemotePdb``.
        quiet: Forwarded to ``RemotePdb``.

    Returns:
        The ``RemotePdb`` instance after ``listen()`` has returned.
    """
    if host is None:
        host = os.environ.get("REMOTE_PDB_HOST", "127.0.0.1")
    if port is None:
        port = int(os.environ.get("REMOTE_PDB_PORT", "0"))
    if quiet is None:
        quiet = bool(os.environ.get("REMOTE_PDB_QUIET", ""))

    rdb = RemotePdb(
        host=host,
        port=port,
        patch_stdstreams=patch_stdstreams,
        quiet=quiet,
    )
    sockname = rdb._listen_socket.getsockname()
    pdb_address = "{}:{}".format(sockname[0], sockname[1])

    # Index 2 is the frame two levels above this function. The lookup must
    # stay in this function body, otherwise the index would be off.
    parentframeinfo = inspect.getouterframes(inspect.currentframe())[2]
    data = {
        "proctitle": setproctitle.getproctitle(),
        "pdb_address": pdb_address,
        "filename": parentframeinfo.filename,
        "lineno": parentframeinfo.lineno,
        "traceback": "\n".join(traceback.format_exception(*sys.exc_info())),
    }

    breakpoint_key = "RAY_PDB_{}".format(uuid.uuid4())
    _internal_kv_put(breakpoint_key, json.dumps(data), overwrite=True)
    try:
        rdb.listen()
    finally:
        # Always drop the advertisement, even if listen() failed.
        _internal_kv_del(breakpoint_key)

    return rdb
def _run(self):
    """Run the monitor loop.

    Loops until ``self.stop_event`` is set (if one is configured). Each
    iteration refreshes load metrics, resource requests and the event
    summary, runs an autoscaler update when an autoscaler is configured,
    publishes a JSON status report to the internal KV store, and sleeps
    for ``AUTOSCALER_UPDATE_INTERVAL_S``.
    """
    while True:
        if self.stop_event and self.stop_event.is_set():
            break

        self.update_load_metrics()
        self.update_resource_requests()
        self.update_event_summary()

        report = {
            "load_metrics_report": self.load_metrics.summary()._asdict(),
            "time": time.time(),
            "monitor_pid": os.getpid(),
        }

        # Process autoscaling actions
        if self.autoscaler:
            # Only used to update the load metrics for the autoscaler.
            self.autoscaler.update()
            autoscaler_summary = self.autoscaler.summary()
            report["autoscaler_report"] = autoscaler_summary._asdict()

            prefix = ray_constants.LOG_PREFIX_EVENT_SUMMARY
            for event_msg in self.event_summarizer.summary():
                logger.info(f"{prefix}{event_msg}")
            self.event_summarizer.clear()

        serialized = json.dumps(report)
        if _internal_kv_initialized():
            _internal_kv_put(
                DEBUG_AUTOSCALING_STATUS, serialized, overwrite=True)

        # Wait for a autoscaler update interval before processing the next
        # round of messages.
        time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
def _handle_failure(self, error):
    """Report a monitor-loop failure and optionally tear down workers.

    Logs the active exception, kills autoscaler workers when fate-sharing
    is enabled through ``RAY_AUTOSCALER_FATESHARE_WORKERS=1``, records the
    error message in the internal KV store (when it is initialized) and
    publishes it to drivers via Redis and, if available, a GCS publisher.

    Args:
        error: The failure to report; it is interpolated into the message.
    """
    logger.exception("Error in monitor loop")

    fateshare_enabled = (
        os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1"
    )
    if self.autoscaler is not None and fateshare_enabled:
        self.autoscaler.kill_workers()
        # Take down autoscaler workers if necessary.
        self.destroy_autoscaler_workers()

    # Something went wrong, so push an error to all current and future
    # drivers.
    message = f"The autoscaler failed with the following error:\n{error}"
    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_ERROR, message, overwrite=True)

    redis_client = ray._private.services.create_redis_client(
        self.redis_address, password=self.redis_password
    )

    # Prefer an explicitly configured GCS address; otherwise look it up in
    # Redis when GCS pubsub is enabled; otherwise publish without GCS.
    # NOTE(review): `args` is not a parameter or attribute here; presumably
    # a module-level argparse namespace -- confirm it is always defined.
    if args.gcs_address:
        publisher = GcsPublisher(address=args.gcs_address)
    elif gcs_pubsub_enabled():
        publisher = GcsPublisher(
            address=get_gcs_address_from_redis(redis_client)
        )
    else:
        publisher = None

    from ray._private.utils import publish_error_to_driver

    publish_error_to_driver(
        ray_constants.MONITOR_DIED_ERROR,
        message,
        redis_client=redis_client,
        gcs_publisher=publisher,
    )
def crashed_get_node_id():
    """Return the current node id, or hang forever on the designated node.

    When the current node id equals ``crashed_worker_node_id``, a marker is
    written to the internal KV store and the function then sleeps in an
    endless loop and never returns.
    """
    on_crashed_node = (
        ray.get_runtime_context().node_id == crashed_worker_node_id
    )
    if not on_crashed_node:
        return ray.get_runtime_context().node_id

    # Leave a marker in the KV store before blocking.
    internal_kv._internal_kv_put("crashed_get_node_id",
                                 "crashed_worker_node_id")
    while True:
        time.sleep(1)
def __init__(self, interval_s=1, total_steps=3):
    """Initialize step-tracking state and publish the first step.

    Args:
        interval_s: Stored on the instance as ``interval_s``.
        total_steps: Stored on the instance as ``total_steps``.
    """
    self.interval_s = interval_s
    self.stopped = False
    self.current_step = 1
    self.total_steps = total_steps

    # Publish the initial step under a key derived from this actor's id.
    actor_id = ray._private.worker.global_worker.core_worker.get_actor_id()
    progress_key = f"JOB:{actor_id}"
    ray_kv._internal_kv_put(progress_key, self.current_step, overwrite=True)
def put_status(self, job_id: str, status: Union[JobStatus, JobStatusInfo]):
    """Store the status of a job in the internal KV store.

    A bare ``JobStatus`` is wrapped in a ``JobStatusInfo`` first. The value
    is pickled and written under ``JOB_STATUS_KEY`` formatted with
    ``job_id``, in the ``KV_NAMESPACE_JOB`` namespace.

    Args:
        job_id: Identifier interpolated into the storage key.
        status: The status to store.

    Raises:
        AssertionError: If ``status`` is neither a ``JobStatus`` nor a
            ``JobStatusInfo``.
    """
    if isinstance(status, JobStatus):
        status = JobStatusInfo(status=status)
    elif not isinstance(status, JobStatusInfo):
        # Raised explicitly rather than via `assert False` so the check is
        # not stripped when Python runs with -O.
        raise AssertionError("status must be JobStatus or JobStatusInfo.")

    _internal_kv_put(
        self.JOB_STATUS_KEY.format(job_id=job_id),
        pickle.dumps(status),
        namespace=ray_constants.KV_NAMESPACE_JOB,
    )
def _run(self):
    """Run the monitor loop.

    Loops until ``self.stop_event`` is set (if one is configured). Each
    iteration refreshes load metrics, resource requests and the event
    summary, optionally runs an autoscaler update, and publishes a JSON
    status report to the internal KV store when it is initialized.

    Exceptions raised inside an iteration are logged and swallowed when
    ``self.retry_on_failure`` is truthy; otherwise they propagate to the
    caller. After every iteration that does not break or raise, the loop
    sleeps for ``AUTOSCALER_UPDATE_INTERVAL_S``.
    """
    while True:
        try:
            # `break` leaves the loop directly, skipping the sleep below.
            if self.stop_event and self.stop_event.is_set():
                break
            self.update_load_metrics()
            self.update_resource_requests()
            self.update_event_summary()
            status = {
                "load_metrics_report": asdict(self.load_metrics.summary()),
                "time": time.time(),
                "monitor_pid": os.getpid(),
            }
            if self.autoscaler and not self.load_metrics:
                # load_metrics is Falsey iff we haven't collected any
                # resource messages from the GCS, which can happen at startup if
                # the GCS hasn't yet received data from the Raylets.
                # In this case, do not do an autoscaler update.
                # Wait to get load metrics.
                logger.info(
                    "Autoscaler has not yet received load metrics. Waiting."
                )
            elif self.autoscaler:
                # Process autoscaling actions
                self.autoscaler.update()
                autoscaler_summary = self.autoscaler.summary()
                # The report key is only added for a truthy summary.
                if autoscaler_summary:
                    status["autoscaler_report"] = asdict(autoscaler_summary)
                for msg in self.event_summarizer.summary():
                    # Need to prefix each line of the message for the lines to
                    # get pushed to the driver logs.
                    for line in msg.split("\n"):
                        logger.info(
                            "{}{}".format(
                                ray_constants.LOG_PREFIX_EVENT_SUMMARY, line
                            )
                        )
                self.event_summarizer.clear()
            as_json = json.dumps(status)
            if _internal_kv_initialized():
                _internal_kv_put(
                    ray_constants.DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True
                )
        except Exception:
            # By default, do not exit the monitor on failure.
            if self.retry_on_failure:
                logger.exception("Monitor: Execution exception. Trying again...")
            else:
                raise
        # Wait for a autoscaler update interval before processing the next
        # round of messages.
        time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
def connect_ray_pdb(
    host=None,
    port=None,
    patch_stdstreams=False,
    quiet=None,
    breakpoint_uuid=None,
    debugger_external=False,
):
    """Open a remote PDB and advertise it in the internal KV store.

    Arguments left as ``None`` fall back to environment variables:
    ``REMOTE_PDB_HOST`` (default ``"127.0.0.1"``), ``REMOTE_PDB_PORT``
    (default ``0``) and ``REMOTE_PDB_QUIET`` (any non-empty value means
    quiet).

    Breakpoint metadata is stored under ``RAY_PDB_<breakpoint_uuid>`` in the
    ``KV_NAMESPACE_PDB`` namespace while ``rdb.listen()`` runs. The entry is
    removed afterwards, also when ``listen()`` raises, so a failed session
    leaves no stale entry behind.

    Args:
        host: Host passed to ``RemotePdb``. Must be unset when
            ``debugger_external`` is true.
        port: Port passed to ``RemotePdb``.
        patch_stdstreams: Forwarded unchanged to ``RemotePdb``.
        quiet: Forwarded to ``RemotePdb``.
        breakpoint_uuid: Identifier for this breakpoint; a random hex UUID
            is generated when falsy.
        debugger_external: When true, bind to ``0.0.0.0`` and advertise the
            worker's node IP address instead of ``localhost``.

    Returns:
        The ``RemotePdb`` instance after ``listen()`` has returned.
    """
    if debugger_external:
        assert not host, "Cannot specify both host and debugger_external"
        host = "0.0.0.0"
    elif host is None:
        host = os.environ.get("REMOTE_PDB_HOST", "127.0.0.1")
    if port is None:
        port = int(os.environ.get("REMOTE_PDB_PORT", "0"))
    if quiet is None:
        quiet = bool(os.environ.get("REMOTE_PDB_QUIET", ""))
    if not breakpoint_uuid:
        breakpoint_uuid = uuid.uuid4().hex
    if debugger_external:
        ip_address = ray.worker.global_worker.node_ip_address
    else:
        ip_address = "localhost"

    rdb = RemotePdb(
        breakpoint_uuid=breakpoint_uuid,
        host=host,
        port=port,
        ip_address=ip_address,
        patch_stdstreams=patch_stdstreams,
        quiet=quiet,
    )
    sockname = rdb._listen_socket.getsockname()
    # Advertise the chosen ip_address with the port that was actually bound.
    pdb_address = "{}:{}".format(ip_address, sockname[1])

    # Index 2 is the frame two levels above this function. The lookup must
    # stay in this function body, otherwise the index would be off.
    parentframeinfo = inspect.getouterframes(inspect.currentframe())[2]
    data = {
        "proctitle": setproctitle.getproctitle(),
        "pdb_address": pdb_address,
        "filename": parentframeinfo.filename,
        "lineno": parentframeinfo.lineno,
        "traceback": "\n".join(traceback.format_exception(*sys.exc_info())),
        "timestamp": time.time(),
        "job_id": ray.get_runtime_context().job_id.hex(),
    }

    breakpoint_key = "RAY_PDB_{}".format(breakpoint_uuid)
    _internal_kv_put(
        breakpoint_key,
        json.dumps(data),
        overwrite=True,
        namespace=ray_constants.KV_NAMESPACE_PDB,
    )
    try:
        rdb.listen()
    finally:
        # Always drop the advertisement, even if listen() failed.
        _internal_kv_del(
            breakpoint_key, namespace=ray_constants.KV_NAMESPACE_PDB
        )

    return rdb
def _store_package_in_gcs(gcs_key: str, data: bytes) -> int:
    """Store package bytes in the internal KV store under ``gcs_key``.

    Args:
        gcs_key: Key to store the data under.
        data: The package bytes.

    Returns:
        The number of bytes in ``data``.

    Raises:
        RuntimeError: If ``len(data)`` is at least ``GCS_STORAGE_MAX_SIZE``.
    """
    size = len(data)
    if size >= GCS_STORAGE_MAX_SIZE:
        raise RuntimeError(
            "working_dir package exceeds the maximum size of 100MiB. You "
            "can exclude large files using the 'excludes' option to the "
            "runtime_env.")

    _internal_kv_put(gcs_key, data)
    return size
def _put_library_usage(library_usage: str):
    """Record that a library was used, on a best-effort basis.

    Writes an empty value under ``LIBRARY_USAGE_PREFIX + library_usage`` in
    the usage-stats namespace. Any failure of the write is logged at debug
    level and otherwise ignored.

    Args:
        library_usage: Name appended to the usage key prefix.
    """
    assert _internal_kv_initialized()

    try:
        usage_key = f"{usage_constant.LIBRARY_USAGE_PREFIX}{library_usage}"
        _internal_kv_put(
            usage_key,
            "",
            namespace=usage_constant.USAGE_STATS_NAMESPACE,
        )
    except Exception as e:
        # Deliberately non-fatal: usage recording must not break callers.
        logger.debug(f"Failed to put library usage, {e}")
def do_remote(self, arg):
    """remote
    Skip into the next remote call.
    """
    # NOTE(review): the docstring above is left as in the original because
    # this looks like a pdb command (it delegates to Pdb.do_continue), where
    # the docstring would be shown as help text -- confirm.
    breakpoint_id = self._breakpoint_uuid

    # Tell the next task to drop into the debugger.
    ray.worker.global_worker.debugger_breakpoint = breakpoint_id

    # Tell the debug loop to connect to the next task.
    continue_key = f"RAY_PDB_CONTINUE_{breakpoint_id}"
    _internal_kv_put(continue_key, "")

    self.__restore()
    self.handle.connection.close()
    return Pdb.do_continue(self, arg)
def log_info_string(self, nodes):
    """Build the cluster status text, publish it, and log it at debug level.

    The text is "Cluster status: " followed by three newline-separated
    parts: the node info string, the load-metrics info string and the
    resource demand scheduler's debug string. It is written to the internal
    KV store under ``DEBUG_AUTOSCALING_STATUS`` when the store is
    initialized.

    Args:
        nodes: Passed to ``info_string`` and to the scheduler's
            ``debug_string``.
    """
    node_part = self.info_string(nodes)
    load_part = self.load_metrics.info_string()
    demand_part = self.resource_demand_scheduler.debug_string(
        nodes,
        self.pending_launches.breakdown(),
        self.load_metrics.get_resource_utilization(),
    )
    report = "Cluster status: " + node_part + "\n" + load_part + "\n" + demand_part

    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_STATUS, report, overwrite=True)
    logger.debug(report)
def put(self, key, val):
    """Put the key-value pair into the store.

    Any existing value for the key is overwritten.

    Args:
        key (str)
        val (bytes)

    Raises:
        TypeError: If ``key`` is not a ``str`` or ``val`` is not ``bytes``.
    """
    # Validate in the same order as the arguments: key first, then value.
    for is_valid, template, offender in (
        (isinstance(key, str), "key must be a string, got: {}.", key),
        (isinstance(val, bytes), "val must be bytes, got: {}.", val),
    ):
        if not is_valid:
            raise TypeError(template.format(type(offender)))

    storage_key = self._format_key(key)
    ray_kv._internal_kv_put(storage_key, val, overwrite=True)
def legacy_log_info_string(autoscaler, nodes):
    """Build the legacy cluster status text, publish it, and log it.

    The text is "Cluster status: " followed by three newline-separated
    parts: the node info string, the load-metrics info string and the
    resource demand scheduler's debug string. It is written to the internal
    KV store under ``DEBUG_AUTOSCALING_STATUS_LEGACY`` when the store is
    initialized, and logged at debug level.

    Args:
        autoscaler: Object providing ``load_metrics``,
            ``resource_demand_scheduler`` and ``pending_launches``.
        nodes: Passed to ``info_string`` and to the scheduler's
            ``debug_string``.
    """
    node_part = info_string(autoscaler, nodes)
    load_part = autoscaler.load_metrics.info_string()
    demand_part = autoscaler.resource_demand_scheduler.debug_string(
        nodes,
        autoscaler.pending_launches.breakdown(),
        autoscaler.load_metrics.get_resource_utilization(),
    )
    report = "Cluster status: " + node_part + "\n" + load_part + "\n" + demand_part

    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_STATUS_LEGACY, report, overwrite=True)
    logger.debug(report)
def update(self):
    """Run one autoscaler update, tolerating a bounded number of failures.

    Any exception from ``reset`` or ``_update`` is logged, recorded in the
    internal KV store (when initialized) and counted. It is re-raised only
    once the failure count exceeds ``self.max_failures``.
    """
    try:
        self.reset(errors_fatal=False)
        self._update()
    except Exception as e:
        logger.exception("StandardAutoscaler: Error during autoscaling.")
        if _internal_kv_initialized():
            _internal_kv_put(DEBUG_AUTOSCALING_ERROR, str(e), overwrite=True)

        self.num_failures += 1
        too_many_failures = self.num_failures > self.max_failures
        if too_many_failures:
            logger.critical("StandardAutoscaler: Too many errors, abort.")
            raise e
def do_remote(self, arg):
    """remote
    Skip into the next remote call.
    """
    # NOTE(review): the docstring above is left as in the original because
    # this looks like a pdb command (it delegates to Pdb.do_continue), where
    # the docstring would be shown as help text -- confirm.
    breakpoint_id = self._breakpoint_uuid

    # Tell the next task to drop into the debugger.
    ray.worker.global_worker.debugger_breakpoint = breakpoint_id

    # Tell the debug loop to connect to the next task. The payload carries
    # the hex id of the current job.
    payload = json.dumps(
        {"job_id": ray.get_runtime_context().job_id.hex()}
    )
    continue_key = f"RAY_PDB_CONTINUE_{breakpoint_id}"
    _internal_kv_put(continue_key, payload)

    self.__restore()
    self.handle.connection.close()
    return Pdb.do_continue(self, arg)
def put(self, key: str, val: bytes) -> bool:
    """Put the key-value pair into the store.

    Any existing value for the key is overwritten. The value is written
    under ``self.get_storage_key(key)`` in the ``KV_NAMESPACE_SERVE``
    namespace.

    Args:
        key (str)
        val (bytes)

    Returns:
        The result of ``ray_kv._internal_kv_put``. Previously this method
        fell off the end and returned ``None`` despite the ``-> bool``
        annotation. NOTE(review): the result presumably indicates whether
        the key already existed -- confirm against ``ray_kv``.

    Raises:
        TypeError: If ``key`` is not a ``str`` or ``val`` is not ``bytes``.
    """
    if not isinstance(key, str):
        raise TypeError("key must be a string, got: {}.".format(type(key)))
    if not isinstance(val, bytes):
        raise TypeError("val must be bytes, got: {}.".format(type(val)))

    return ray_kv._internal_kv_put(
        self.get_storage_key(key),
        val,
        overwrite=True,
        namespace=ray_constants.KV_NAMESPACE_SERVE,
    )
def _store_package_in_gcs(
    pkg_uri: str,
    data: bytes,
    logger: Optional[logging.Logger] = default_logger,
) -> int:
    """Store package bytes in the internal KV store under ``pkg_uri``.

    Args:
        pkg_uri: Key to store the data under.
        data: The package bytes.
        logger: Logger used for the progress messages.

    Returns:
        The number of bytes in ``data``.

    Raises:
        RuntimeError: If ``len(data)`` is at least ``GCS_STORAGE_MAX_SIZE``.
    """
    num_bytes = len(data)
    readable_size = _mib_string(num_bytes)

    if num_bytes >= GCS_STORAGE_MAX_SIZE:
        raise RuntimeError(
            f"Package size ({readable_size}) exceeds the maximum size of "
            f"{_mib_string(GCS_STORAGE_MAX_SIZE)}. You can exclude large "
            "files using the 'excludes' option to the runtime_env.")

    logger.info(
        f"Pushing file package '{pkg_uri}' ({readable_size}) to "
        "Ray cluster...")
    _internal_kv_put(pkg_uri, data)
    logger.info(f"Successfully pushed file package '{pkg_uri}'.")
    return num_bytes
def _handle_failure(self, error):
    """Report a monitor-loop failure and take down autoscaler workers.

    Logs the active exception, kills autoscaler workers when an autoscaler
    exists, records the error message in the internal KV store (when it is
    initialized) and pushes it to drivers through Redis.

    Args:
        error: The failure to report; it is interpolated into the message.
    """
    logger.exception("Error in monitor loop")

    autoscaler = self.autoscaler
    if autoscaler is not None:
        autoscaler.kill_workers()
        # Take down autoscaler workers if necessary.
        self.destroy_autoscaler_workers()

    # Something went wrong, so push an error to all current and future
    # drivers.
    message = f"The autoscaler failed with the following error:\n{error}"
    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_ERROR, message, overwrite=True)

    # NOTE(review): `args` is not a parameter or attribute here; presumably
    # a module-level argparse namespace -- confirm it is always defined.
    client = ray._private.services.create_redis_client(
        args.redis_address, password=args.redis_password
    )

    from ray.utils import push_error_to_driver_through_redis

    push_error_to_driver_through_redis(
        client, ray_constants.MONITOR_DIED_ERROR, message
    )
def _run(self):
    """Run the monitor loop.

    Loops until ``self.stop_event`` is set (if one is configured). Each
    iteration refreshes load metrics, resource requests and the event
    summary, runs an autoscaler update when an autoscaler is configured,
    and publishes a JSON status report to the internal KV store when it is
    initialized. Exceptions raised inside an iteration are logged and the
    loop carries on. After every iteration that does not break, the loop
    sleeps for ``AUTOSCALER_UPDATE_INTERVAL_S``.
    """
    while True:
        try:
            # `break` leaves the loop directly, skipping the sleep below.
            if self.stop_event and self.stop_event.is_set():
                break

            self.update_load_metrics()
            self.update_resource_requests()
            self.update_event_summary()

            report = {
                "load_metrics_report": asdict(self.load_metrics.summary()),
                "time": time.time(),
                "monitor_pid": os.getpid(),
            }

            # Process autoscaling actions
            if self.autoscaler:
                # Only used to update the load metrics for the autoscaler.
                self.autoscaler.update()
                autoscaler_summary = self.autoscaler.summary()
                report["autoscaler_report"] = asdict(autoscaler_summary)

                prefix = ray_constants.LOG_PREFIX_EVENT_SUMMARY
                for event_msg in self.event_summarizer.summary():
                    # Need to prefix each line of the message for the lines
                    # to get pushed to the driver logs.
                    for text_line in event_msg.split("\n"):
                        logger.info(f"{prefix}{text_line}")
                self.event_summarizer.clear()

            serialized = json.dumps(report)
            if _internal_kv_initialized():
                _internal_kv_put(
                    ray_constants.DEBUG_AUTOSCALING_STATUS,
                    serialized,
                    overwrite=True,
                )
        except Exception:
            logger.exception("Monitor: Execution exception. Trying again...")

        # Wait for a autoscaler update interval before processing the next
        # round of messages.
        time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
def _register_actor(name, actor_handle): if not isinstance(name, str): raise TypeError("The name argument must be a string.") if not isinstance(actor_handle, ray.actor.ActorHandle): raise TypeError("The actor_handle argument must be an ActorHandle " "object.") actor_name = _calculate_key(name) # First check if the actor already exists. try: _get_actor(name) exists = True except ValueError: exists = False if exists: raise ValueError("An actor with name={} already exists".format(name)) # Add the actor to Redis if it does not already exist. _internal_kv_put(actor_name, pickle.dumps(actor_handle), overwrite=True)
def run(self):
    """Step through the job, publishing progress to the internal KV store.

    While ``current_step`` does not exceed ``total_steps`` and the job has
    not been stopped, each iteration sleeps ``interval_s`` seconds, advances
    the step counter and writes it under ``JOB:<actor id>``. On exit the
    job is marked stopped and the key is set to ``"DONE"``.

    Returns:
        The string ``"DONE"``.
    """
    actor_id = ray.worker.global_worker.core_worker.get_actor_id()
    progress_key = f"JOB:{actor_id}"

    while self.current_step <= self.total_steps:
        if self.stopped:
            print("Stop called or reached final step.")
            break

        print(f"Sleeping {self.interval_s} secs to executing "
              f"step {self.current_step}")
        time.sleep(self.interval_s)
        self.current_step += 1
        ray_kv._internal_kv_put(
            progress_key, self.current_step, overwrite=True)

    self.stopped = True
    ray_kv._internal_kv_put(progress_key, "DONE", overwrite=True)
    return "DONE"
def update(self):
    """Run one autoscaler update, tolerating a bounded number of failures.

    Any exception from ``reset`` or ``_update`` is logged and recorded in
    the internal KV store (when initialized). It counts as a failure unless
    the provider type is ``"kubernetes"`` and the exception is a
    ``MaxRetryError``. The exception is re-raised whenever the failure
    count exceeds ``self.max_failures``.
    """
    try:
        self.reset(errors_fatal=False)
        self._update()
    except Exception as e:
        logger.exception("StandardAutoscaler: Error during autoscaling.")
        if _internal_kv_initialized():
            _internal_kv_put(DEBUG_AUTOSCALING_ERROR, str(e), overwrite=True)

        # Don't abort the autoscaler if the K8s API server is down.
        # https://github.com/ray-project/ray/issues/12255
        provider_is_k8s = self.config["provider"]["type"] == "kubernetes"
        counts_as_failure = not (
            provider_is_k8s and isinstance(e, MaxRetryError)
        )
        if counts_as_failure:
            self.num_failures += 1

        if self.num_failures > self.max_failures:
            logger.critical("StandardAutoscaler: Too many errors, abort.")
            raise e
def _store_package_in_gcs(
    pkg_uri: str,
    data: bytes,
    logger: Optional[logging.Logger] = default_logger,
) -> int:
    """Stores package data in the Global Control Store (GCS).

    Args:
        pkg_uri: The GCS key to store the data in.
        data: The serialized package's bytes to store in the GCS.
        logger (Optional[logging.Logger]): The logger used by this function.

    Return:
        int: Size of data

    Raises:
        RuntimeError: If the upload to the GCS fails.
        ValueError: If the data's size exceeds GCS_STORAGE_MAX_SIZE.
    """
    num_bytes = len(data)
    readable_size = _mib_string(num_bytes)

    if num_bytes >= GCS_STORAGE_MAX_SIZE:
        raise ValueError(
            f"Package size ({readable_size}) exceeds the maximum size of "
            f"{_mib_string(GCS_STORAGE_MAX_SIZE)}. You can exclude large "
            "files using the 'excludes' option to the runtime_env."
        )

    logger.info(
        f"Pushing file package '{pkg_uri}' ({readable_size}) to Ray cluster..."
    )
    try:
        _internal_kv_put(pkg_uri, data)
    except Exception as e:
        # Wrap any storage failure, keeping the original as the cause. Only
        # the first 15 bytes of the payload are included in the message.
        raise RuntimeError(
            "Failed to store package in the GCS.\n"
            f"  - GCS URI: {pkg_uri}\n"
            f"  - Package data ({readable_size}): {data[:15]}...\n"
        ) from e

    logger.info(f"Successfully pushed file package '{pkg_uri}'.")
    return num_bytes
def _put_library_usage(library_usage: str):
    """Record that a library was used, on a best-effort basis.

    Writes an empty value under ``LIBRARY_USAGE_PREFIX + library_usage`` in
    the usage-stats namespace, and, when the worker runs in script mode,
    also records the usage through a ``LibUsageRecorder`` rooted at the Ray
    temp directory. Failures of either step are logged at debug level and
    otherwise ignored.

    Args:
        library_usage: Name of the library to record.
    """
    assert _internal_kv_initialized()
    try:
        _internal_kv_put(
            f"{usage_constant.LIBRARY_USAGE_PREFIX}{library_usage}",
            "",
            namespace=usage_constant.USAGE_STATS_NAMESPACE,
        )
    except Exception as e:
        # Deliberately non-fatal: usage recording must not break callers.
        logger.debug(f"Failed to put library usage, {e}")

    # Record the library usage to the temp (e.g., /tmp/ray) folder.
    # Note that although we always write this file, it is not
    # reported when the usage stats is disabled.
    if ray._private.worker.global_worker.mode == ray.SCRIPT_MODE:
        try:
            lib_usage_recorder = LibUsageRecorder(
                ray._private.utils.get_ray_temp_dir()
            )
            lib_usage_recorder.put_lib_usage(library_usage)
        except Exception as e:
            # The recorder writes under the Ray temp dir, so the message
            # names the temp folder (it used to say "home folder").
            logger.debug(
                f"Failed to write a library usage to the temp folder, {e}"
            )
def _run(self):
    """Run the monitor.

    Subscribes to the autoscaler resource request channel and then loops
    forever. Each iteration refreshes the raylet map and load metrics, runs
    an autoscaler update when an autoscaler is configured, publishes a JSON
    status report to the internal KV store, processes a round of messages
    and sleeps for ``AUTOSCALER_UPDATE_INTERVAL_S``.
    """
    self.subscribe(ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL)

    # Handle messages from the subscription channels.
    while True:
        self.update_raylet_map()
        self.update_load_metrics()

        report = {}
        report["load_metrics_report"] = self.load_metrics.summary()._asdict()

        # Process autoscaling actions
        if self.autoscaler:
            # Only used to update the load metrics for the autoscaler.
            self.autoscaler.update()
            autoscaler_summary = self.autoscaler.summary()
            report["autoscaler_report"] = autoscaler_summary._asdict()

        serialized = json.dumps(report)
        if _internal_kv_initialized():
            _internal_kv_put(
                DEBUG_AUTOSCALING_STATUS, serialized, overwrite=True)

        # Process a round of messages.
        self.process_messages()

        # Wait for a autoscaler update interval before processing the next
        # round of messages.
        time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
def register_actor(name, actor_handle):
    """Register a named actor under a string key.

    Args:
        name: The name of the named actor.
        actor_handle: The actor object to be associated with this name

    Raises:
        TypeError: If ``name`` is not a string or ``actor_handle`` is not an
            ``ActorHandle``.
        ValueError: If the KV put reports that the key already exists.
    """
    if not isinstance(name, str):
        raise TypeError("The name argument must be a string.")
    if not isinstance(actor_handle, ray.actor.ActorHandle):
        raise TypeError("The actor_handle argument must be an ActorHandle "
                        "object.")

    storage_key = _calculate_key(name)
    serialized_handle = pickle.dumps(actor_handle)

    # Add the actor to Redis if it does not already exist.
    if not _internal_kv_put(storage_key, serialized_handle):
        return

    # If the registration fails, then erase the new actor handle that
    # was added when pickling the actor handle.
    actor_handle._ray_new_actor_handles.pop()
    raise ValueError(
        "Error: the actor with name={} already exists".format(name))
def _ack_reads(self, offset): if self.max_size > 0: internal_kv._internal_kv_put( self.read_ack_key, offset, overwrite=True)
def flush_values(self):
    """Write every buffered value to the internal KV store, then clear.

    ``self._to_flush`` maps ``(category, key)`` pairs to values. Each entry
    is written under ``_make_key(category, key)`` with ``overwrite=True``,
    and the buffer is emptied afterwards.
    """
    buffered = self._to_flush
    for (category, name), payload in buffered.items():
        storage_key = _make_key(category, name)
        _internal_kv_put(storage_key, payload, overwrite=True)
    buffered.clear()