def cleanup_actors(self):
    """Recreate any live actors whose corresponding local scheduler died.

    For any live actor whose local scheduler just died, we choose a new
    local scheduler and broadcast a notification to create that actor.
    """
    actor_info = self.state.actors()
    for actor_id, info in actor_info.items():
        if (not info["removed"] and
                info["local_scheduler_id"] in self.dead_local_schedulers):
            # Choose a new local scheduler to run the actor.
            local_scheduler_id = ray.utils.select_local_scheduler(
                info["driver_id"], self.state.local_schedulers(),
                info["num_gpus"], self.redis)
            import sys
            sys.stdout.flush()
            # The new local scheduler should not be the same as the old
            # local scheduler. TODO(rkn): This should not be an assert, it
            # should be something more benign.
            assert (binary_to_hex(local_scheduler_id) !=
                    info["local_scheduler_id"])
            # Announce to all of the local schedulers that the actor should
            # be recreated on this new local scheduler.
            ray.utils.publish_actor_creation(
                hex_to_binary(actor_id),
                hex_to_binary(info["driver_id"]), local_scheduler_id, True,
                self.redis)
            log.info(
                "Actor {} for driver {} was on dead local scheduler "
                "{}. It is being recreated on local scheduler {}".format(
                    actor_id, info["driver_id"],
                    info["local_scheduler_id"],
                    binary_to_hex(local_scheduler_id)))
            # Update the actor info in Redis.
            self.redis.hset(b"Actor:" + hex_to_binary(actor_id),
                            "local_scheduler_id", local_scheduler_id)

def cleanup_task_table(self):
    """Clean up global state for failed local schedulers.

    This marks any tasks that were scheduled on dead local schedulers as
    TASK_STATUS_LOST. A local scheduler is deemed dead if it is in
    self.dead_local_schedulers.
    """
    tasks = self.state.task_table()
    num_tasks_updated = 0
    for task_id, task in tasks.items():
        # See if the corresponding local scheduler is alive.
        if task["LocalSchedulerID"] not in self.dead_local_schedulers:
            continue

        # Remove dummy objects returned by actor tasks from any plasma
        # manager. Although the objects may still exist in that object
        # store, this deletion makes them effectively unreachable by any
        # local scheduler connected to a different store.
        # TODO(swang): Actually remove the objects from the object store,
        # so that the reconstructed actor can reuse the same object store.
        if hex_to_binary(task["TaskSpec"]["ActorID"]) != NIL_ACTOR_ID:
            dummy_object_id = task["TaskSpec"]["ReturnObjectIDs"][-1]
            obj = self.state.object_table(dummy_object_id)
            manager_ids = obj["ManagerIDs"]
            if manager_ids is not None:
                # The dummy object should exist on at most one plasma
                # manager, the manager associated with the local scheduler
                # that died.
                assert len(manager_ids) <= 1
                # Remove the dummy object from the plasma manager
                # associated with the dead local scheduler, if any.
                for manager in manager_ids:
                    ok = self.state._execute_command(
                        dummy_object_id, "RAY.OBJECT_TABLE_REMOVE",
                        dummy_object_id.id(), hex_to_binary(manager))
                    if ok != b"OK":
                        log.warn("Failed to remove object location for "
                                 "dead plasma manager.")

        # If the task is scheduled on a dead local scheduler, mark the
        # task as lost.
        key = binary_to_object_id(hex_to_binary(task_id))
        ok = self.state._execute_command(
            key, "RAY.TASK_TABLE_UPDATE", hex_to_binary(task_id),
            ray.experimental.state.TASK_STATUS_LOST, NIL_ID,
            task["ExecutionDependenciesString"], task["SpillbackCount"])
        if ok != b"OK":
            log.warn("Failed to update lost task for dead scheduler.")
        num_tasks_updated += 1

    if num_tasks_updated > 0:
        log.warn("Marked {} tasks as lost.".format(num_tasks_updated))

def _object_table(self, object_id):
    """Fetch and parse the object table information for a single object ID.

    Args:
        object_id: An object ID to get information about.

    Returns:
        A dictionary with information about the object ID in question.
    """
    # Allow the argument to be either an ObjectID or a hex string.
    if not isinstance(object_id, ray.ObjectID):
        object_id = ray.ObjectID(hex_to_binary(object_id))

    # Return information about a single object ID.
    message = self._execute_command(object_id, "RAY.TABLE_LOOKUP",
                                    ray.gcs_utils.TablePrefix.OBJECT, "",
                                    object_id.binary())
    if message is None:
        return {}

    gcs_entry = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
        message, 0)
    assert gcs_entry.EntriesLength() > 0

    entry = ray.gcs_utils.ObjectTableData.GetRootAsObjectTableData(
        gcs_entry.Entries(0), 0)

    object_info = {
        "DataSize": entry.ObjectSize(),
        "Manager": entry.Manager(),
    }
    return object_info

def task_table(self, task_id=None):
    """Fetch and parse the task table information for one or more task IDs.

    Args:
        task_id: A hex string of the task ID to fetch information about. If
            this is None, then the task object table is fetched.

    Returns:
        Information from the task table.
    """
    self._check_connected()
    if task_id is not None:
        task_id = ray.TaskID(hex_to_binary(task_id))
        return self._task_table(task_id)
    else:
        task_table_keys = self._keys(
            ray.gcs_utils.TablePrefix_RAYLET_TASK_string + "*")
        task_ids_binary = [
            key[len(ray.gcs_utils.TablePrefix_RAYLET_TASK_string):]
            for key in task_table_keys
        ]

        results = {}
        for task_id_binary in task_ids_binary:
            results[binary_to_hex(task_id_binary)] = self._task_table(
                ray.TaskID(task_id_binary))
        return results

def _xray_clean_up_entries_for_driver(self, driver_id):
    """Remove this driver's object/task entries from redis.

    Removes control-state entries of all tasks and task return
    objects belonging to the driver.

    Args:
        driver_id: The driver id.
    """
    xray_task_table_prefix = (
        ray.gcs_utils.TablePrefix_RAYLET_TASK_string.encode("ascii"))
    xray_object_table_prefix = (
        ray.gcs_utils.TablePrefix_OBJECT_string.encode("ascii"))

    task_table_objects = self.state.task_table()
    driver_id_hex = binary_to_hex(driver_id)
    driver_task_id_bins = set()
    for task_id_hex, task_info in task_table_objects.items():
        task_table_object = task_info["TaskSpec"]
        task_driver_id_hex = task_table_object["DriverID"]
        if driver_id_hex != task_driver_id_hex:
            # Ignore tasks that aren't from this driver.
            continue
        driver_task_id_bins.add(hex_to_binary(task_id_hex))

    # Get objects associated with the driver.
    object_table_objects = self.state.object_table()
    driver_object_id_bins = set()
    for object_id, _ in object_table_objects.items():
        task_id_bin = ray.raylet.compute_task_id(object_id).id()
        if task_id_bin in driver_task_id_bins:
            driver_object_id_bins.add(object_id.id())

    def to_shard_index(id_bin):
        return binary_to_object_id(id_bin).redis_shard_hash() % len(
            self.state.redis_clients)

    # Form the redis keys to delete.
    sharded_keys = [[] for _ in range(len(self.state.redis_clients))]
    for task_id_bin in driver_task_id_bins:
        sharded_keys[to_shard_index(task_id_bin)].append(
            xray_task_table_prefix + task_id_bin)
    for object_id_bin in driver_object_id_bins:
        sharded_keys[to_shard_index(object_id_bin)].append(
            xray_object_table_prefix + object_id_bin)

    # Remove with best effort.
    for shard_index in range(len(sharded_keys)):
        keys = sharded_keys[shard_index]
        if len(keys) == 0:
            continue
        redis = self.state.redis_clients[shard_index]
        num_deleted = redis.delete(*keys)
        logger.info("Removed {} dead redis entries of the driver from"
                    " redis shard {}.".format(num_deleted, shard_index))
        if num_deleted != len(keys):
            logger.warning("Failed to remove {} relevant redis entries"
                           " from redis shard {}.".format(
                               len(keys) - num_deleted, shard_index))

def cleanup_object_table(self):
    """Clean up global state for failed plasma managers.

    This removes dead plasma managers from any location entries in the
    object table. A plasma manager is deemed dead if it is in
    self.dead_plasma_managers.
    """
    # TODO(swang): Also kill the associated plasma store, since it's no
    # longer reachable without a plasma manager.
    objects = self.state.object_table()
    num_objects_removed = 0
    for object_id, obj in objects.items():
        manager_ids = obj["ManagerIDs"]
        if manager_ids is None:
            continue
        for manager in manager_ids:
            if manager in self.dead_plasma_managers:
                # If the object was on a dead plasma manager, remove that
                # location entry.
                ok = self.state._execute_command(
                    object_id, "RAY.OBJECT_TABLE_REMOVE", object_id.id(),
                    hex_to_binary(manager))
                if ok != b"OK":
                    log.warn("Failed to remove object location for dead "
                             "plasma manager.")
                num_objects_removed += 1
    if num_objects_removed > 0:
        log.warn("Marked {} objects as lost.".format(num_objects_removed))

def actor_table(self, actor_id=None):
    """Fetch and parse the actor table information for one or more actor IDs.

    Args:
        actor_id: A hex string of the actor ID to fetch information about.
            If this is None, then the actor table is fetched.

    Returns:
        Information from the actor table.
    """
    self._check_connected()
    if actor_id is not None:
        actor_id = ray.ActorID(hex_to_binary(actor_id))
        actor_info = self.global_state_accessor.get_actor_info(actor_id)
        if actor_info is None:
            return {}
        else:
            actor_table_data = gcs_utils.ActorTableData.FromString(
                actor_info)
            return self._gen_actor_info(actor_table_data)
    else:
        actor_table = self.global_state_accessor.get_actor_table()
        results = {}
        for i in range(len(actor_table)):
            actor_table_data = gcs_utils.ActorTableData.FromString(
                actor_table[i])
            results[binary_to_hex(actor_table_data.actor_id)] = \
                self._gen_actor_info(actor_table_data)
        return results

def placement_group_table(self, placement_group_id=None):
    """Fetch and parse the placement group table information.

    Args:
        placement_group_id: A placement group ID to fetch information
            about. If this is None, then the entire placement group table
            is fetched.

    Returns:
        Information from the placement group table.
    """
    self._check_connected()
    if placement_group_id is not None:
        placement_group_id = ray.PlacementGroupID(
            hex_to_binary(placement_group_id.hex()))
        placement_group_info = (
            self.global_state_accessor.get_placement_group_info(
                placement_group_id))
        if placement_group_info is None:
            return {}
        else:
            placement_group_info = (gcs_utils.PlacementGroupTableData.
                                    FromString(placement_group_info))
            return self._gen_placement_group_info(placement_group_info)
    else:
        placement_group_table = self.global_state_accessor.\
            get_placement_group_table()
        results = {}
        for placement_group_info in placement_group_table:
            placement_group_table_data = gcs_utils.\
                PlacementGroupTableData.FromString(placement_group_info)
            placement_group_id = binary_to_hex(
                placement_group_table_data.placement_group_id)
            results[placement_group_id] = \
                self._gen_placement_group_info(placement_group_table_data)
        return results

def task_table(self, task_id=None):
    """Fetch and parse the task table information for one or more task IDs.

    Args:
        task_id: A hex string of the task ID to fetch information about. If
            this is None, then the task object table is fetched.

    Returns:
        Information from the task table.
    """
    self._check_connected()
    if task_id is not None:
        task_id = ray.TaskID(hex_to_binary(task_id))
        return self._task_table(task_id)
    else:
        task_table_keys = self._keys(
            gcs_utils.TablePrefix_RAYLET_TASK_string + "*")
        task_ids_binary = [
            key[len(gcs_utils.TablePrefix_RAYLET_TASK_string):]
            for key in task_table_keys
        ]

        results = {}
        for task_id_binary in task_ids_binary:
            results[binary_to_hex(task_id_binary)] = self._task_table(
                ray.TaskID(task_id_binary))
        return results

def object_table(self, object_ref=None):
    """Fetch and parse the object table info for one or more object refs.

    Args:
        object_ref: An object ref to fetch information about. If this is
            None, then the entire object table is fetched.

    Returns:
        Information from the object table.
    """
    self._check_connected()

    if object_ref is not None:
        object_ref = ray.ObjectRef(hex_to_binary(object_ref))
        object_info = self.global_state_accessor.get_object_info(
            object_ref)
        if object_info is None:
            return {}
        else:
            object_location_info = gcs_utils.ObjectLocationInfo.FromString(
                object_info)
            return self._gen_object_info(object_location_info)
    else:
        object_table = self.global_state_accessor.get_object_table()
        results = {}
        for i in range(len(object_table)):
            object_location_info = gcs_utils.ObjectLocationInfo.FromString(
                object_table[i])
            results[binary_to_hex(object_location_info.object_id)] = \
                self._gen_object_info(object_location_info)
        return results

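# Hypothetical usage sketch for the object_table() accessor above, not part
# of the original snippet. It assumes a running Ray cluster and that the
# accessor is exposed through the GlobalState singleton (reachable as
# ray.state.state in the Ray versions these snippets appear to come from);
# the fields of the returned dictionaries vary between versions.
import ray

ray.init()
ref = ray.put("hello")
# Look up a single object by its hex ID string.
single = ray.state.state.object_table(ref.hex())
# Or fetch the entire object table.
everything = ray.state.state.object_table()
print(single, len(everything))
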
def _object_table(self, object_id):
    """Fetch and parse the object table information for a single object ID.

    Args:
        object_id: An object ID to get information about.

    Returns:
        A dictionary with information about the object ID in question.
    """
    # Allow the argument to be either an ObjectID or a hex string.
    if not isinstance(object_id, ray.ObjectID):
        object_id = ray.ObjectID(hex_to_binary(object_id))

    # Return information about a single object ID.
    message = self._execute_command(object_id, "RAY.TABLE_LOOKUP",
                                    gcs_utils.TablePrefix.Value("OBJECT"),
                                    "", object_id.binary())
    if message is None:
        return {}

    gcs_entry = gcs_utils.GcsEntry.FromString(message)
    assert len(gcs_entry.entries) > 0

    entry = gcs_utils.ObjectTableData.FromString(gcs_entry.entries[0])

    object_info = {
        "DataSize": entry.object_size,
        "Manager": entry.manager,
    }
    return object_info

def actor_table(self, actor_id=None):
    """Fetch and parse the actor table information for one or more actor IDs.

    Args:
        actor_id: A hex string of the actor ID to fetch information about.
            If this is None, then the actor table is fetched.

    Returns:
        Information from the actor table.
    """
    self._check_connected()
    if actor_id is not None:
        actor_id = ray.ActorID(hex_to_binary(actor_id))
        return self._actor_table(actor_id)
    else:
        actor_table_keys = list(
            self.redis_client.scan_iter(
                match=gcs_utils.TablePrefix_ACTOR_string + "*"))
        actor_ids_binary = [
            key[len(gcs_utils.TablePrefix_ACTOR_string):]
            for key in actor_table_keys
        ]

        results = {}
        for actor_id_binary in actor_ids_binary:
            results[binary_to_hex(actor_id_binary)] = self._actor_table(
                ray.ActorID(actor_id_binary))
        return results

def select_local_scheduler(local_schedulers, num_gpus, worker):
    """Select a local scheduler to assign this actor to.

    Args:
        local_schedulers: A list of dictionaries of information about the
            local schedulers.
        num_gpus (int): The number of GPUs that must be reserved for this
            actor.

    Returns:
        The ID of the local scheduler that has been chosen.

    Raises:
        Exception: An exception is raised if no local scheduler can be
            found with sufficient resources.
    """
    driver_id = worker.task_driver_id.id()
    local_scheduler_id = None
    # Loop through all of the local schedulers in a random order.
    local_schedulers = np.random.permutation(local_schedulers)
    for local_scheduler in local_schedulers:
        if local_scheduler["NumCPUs"] < 1:
            continue
        if local_scheduler["NumGPUs"] < num_gpus:
            continue
        if num_gpus == 0:
            local_scheduler_id = hex_to_binary(local_scheduler["DBClientID"])
            break
        else:
            # Try to reserve enough GPUs on this local scheduler.
            success = attempt_to_reserve_gpus(num_gpus, driver_id,
                                              local_scheduler, worker)
            if success:
                local_scheduler_id = hex_to_binary(
                    local_scheduler["DBClientID"])
                break

    if local_scheduler_id is None:
        raise Exception("Could not find a node with enough GPUs or other "
                        "resources to create this actor. The local "
                        "scheduler information is {}."
                        .format(local_schedulers))

    return local_scheduler_id

def select_local_scheduler(local_schedulers, num_gpus, worker):
    """Select a local scheduler to assign this actor to.

    Args:
        local_schedulers: A list of dictionaries of information about the
            local schedulers.
        num_gpus (int): The number of GPUs that must be reserved for this
            actor.

    Returns:
        A tuple of the ID of the local scheduler that has been chosen and a
            list of the gpu_ids that are reserved for the actor.

    Raises:
        Exception: An exception is raised if no local scheduler can be
            found with sufficient resources.
    """
    driver_id = worker.task_driver_id.id()
    if num_gpus == 0:
        local_scheduler_id = hex_to_binary(
            random.choice(local_schedulers)["DBClientID"])
        gpus_aquired = []
    else:
        # All of this logic is for finding a local scheduler that has
        # enough available GPUs.
        local_scheduler_id = None
        # Loop through all of the local schedulers.
        for local_scheduler in local_schedulers:
            # Try to reserve enough GPUs on this local scheduler.
            gpus_aquired = attempt_to_reserve_gpus(num_gpus, driver_id,
                                                   local_scheduler, worker)
            if len(gpus_aquired) == num_gpus:
                local_scheduler_id = hex_to_binary(
                    local_scheduler["DBClientID"])
                break
            else:
                # We should have either acquired as many GPUs as we need
                # or none.
                assert len(gpus_aquired) == 0

        if local_scheduler_id is None:
            raise Exception("Could not find a node with enough GPUs to "
                            "create this actor. The local scheduler "
                            "information is {}.".format(local_schedulers))

    return local_scheduler_id, gpus_aquired

def __setstate__(self, state):
    state["resources"] = json_to_resources(state["resources"])

    if state["status"] == Trial.RUNNING:
        state["status"] = Trial.PENDING
    for key in self._nonjson_fields:
        state[key] = cloudpickle.loads(hex_to_binary(state[key]))

    self.__dict__.update(state)
    validate_trainable(self.trainable_name)

    self.init_logdir()  # Create logdir if it does not exist

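# Hypothetical counterpart sketch, not part of the original snippets: the
# __setstate__ above only works if a matching __getstate__ cloudpickled
# every field named in self._nonjson_fields and hex-encoded the bytes with
# binary_to_hex, so that the remaining state stays JSON-serializable.
# resources_to_json is assumed here to be the inverse of json_to_resources.
def __getstate__(self):
    state = self.__dict__.copy()
    state["resources"] = resources_to_json(self.resources)
    for key in self._nonjson_fields:
        # cloudpickle.dumps returns raw bytes; binary_to_hex turns them
        # into an ASCII hex string, which __setstate__ reverses with
        # hex_to_binary + cloudpickle.loads.
        state[key] = binary_to_hex(cloudpickle.dumps(state.get(key)))
    return state
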
def _check():
    resp = requests.get(f"{webui_url}/jobs?view=summary")
    resp.raise_for_status()
    result = resp.json()
    assert result["result"] is True, resp.text
    job_summary = result["data"]["summary"]
    assert len(job_summary) == 1, resp.text
    one_job = job_summary[0]
    assert "jobId" in one_job
    job_id = one_job["jobId"]
    assert ray._raylet.JobID(hex_to_binary(one_job["jobId"]))
    assert "driverIpAddress" in one_job
    assert one_job["driverIpAddress"] == ip
    assert "driverPid" in one_job
    assert one_job["driverPid"] == str(os.getpid())
    assert "config" in one_job
    assert type(one_job["config"]) is dict
    assert "isDead" in one_job
    assert one_job["isDead"] is False
    assert "timestamp" in one_job
    one_job_summary_keys = one_job.keys()

    resp = requests.get(f"{webui_url}/jobs/{job_id}")
    resp.raise_for_status()
    result = resp.json()
    assert result["result"] is True, resp.text
    job_detail = result["data"]["detail"]
    assert "jobInfo" in job_detail
    assert len(one_job_summary_keys - job_detail["jobInfo"].keys()) == 0
    assert "jobActors" in job_detail
    job_actors = job_detail["jobActors"]
    assert len(job_actors) == 1, resp.text
    one_job_actor = job_actors[actor_id]
    assert "taskSpec" in one_job_actor
    assert type(one_job_actor["taskSpec"]) is dict
    assert "functionDescriptor" in one_job_actor["taskSpec"]
    assert type(one_job_actor["taskSpec"]["functionDescriptor"]) is dict
    assert "pid" in one_job_actor
    assert one_job_actor["pid"] == actor_pid
    check_actor_keys = [
        "name", "timestamp", "address", "actorId", "jobId", "state"
    ]
    for k in check_actor_keys:
        assert k in one_job_actor
    assert "jobWorkers" in job_detail
    job_workers = job_detail["jobWorkers"]
    assert len(job_workers) == 1, resp.text
    one_job_worker = job_workers[0]
    check_worker_keys = [
        "cmdline", "pid", "cpuTimes", "memoryInfo", "cpuPercent",
        "coreWorkerStats", "language", "jobId"
    ]
    for k in check_worker_keys:
        assert k in one_job_worker

def __setstate__(self, state):
    logger_started = state.pop("__logger_started__")
    state["resources"] = json_to_resources(state["resources"])
    for key in [
            "_checkpoint", "config", "custom_loggers", "sync_function"
    ]:
        state[key] = cloudpickle.loads(hex_to_binary(state[key]))

    self.__dict__.update(state)
    Trial._registration_check(self.trainable_name)
    if logger_started:
        self.init_logger()

def __setstate__(self, state):
    logger_started = state.pop("__logger_started__")
    state["resources"] = json_to_resources(state["resources"])

    if state["status"] == Trial.RUNNING:
        state["status"] = Trial.PENDING
    for key in self._nonjson_fields:
        state[key] = cloudpickle.loads(hex_to_binary(state[key]))

    self.__dict__.update(state)
    validate_trainable(self.trainable_name)
    if logger_started:
        self.init_logger()

def __setstate__(self, state):
    logger_started = state.pop("__logger_started__")
    state["resources"] = json_to_resources(state["resources"])

    if state["status"] == Trial.RUNNING:
        state["status"] = Trial.PENDING
    for key in self._nonjson_fields:
        state[key] = cloudpickle.loads(hex_to_binary(state[key]))

    self.__dict__.update(state)
    Trial._registration_check(self.trainable_name)
    if logger_started:
        self.init_logger()

def __setstate__(self, state):
    logger_started = state.pop("__logger_started__")
    state["resources"] = json_to_resources(state["resources"])
    for key in [
            "_checkpoint", "config", "custom_loggers", "sync_function",
            "last_result"
    ]:
        state[key] = cloudpickle.loads(hex_to_binary(state[key]))

    self.__dict__.update(state)
    Trial._registration_check(self.trainable_name)
    if logger_started:
        self.init_logger()

def index():
    """TODO (dsuo): Add comments.

    TODO (dsuo): Maybe care about batching
    TODO (dsuo): REST is nice conceptually, but probably too much overhead
    TODO (dsuo): Move logic to C++ (keep as head node process)

    NOTE: We do an extra serialize during get and an extra deserialize
    during put.
    """
    if request.method == 'POST':
        raw_object_id = request.files['object_id'].read()
        object_id = ray.pyarrow.plasma.ObjectID(raw_object_id)

        # NOTE (dsuo): we should use readinto in the future if possible.
        # Otherwise, we create a throwaway buffer when we read the whole
        # stream of data. Might look something like this:
        #
        #   request.files['value'].readinto(buf)
        #
        # Unfortunately, the SpooledTemporaryFile behind
        # request.files['value'] doesn't implement readinto.
        # See here: https://bugs.python.org/issue32600.
        data = request.files['value'].read()

        # Get a memoryview buffer of type unsigned bytes.
        buf = memoryview(plasma_client.create(object_id,
                                              len(data))).cast("B")

        # Copy data into the plasma buffer.
        buf[:] = data
        plasma_client.seal(object_id)

        return raw_object_id, 402
    elif request.method == 'GET' and 'object_ids' in request.args:
        object_ids = [
            ray.pyarrow.plasma.ObjectID(hex_to_binary(object_id))
            for object_id in request.args['object_ids'].split(",")
        ]

        # Fetch remote objects.
        # TODO (dsuo): maybe care about batching
        # NOTE: this is a really flaky test for "simple value"
        # NOTE: we don't support retrieving multiple objectIDs at a time
        data = plasma_client.get_buffers(object_ids)[0]

        # Return an appropriate return code?
        return send_file(io.BytesIO(data.to_pybytes()),
                         mimetype="application/octet-stream")
    else:
        return '''

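# Hypothetical client-side sketch for the Flask handler above, not part of
# the original snippet. It assumes the handler is mounted at the root of a
# server reachable at BASE_URL (made up here), and that object IDs are
# 20 raw bytes (40 hex characters), as plasma ObjectIDs are.
import requests

BASE_URL = "http://localhost:5000"  # assumed address of the object server
object_id = b"\x00" * 20            # 20-byte plasma object ID

# PUT an object: the handler reads 'object_id' and 'value' as file fields.
requests.post(
    BASE_URL + "/",
    files={"object_id": object_id, "value": b"hello plasma"})

# GET it back: the handler expects comma-separated hex object IDs.
resp = requests.get(BASE_URL + "/", params={"object_ids": object_id.hex()})
print(resp.content)  # b"hello plasma"
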
def cleanup_task_table(self):
    """Clean up global state for failed local schedulers.

    This marks any tasks that were scheduled on dead local schedulers as
    TASK_STATUS_LOST. A local scheduler is deemed dead if it is in
    self.dead_local_schedulers.
    """
    tasks = self.state.task_table()
    num_tasks_updated = 0
    for task_id, task in tasks.items():
        # See if the corresponding local scheduler is alive.
        if task["LocalSchedulerID"] in self.dead_local_schedulers:
            # If the task is scheduled on a dead local scheduler, mark the
            # task as lost.
            key = binary_to_object_id(hex_to_binary(task_id))
            ok = self.state._execute_command(
                key, "RAY.TASK_TABLE_UPDATE", hex_to_binary(task_id),
                ray.experimental.state.TASK_STATUS_LOST, NIL_ID)
            if ok != b"OK":
                log.warn("Failed to update lost task for dead scheduler.")
            num_tasks_updated += 1
    if num_tasks_updated > 0:
        log.warn("Marked {} tasks as lost.".format(num_tasks_updated))

def __setstate__(self, state):
    state["resources"] = json_to_resources(state["resources"])

    if state["status"] == Trial.RUNNING:
        state["status"] = Trial.PENDING
    for key in self._nonjson_fields:
        state[key] = cloudpickle.loads(hex_to_binary(state[key]))

    self.__dict__.update(state)
    validate_trainable(self.trainable_name)

    # Avoid creating logdir in client mode for returned trial results,
    # since the dir might not be creatable locally. TODO(ekl) this is kind
    # of a hack.
    if not ray.util.client.ray.is_connected():
        self.init_logdir()  # Create logdir if it does not exist

def placement_group_table(self, placement_group_id=None):
    """Fetch and parse the placement group table information.

    Args:
        placement_group_id: A placement group ID to fetch information
            about. If this is None, then the entire placement group table
            is fetched (not yet implemented in this version).

    Returns:
        Information from the placement group table.
    """
    self._check_connected()
    if placement_group_id is not None:
        placement_group_id = ray.PlacementGroupID(
            hex_to_binary(placement_group_id.hex()))
        placement_group_info = (
            self.global_state_accessor.get_placement_group_info(
                placement_group_id))
        if placement_group_info is None:
            return {}
        else:
            placement_group_info = (gcs_utils.PlacementGroupTableData.
                                    FromString(placement_group_info))
            return self._gen_placement_group_info(placement_group_info)
    else:
        raise NotImplementedError(
            "Getting all placement groups is not implemented yet.")

def get_placement_group(placement_group_name: str):
    """Get a placement group object with a global name.

    Returns:
        The placement group object with the given name.

    Raises:
        ValueError: If the name is empty or no placement group with the
            given name exists.
    """
    if not placement_group_name:
        raise ValueError(
            "Please supply a non-empty value to get_placement_group")
    worker = ray.worker.global_worker
    worker.check_connected()
    placement_group_info = ray.state.state.get_placement_group_by_name(
        placement_group_name)
    if placement_group_info is None:
        raise ValueError(
            f"Failed to look up placement group with name: "
            f"{placement_group_name}")
    else:
        return PlacementGroup(
            PlacementGroupID(
                hex_to_binary(placement_group_info["placement_group_id"])))

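# Hypothetical usage sketch, not part of the original snippet: retrieving a
# named placement group from another driver or worker. It assumes a running
# Ray cluster and that the group was created elsewhere with a name, e.g.
# via ray.util.placement_group(..., name="inference_pg"); the name
# "inference_pg" is made up for illustration.
import ray
from ray.util.placement_group import get_placement_group

ray.init(address="auto")
pg = get_placement_group("inference_pg")
# The handle can then be used for scheduling, e.g.:
#   my_task.options(placement_group=pg).remote()
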
def _job_table(self, job_id):
    """Fetch and parse the job table information for a single job ID.

    Args:
        job_id: A job ID or hex string to get information about.

    Returns:
        A dictionary with information about the job ID in question.
    """
    # Allow the argument to be either a JobID or a hex string.
    if not isinstance(job_id, ray.JobID):
        assert isinstance(job_id, str)
        job_id = ray.JobID(hex_to_binary(job_id))

    # Return information about a single job ID.
    message = self.redis_client.execute_command(
        "RAY.TABLE_LOOKUP", gcs_utils.TablePrefix.Value("JOB"), "",
        job_id.binary())

    if message is None:
        return {}

    gcs_entry = gcs_utils.GcsEntry.FromString(message)

    assert len(gcs_entry.entries) > 0
    job_info = {}

    for i in range(len(gcs_entry.entries)):
        entry = gcs_utils.JobTableData.FromString(gcs_entry.entries[i])
        assert entry.job_id == job_id.binary()
        job_info["JobID"] = job_id.hex()
        job_info["NodeManagerAddress"] = entry.node_manager_address
        job_info["DriverPid"] = entry.driver_pid
        if entry.is_dead:
            job_info["StopTime"] = entry.timestamp
        else:
            job_info["StartTime"] = entry.timestamp

    return job_info

def task_table(self, task_id=None):
    """Fetch and parse the task table information for one or more task IDs.

    Args:
        task_id: A hex string of the task ID to fetch information about. If
            this is None, then the task object table is fetched.

    Returns:
        Information from the task table.
    """
    self._check_connected()
    if task_id is not None:
        return self._task_table(hex_to_binary(task_id))
    else:
        task_table_keys = self.redis_client.keys(TASK_PREFIX + "*")
        results = {}
        for key in task_table_keys:
            task_id_binary = key[len(TASK_PREFIX):]
            results[binary_to_hex(task_id_binary)] = self._task_table(
                task_id_binary)
        return results

def _object_table(self, object_id):
    """Fetch and parse the object table information for a single object ID.

    Args:
        object_id: An object ID (or hex string) to get information about.

    Returns:
        A dictionary with information about the object ID in question.
    """
    # Allow the argument to be either an ObjectID or a hex string.
    if not isinstance(object_id, ray.local_scheduler.ObjectID):
        object_id = ray.local_scheduler.ObjectID(hex_to_binary(object_id))

    # Return information about a single object ID.
    object_locations = self._execute_command(object_id,
                                             "RAY.OBJECT_TABLE_LOOKUP",
                                             object_id.id())
    if object_locations is not None:
        manager_ids = [
            binary_to_hex(manager_id) for manager_id in object_locations
        ]
    else:
        manager_ids = None

    result_table_response = self._execute_command(
        object_id, "RAY.RESULT_TABLE_LOOKUP", object_id.id())
    result_table_message = ResultTableReply.GetRootAsResultTableReply(
        result_table_response, 0)

    result = {
        "ManagerIDs": manager_ids,
        "TaskID": binary_to_hex(result_table_message.TaskId()),
        "IsPut": bool(result_table_message.IsPut()),
        "DataSize": result_table_message.DataSize(),
        "Hash": binary_to_hex(result_table_message.Hash())
    }

    return result

def node_resource_table(self, node_id=None):
    """Fetch and parse the node resource table info for one node.

    Args:
        node_id: A node ID to fetch information about.

    Returns:
        Information from the node resource table.
    """
    self._check_connected()

    node_id = ray.NodeID(hex_to_binary(node_id))
    node_resource_bytes = \
        self.global_state_accessor.get_node_resource_info(node_id)
    if node_resource_bytes is None:
        return {}
    else:
        node_resource_info = gcs_utils.ResourceMap.FromString(
            node_resource_bytes)
        return {
            key: value.resource_capacity
            for key, value in node_resource_info.items.items()
        }

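# Hypothetical usage sketch, not part of the original snippet: the node IDs
# accepted above are the hex "NodeID" strings reported by ray.nodes(). It
# assumes a running cluster and that this accessor hangs off the same
# GlobalState singleton used elsewhere (ray.state.state).
import ray

ray.init()
node_id = ray.nodes()[0]["NodeID"]
resources = ray.state.state.node_resource_table(node_id)
print(resources)  # e.g. {"CPU": 8.0, "memory": ...}
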
def task_table(self, task_id=None):
    """Fetch and parse the task table information for one or more task IDs.

    Args:
        task_id: A hex string of the task ID to fetch information about. If
            this is None, then the task object table is fetched.

    Returns:
        Information from the task table.
    """
    self._check_connected()
    if task_id is not None:
        task_id = ray.local_scheduler.ObjectID(hex_to_binary(task_id))
        return self._task_table(task_id)
    else:
        task_table_keys = self._keys(TASK_PREFIX + "*")
        results = {}
        for key in task_table_keys:
            task_id_binary = key[len(TASK_PREFIX):]
            results[binary_to_hex(task_id_binary)] = self._task_table(
                ray.local_scheduler.ObjectID(task_id_binary))
        return results

def _from_cloudpickle(self, obj):
    return cloudpickle.loads(hex_to_binary(obj["value"]))

def _load_trial_info(self, trial_info):
    trial_info["config"] = cloudpickle.loads(
        hex_to_binary(trial_info["config"]))
    trial_info["result"] = cloudpickle.loads(
        hex_to_binary(trial_info["result"]))

def hex_to_object_id(hex_id):
    return ray.local_scheduler.ObjectID(hex_to_binary(hex_id))

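# For context: every snippet here leans on the hex_to_binary / binary_to_hex
# helpers. Below is a minimal sketch of what they do, assuming the usual
# binascii-based implementation; the real Ray utilities may differ in
# details such as decoding to str. The binary_to_object_id helper used in a
# few snippets above simply wraps such raw bytes in the ObjectID class of
# whatever Ray version is in use.
import binascii


def hex_to_binary(hex_identifier):
    # "ff00" -> b"\xff\x00"
    return binascii.unhexlify(hex_identifier)


def binary_to_hex(identifier):
    # b"\xff\x00" -> "ff00"
    hex_identifier = binascii.hexlify(identifier)
    if not isinstance(hex_identifier, str):
        hex_identifier = hex_identifier.decode()
    return hex_identifier
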
def object_hook(self, obj):
    if obj.get("_type") == "function":
        return cloudpickle.loads(hex_to_binary(obj["value"]))
    return obj

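# Hypothetical encoder-side sketch, not part of the original snippets: the
# object_hook and _from_cloudpickle decoders above expect values written as
# {"_type": "function", "value": <hex of cloudpickle bytes>}. A matching
# json.JSONEncoder.default could look roughly like this, using binary_to_hex
# as sketched above; the class name SafeJsonEncoder is made up for
# illustration.
import json

import cloudpickle


class SafeJsonEncoder(json.JSONEncoder):
    def default(self, obj):
        # default() is only invoked for objects json can't serialize on its
        # own; cloudpickle them and hex-encode the bytes so the decoder's
        # object_hook can restore them.
        return {
            "_type": "function",
            "value": binary_to_hex(cloudpickle.dumps(obj)),
        }

# Usage would pair json.dumps(data, cls=SafeJsonEncoder) with
# json.loads(text, object_hook=...) on the decoding side.
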