def object_table(self, object_id=None):
    """Fetch and parse the object table info for one or more object IDs.

    Args:
        object_id: An object ID to fetch information about. If this is
            None, then the entire object table is fetched.

    Returns:
        Information from the object table.
    """
    self._check_connected()
    if object_id is not None:
        # Single-object lookup.
        return self._object_table(object_id)
    # Collect every object ID that appears under either key prefix.
    ids_binary = {
        key[len(OBJECT_INFO_PREFIX):]
        for key in self._keys(OBJECT_INFO_PREFIX + "*")
    }
    ids_binary |= {
        key[len(OBJECT_LOCATION_PREFIX):]
        for key in self._keys(OBJECT_LOCATION_PREFIX + "*")
    }
    return {
        binary_to_object_id(id_binary): self._object_table(
            binary_to_object_id(id_binary))
        for id_binary in ids_binary
    }
def object_table(self, object_id=None):
    """Fetch and parse the object table info for one or more object IDs.

    Args:
        object_id: An object ID to fetch information about. If this is
            None, then the entire object table is fetched.

    Returns:
        Information from the object table.
    """
    self._check_connected()
    if object_id is not None:
        # Single-object lookup.
        return self._object_table(object_id)
    # Strip the table prefix from every matching key to recover the
    # set of binary object IDs, then fetch each entry.
    prefix = gcs_utils.TablePrefix_OBJECT_string
    ids_binary = {key[len(prefix):] for key in self._keys(prefix + "*")}
    results = {}
    for id_binary in ids_binary:
        results[binary_to_object_id(id_binary)] = (
            self._object_table(binary_to_object_id(id_binary)))
    return results
def object_table(self, object_id=None):
    """Fetch and parse the object table info for one or more object IDs.

    Args:
        object_id: An object ID to fetch information about. If this is
            None, then the entire object table is fetched.

    Returns:
        Information from the object table.
    """
    self._check_connected()
    if object_id is not None:
        # Single-object lookup.
        return self._object_table(object_id)
    # The table is split across two key prefixes; merge the object IDs
    # found under both before fetching each entry.
    info_ids = {
        key[len(OBJECT_INFO_PREFIX):]
        for key in self._keys(OBJECT_INFO_PREFIX + "*")
    }
    location_ids = {
        key[len(OBJECT_LOCATION_PREFIX):]
        for key in self._keys(OBJECT_LOCATION_PREFIX + "*")
    }
    results = {}
    for id_binary in info_ids | location_ids:
        results[binary_to_object_id(id_binary)] = (
            self._object_table(binary_to_object_id(id_binary)))
    return results
def object_table(self, object_id=None):
    """Fetch and parse the object table info for one or more object IDs.

    Args:
        object_id: An object ID to fetch information about. If this is
            None, then the entire object table is fetched.

    Returns:
        Information from the object table.
    """
    self._check_connected()
    if object_id is not None:
        # Single-object lookup.
        return self._object_table(object_id)
    # Recover the set of binary object IDs by stripping the table
    # prefix from each matching Redis key.
    prefix = ray.gcs_utils.TablePrefix_OBJECT_string
    ids_binary = {key[len(prefix):] for key in self._keys(prefix + "*")}
    return {
        binary_to_object_id(id_binary): self._object_table(
            binary_to_object_id(id_binary))
        for id_binary in ids_binary
    }
def _task_table(self, task_id):
    """Fetch and parse the task table information for a single task ID.

    Args:
        task_id: The task ID to get information about.

    Returns:
        A dictionary with information about the task ID in question.
        TASK_STATUS_MAPPING should be used to parse the "State" field
        into a human-readable string.

    Raises:
        Exception: If the task table has no entry for this task ID.
    """
    # The command is routed by task_id so it hits the shard that owns
    # this task's entry.
    task_table_response = self._execute_command(task_id,
                                                "RAY.TASK_TABLE_GET",
                                                task_id.id())
    if task_table_response is None:
        raise Exception("There is no entry for task ID {} in the task "
                        "table.".format(binary_to_hex(task_id.id())))
    task_table_message = TaskReply.GetRootAsTaskReply(task_table_response,
                                                      0)
    # The task spec is nested inside the reply as a raw flatbuffer.
    task_spec = task_table_message.TaskSpec()
    task_spec_message = TaskInfo.GetRootAsTaskInfo(task_spec, 0)
    args = []
    for i in range(task_spec_message.ArgsLength()):
        arg = task_spec_message.Args(i)
        # An argument is either passed by reference (non-empty object
        # ID) or by value (pickled data).
        if len(arg.ObjectId()) != 0:
            args.append(binary_to_object_id(arg.ObjectId()))
        else:
            args.append(pickle.loads(arg.Data()))
    # TODO(atumanov): Instead of hard coding these indices, we should use
    # the flatbuffer constants.
    assert task_spec_message.RequiredResourcesLength() == 3
    required_resources = {
        "CPUs": task_spec_message.RequiredResources(0),
        "GPUs": task_spec_message.RequiredResources(1),
        "CustomResource": task_spec_message.RequiredResources(2)}
    task_spec_info = {
        "DriverID": binary_to_hex(task_spec_message.DriverId()),
        "TaskID": binary_to_hex(task_spec_message.TaskId()),
        "ParentTaskID": binary_to_hex(task_spec_message.ParentTaskId()),
        "ParentCounter": task_spec_message.ParentCounter(),
        "ActorID": binary_to_hex(task_spec_message.ActorId()),
        "ActorCounter": task_spec_message.ActorCounter(),
        "FunctionID": binary_to_hex(task_spec_message.FunctionId()),
        "Args": args,
        "ReturnObjectIDs": [binary_to_object_id(
            task_spec_message.Returns(i)) for i in range(
            task_spec_message.ReturnsLength())],
        "RequiredResources": required_resources}
    # "State" is left as the raw integer; callers translate it with
    # TASK_STATUS_MAPPING.
    return {"State": task_table_message.State(),
            "LocalSchedulerID": binary_to_hex(
                task_table_message.LocalSchedulerId()),
            "TaskSpec": task_spec_info}
def to_shard_index(id_bin):
    """Map a binary ID to the index of the Redis shard that stores it.

    Task IDs and object IDs have different lengths, so the ID kind is
    inferred from the byte length before hashing.
    """
    num_shards = len(ray.state.state.redis_clients)
    if len(id_bin) == ray.TaskID.size():
        return binary_to_task_id(id_bin).redis_shard_hash() % num_shards
    return binary_to_object_id(id_bin).redis_shard_hash() % num_shards
def cleanup_task_table(self):
    """Clean up global state for failed local schedulers.

    This marks any tasks that were scheduled on dead local schedulers as
    TASK_STATUS_LOST. A local scheduler is deemed dead if it is in
    self.dead_local_schedulers.
    """
    tasks = self.state.task_table()
    num_tasks_updated = 0
    for task_id, task in tasks.items():
        # See if the corresponding local scheduler is alive.
        if task["LocalSchedulerID"] not in self.dead_local_schedulers:
            continue
        # Remove dummy objects returned by actor tasks from any plasma
        # manager. Although the objects may still exist in that object
        # store, this deletion makes them effectively unreachable by any
        # local scheduler connected to a different store.
        # TODO(swang): Actually remove the objects from the object store,
        # so that the reconstructed actor can reuse the same object store.
        if hex_to_binary(task["TaskSpec"]["ActorID"]) != NIL_ACTOR_ID:
            # By convention the last return value of an actor task is the
            # dummy object used for ordering dependencies.
            dummy_object_id = task["TaskSpec"]["ReturnObjectIDs"][-1]
            obj = self.state.object_table(dummy_object_id)
            manager_ids = obj["ManagerIDs"]
            if manager_ids is not None:
                # The dummy object should exist on at most one plasma
                # manager, the manager associated with the local scheduler
                # that died.
                assert len(manager_ids) <= 1
                # Remove the dummy object from the plasma manager
                # associated with the dead local scheduler, if any.
                for manager in manager_ids:
                    ok = self.state._execute_command(
                        dummy_object_id, "RAY.OBJECT_TABLE_REMOVE",
                        dummy_object_id.id(), hex_to_binary(manager))
                    if ok != b"OK":
                        # Best-effort: log and continue so the task is
                        # still marked as lost below.
                        log.warn("Failed to remove object location for "
                                 "dead plasma manager.")
        # If the task is scheduled on a dead local scheduler, mark the
        # task as lost. The first argument routes the command to the
        # shard that owns this task's entry.
        key = binary_to_object_id(hex_to_binary(task_id))
        ok = self.state._execute_command(
            key, "RAY.TASK_TABLE_UPDATE", hex_to_binary(task_id),
            ray.experimental.state.TASK_STATUS_LOST, NIL_ID,
            task["ExecutionDependenciesString"], task["SpillbackCount"])
        if ok != b"OK":
            log.warn("Failed to update lost task for dead scheduler.")
        num_tasks_updated += 1
    if num_tasks_updated > 0:
        log.warn("Marked {} tasks as lost.".format(num_tasks_updated))
def cleanup_task_table(self):
    """Clean up global state for failed local schedulers.

    This marks any tasks that were scheduled on dead local schedulers as
    TASK_STATUS_LOST. A local scheduler is deemed dead if it is in
    self.dead_local_schedulers.
    """
    tasks = self.state.task_table()
    num_tasks_updated = 0
    for task_id, task in tasks.items():
        # See if the corresponding local scheduler is alive.
        if task["LocalSchedulerID"] not in self.dead_local_schedulers:
            continue
        # Remove dummy objects returned by actor tasks from any plasma
        # manager. Although the objects may still exist in that object
        # store, this deletion makes them effectively unreachable by any
        # local scheduler connected to a different store.
        # TODO(swang): Actually remove the objects from the object store,
        # so that the reconstructed actor can reuse the same object store.
        if hex_to_binary(task["TaskSpec"]["ActorID"]) != NIL_ACTOR_ID:
            # By convention the last return value of an actor task is the
            # dummy object used for ordering dependencies.
            dummy_object_id = task["TaskSpec"]["ReturnObjectIDs"][-1]
            obj = self.state.object_table(dummy_object_id)
            manager_ids = obj["ManagerIDs"]
            if manager_ids is not None:
                # The dummy object should exist on at most one plasma
                # manager, the manager associated with the local scheduler
                # that died.
                assert len(manager_ids) <= 1
                # Remove the dummy object from the plasma manager
                # associated with the dead local scheduler, if any.
                for manager in manager_ids:
                    ok = self.state._execute_command(
                        dummy_object_id, "RAY.OBJECT_TABLE_REMOVE",
                        dummy_object_id.id(), hex_to_binary(manager))
                    if ok != b"OK":
                        # Best-effort: log and continue so the task is
                        # still marked as lost below.
                        log.warn("Failed to remove object location for "
                                 "dead plasma manager.")
        # If the task is scheduled on a dead local scheduler, mark the
        # task as lost. The first argument routes the command to the
        # shard that owns this task's entry.
        key = binary_to_object_id(hex_to_binary(task_id))
        ok = self.state._execute_command(
            key, "RAY.TASK_TABLE_UPDATE", hex_to_binary(task_id),
            ray.experimental.state.TASK_STATUS_LOST, NIL_ID,
            task["ExecutionDependenciesString"], task["SpillbackCount"])
        if ok != b"OK":
            log.warn("Failed to update lost task for dead scheduler.")
        num_tasks_updated += 1
    if num_tasks_updated > 0:
        log.warn("Marked {} tasks as lost.".format(num_tasks_updated))
def profile_table(self):
    """Fetch profiling information, keyed by hex component identifier."""
    prefix = ray.gcs_utils.TablePrefix_PROFILE_string
    keys = self._keys(prefix + "*")
    # Strip the table prefix to recover the binary component IDs.
    results = {}
    for component_id in [key[len(prefix):] for key in keys]:
        results[binary_to_hex(component_id)] = self._profile_table(
            binary_to_object_id(component_id))
    return results
def profile_table(self):
    """Fetch profiling events, grouped by the component that emitted them."""
    prefix = ray.gcs_utils.TablePrefix_PROFILE_string
    result = defaultdict(list)
    for key in self._keys(prefix + "*"):
        batch_id = key[len(prefix):]
        profile_data = self._profile_table(binary_to_object_id(batch_id))
        # Note that if keys are being evicted from Redis, then it is
        # possible that the batch will be evicted before we get it.
        if len(profile_data) > 0:
            component_id = profile_data[0]["component_id"]
            result[component_id].extend(profile_data)
    return dict(result)
def profile_table(self):
    """Fetch profiling events, grouped by the component that emitted them."""
    prefix = ray.gcs_utils.TablePrefix_PROFILE_string
    batch_ids = [key[len(prefix):] for key in self._keys(prefix + "*")]
    grouped = defaultdict(list)
    for batch_id in batch_ids:
        batch = self._profile_table(binary_to_object_id(batch_id))
        # Redis eviction can remove a batch between listing the keys and
        # fetching it, in which case the batch comes back empty.
        if not batch:
            continue
        grouped[batch[0]["component_id"]].extend(batch)
    return dict(grouped)
def cleanup_task_table(self):
    """Clean up global state for failed local schedulers.

    This marks any tasks that were scheduled on dead local schedulers as
    TASK_STATUS_LOST. A local scheduler is deemed dead if it is in
    self.dead_local_schedulers.
    """
    tasks = self.state.task_table()
    num_tasks_updated = 0
    for task_id, task in tasks.items():
        # Skip tasks whose local scheduler is still alive.
        if task["LocalSchedulerID"] not in self.dead_local_schedulers:
            continue
        # Mark the task as lost; the first argument routes the command
        # to the shard that owns this task's entry.
        key = binary_to_object_id(hex_to_binary(task_id))
        ok = self.state._execute_command(
            key, "RAY.TASK_TABLE_UPDATE", hex_to_binary(task_id),
            ray.experimental.state.TASK_STATUS_LOST, NIL_ID)
        if ok != b"OK":
            log.warn("Failed to update lost task for dead scheduler.")
        num_tasks_updated += 1
    if num_tasks_updated > 0:
        log.warn("Marked {} tasks as lost.".format(num_tasks_updated))
def _task_table(self, task_id_binary):
    """Fetch and parse the task table information for a single task ID.

    Args:
        task_id_binary: A string of bytes with the task ID to get
            information about.

    Returns:
        A dictionary with information about the task ID in question.

    Raises:
        Exception: If the task table has no entry for this task ID.
    """
    task_table_response = self.redis_client.execute_command(
        "RAY.TASK_TABLE_GET", task_id_binary)
    if task_table_response is None:
        raise Exception(
            "There is no entry for task ID {} in the task table.".format(
                binary_to_hex(task_id_binary)))
    task_table_message = TaskReply.GetRootAsTaskReply(
        task_table_response, 0)
    # The task spec is nested inside the reply as a raw flatbuffer.
    task_spec = task_table_message.TaskSpec()
    task_spec_message = TaskInfo.GetRootAsTaskInfo(task_spec, 0)
    args = []
    for i in range(task_spec_message.ArgsLength()):
        arg = task_spec_message.Args(i)
        # An argument is either passed by reference (non-empty object
        # ID) or by value (pickled data).
        if len(arg.ObjectId()) != 0:
            args.append(binary_to_object_id(arg.ObjectId()))
        else:
            args.append(pickle.loads(arg.Data()))
    # Resources are stored at fixed indices: 0 = CPUs, 1 = GPUs.
    assert task_spec_message.RequiredResourcesLength() == 2
    required_resources = {
        "CPUs": task_spec_message.RequiredResources(0),
        "GPUs": task_spec_message.RequiredResources(1)
    }
    task_spec_info = {
        "DriverID": binary_to_hex(task_spec_message.DriverId()),
        "TaskID": binary_to_hex(task_spec_message.TaskId()),
        "ParentTaskID": binary_to_hex(task_spec_message.ParentTaskId()),
        "ParentCounter": task_spec_message.ParentCounter(),
        "ActorID": binary_to_hex(task_spec_message.ActorId()),
        "ActorCounter": task_spec_message.ActorCounter(),
        "FunctionID": binary_to_hex(task_spec_message.FunctionId()),
        "Args": args,
        "ReturnObjectIDs": [
            binary_to_object_id(task_spec_message.Returns(i))
            for i in range(task_spec_message.ReturnsLength())
        ],
        "RequiredResources": required_resources
    }
    # Unlike the raw reply, "State" is translated to a human-readable
    # value via task_state_mapping.
    return {
        "State": task_state_mapping[task_table_message.State()],
        "LocalSchedulerID": binary_to_hex(
            task_table_message.LocalSchedulerId()),
        "TaskSpec": task_spec_info
    }
def ToShardIndex(index):
    """Return the index of the Redis shard that the binary ID hashes to."""
    shard_count = len(self.state.redis_clients)
    return binary_to_object_id(index).redis_shard_hash() % shard_count
def to_shard_index(id_bin):
    """Return the index of the Redis shard that the binary ID hashes to."""
    shard_count = len(self.state.redis_clients)
    return binary_to_object_id(id_bin).redis_shard_hash() % shard_count
def ToShardIndex(index):
    """Map a binary ID onto one of the connected Redis shards."""
    hashed = binary_to_object_id(index).redis_shard_hash()
    return hashed % len(self.state.redis_clients)
def to_shard_index(id_bin):
    """Map a binary ID onto one of the connected Redis shards."""
    hashed = binary_to_object_id(id_bin).redis_shard_hash()
    return hashed % len(self.state.redis_clients)