Example #1
    def object_table(self, object_id=None):
        """Fetch and parse the object table info for one or more object IDs.

        Args:
            object_id: An object ID to fetch information about. If this is
                None, then the entire object table is fetched.

        Returns:
            Information from the object table.
        """
        self._check_connected()
        if object_id is not None:
            # Return information about a single object ID.
            return self._object_table(object_id)
        else:
            # Return the entire object table.
            object_info_keys = self._keys(OBJECT_INFO_PREFIX + "*")
            object_location_keys = self._keys(OBJECT_LOCATION_PREFIX + "*")
            object_ids_binary = set(
                [key[len(OBJECT_INFO_PREFIX):] for key in object_info_keys] +
                [key[len(OBJECT_LOCATION_PREFIX):]
                 for key in object_location_keys])
            results = {}
            for object_id_binary in object_ids_binary:
                results[binary_to_object_id(object_id_binary)] = (
                    self._object_table(binary_to_object_id(object_id_binary)))
            return results
Example #2
    def object_table(self, object_id=None):
        """Fetch and parse the object table info for one or more object IDs.

        Args:
            object_id: An object ID to fetch information about. If this is
                None, then the entire object table is fetched.

        Returns:
            Information from the object table.
        """
        self._check_connected()
        if object_id is not None:
            # Return information about a single object ID.
            return self._object_table(object_id)
        else:
            # Return the entire object table.
            object_keys = self._keys(gcs_utils.TablePrefix_OBJECT_string + "*")
            object_ids_binary = {
                key[len(gcs_utils.TablePrefix_OBJECT_string):]
                for key in object_keys
            }

            results = {}
            for object_id_binary in object_ids_binary:
                results[binary_to_object_id(object_id_binary)] = (
                    self._object_table(binary_to_object_id(object_id_binary)))
            return results
Example #3
    def object_table(self, object_id=None):
        """Fetch and parse the object table info for one or more object IDs.

        Args:
            object_id: An object ID to fetch information about. If this is
                None, then the entire object table is fetched.

        Returns:
            Information from the object table.
        """
        self._check_connected()
        if object_id is not None:
            # Return information about a single object ID.
            return self._object_table(object_id)
        else:
            # Return the entire object table.
            object_info_keys = self._keys(OBJECT_INFO_PREFIX + "*")
            object_location_keys = self._keys(OBJECT_LOCATION_PREFIX + "*")
            object_ids_binary = set(
                [key[len(OBJECT_INFO_PREFIX):] for key in object_info_keys] + [
                    key[len(OBJECT_LOCATION_PREFIX):]
                    for key in object_location_keys
                ])
            results = {}
            for object_id_binary in object_ids_binary:
                results[binary_to_object_id(object_id_binary)] = (
                    self._object_table(binary_to_object_id(object_id_binary)))
            return results
Example #4
    def object_table(self, object_id=None):
        """Fetch and parse the object table info for one or more object IDs.

        Args:
            object_id: An object ID to fetch information about. If this is
                None, then the entire object table is fetched.

        Returns:
            Information from the object table.
        """
        self._check_connected()
        if object_id is not None:
            # Return information about a single object ID.
            return self._object_table(object_id)
        else:
            # Return the entire object table.
            object_keys = self._keys(ray.gcs_utils.TablePrefix_OBJECT_string +
                                     "*")
            object_ids_binary = {
                key[len(ray.gcs_utils.TablePrefix_OBJECT_string):]
                for key in object_keys
            }

            results = {}
            for object_id_binary in object_ids_binary:
                results[binary_to_object_id(object_id_binary)] = (
                    self._object_table(binary_to_object_id(object_id_binary)))
            return results
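All four object_table variants above follow the same pattern: enumerate the Redis keys under a known table prefix, strip that prefix to recover the raw binary ID, and wrap it with binary_to_object_id before fetching the per-object entry. Below is a minimal, self-contained sketch of that pattern; the ObjectID class and the prefix value are illustrative stand-ins, not Ray's real definitions.

# Minimal sketch of the prefix-stripping pattern used by object_table above.
# ObjectID and OBJECT_INFO_PREFIX are stand-ins for illustration only.

OBJECT_INFO_PREFIX = b"OI:"


class ObjectID(object):
    """Stand-in ID type that simply wraps the raw binary ID."""

    def __init__(self, binary_id):
        self._binary_id = binary_id

    def binary(self):
        return self._binary_id


def binary_to_object_id(binary_object_id):
    # In the Ray versions shown above this amounts to constructing an
    # object ID directly from the raw bytes.
    return ObjectID(binary_object_id)


# Keys as they might come back from a Redis KEYS / SCAN call.
object_info_keys = [OBJECT_INFO_PREFIX + b"\x01" * 20,
                    OBJECT_INFO_PREFIX + b"\x02" * 20]

object_ids = [binary_to_object_id(key[len(OBJECT_INFO_PREFIX):])
              for key in object_info_keys]
assert all(len(oid.binary()) == 20 for oid in object_ids)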
Example #5
    def _task_table(self, task_id):
        """Fetch and parse the task table information for a single task ID.

        Args:
            task_id: The ID of the task to fetch information about.

        Returns:
            A dictionary with information about the task ID in question.
                TASK_STATUS_MAPPING should be used to parse the "State" field
                into a human-readable string.
        """
        task_table_response = self._execute_command(task_id,
                                                    "RAY.TASK_TABLE_GET",
                                                    task_id.id())
        if task_table_response is None:
            raise Exception("There is no entry for task ID {} in the task "
                            "table.".format(binary_to_hex(task_id.id())))
        task_table_message = TaskReply.GetRootAsTaskReply(task_table_response,
                                                          0)
        task_spec = task_table_message.TaskSpec()
        task_spec_message = TaskInfo.GetRootAsTaskInfo(task_spec, 0)
        args = []
        for i in range(task_spec_message.ArgsLength()):
            arg = task_spec_message.Args(i)
            if len(arg.ObjectId()) != 0:
                args.append(binary_to_object_id(arg.ObjectId()))
            else:
                args.append(pickle.loads(arg.Data()))
        # TODO(atumanov): Instead of hard coding these indices, we should use
        # the flatbuffer constants.
        assert task_spec_message.RequiredResourcesLength() == 3
        required_resources = {
            "CPUs": task_spec_message.RequiredResources(0),
            "GPUs": task_spec_message.RequiredResources(1),
            "CustomResource": task_spec_message.RequiredResources(2)}
        task_spec_info = {
            "DriverID": binary_to_hex(task_spec_message.DriverId()),
            "TaskID": binary_to_hex(task_spec_message.TaskId()),
            "ParentTaskID": binary_to_hex(task_spec_message.ParentTaskId()),
            "ParentCounter": task_spec_message.ParentCounter(),
            "ActorID": binary_to_hex(task_spec_message.ActorId()),
            "ActorCounter": task_spec_message.ActorCounter(),
            "FunctionID": binary_to_hex(task_spec_message.FunctionId()),
            "Args": args,
            "ReturnObjectIDs": [binary_to_object_id(
                                    task_spec_message.Returns(i))
                                for i in range(
                                    task_spec_message.ReturnsLength())],
            "RequiredResources": required_resources}

        return {"State": task_table_message.State(),
                "LocalSchedulerID": binary_to_hex(
                    task_table_message.LocalSchedulerId()),
                "TaskSpec": task_spec_info}
Example #6
 def to_shard_index(id_bin):
     if len(id_bin) == ray.TaskID.size():
         return binary_to_task_id(id_bin).redis_shard_hash() % len(
             ray.state.state.redis_clients)
     else:
         return binary_to_object_id(id_bin).redis_shard_hash() % len(
             ray.state.state.redis_clients)
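to_shard_index above routes a binary ID to one of several Redis shards by hashing it and taking the result modulo the number of shard clients. The standalone sketch below captures that idea with an illustrative hash in place of redis_shard_hash().

# Standalone sketch of the shard-selection idea above. The hash used here is
# illustrative; Ray's redis_shard_hash() has its own implementation.
NUM_REDIS_SHARDS = 4  # stand-in for len(ray.state.state.redis_clients)


def to_shard_index_sketch(id_bin):
    # Map a binary ID deterministically onto one of the shards.
    return int.from_bytes(id_bin[:8], "little") % NUM_REDIS_SHARDS


shard = to_shard_index_sketch(b"\x05" * 20)
assert 0 <= shard < NUM_REDIS_SHARDS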
Example #7
    def cleanup_task_table(self):
        """Clean up global state for failed local schedulers.

        This marks any tasks that were scheduled on dead local schedulers as
        TASK_STATUS_LOST. A local scheduler is deemed dead if it is in
        self.dead_local_schedulers.
        """
        tasks = self.state.task_table()
        num_tasks_updated = 0
        for task_id, task in tasks.items():
            # See if the corresponding local scheduler is alive.
            if task["LocalSchedulerID"] not in self.dead_local_schedulers:
                continue

            # Remove dummy objects returned by actor tasks from any plasma
            # manager. Although the objects may still exist in that object
            # store, this deletion makes them effectively unreachable by any
            # local scheduler connected to a different store.
            # TODO(swang): Actually remove the objects from the object store,
            # so that the reconstructed actor can reuse the same object store.
            if hex_to_binary(task["TaskSpec"]["ActorID"]) != NIL_ACTOR_ID:
                dummy_object_id = task["TaskSpec"]["ReturnObjectIDs"][-1]
                obj = self.state.object_table(dummy_object_id)
                manager_ids = obj["ManagerIDs"]
                if manager_ids is not None:
                    # The dummy object should exist on at most one plasma
                    # manager, the manager associated with the local scheduler
                    # that died.
                    assert len(manager_ids) <= 1
                    # Remove the dummy object from the plasma manager
                    # associated with the dead local scheduler, if any.
                    for manager in manager_ids:
                        ok = self.state._execute_command(
                            dummy_object_id, "RAY.OBJECT_TABLE_REMOVE",
                            dummy_object_id.id(), hex_to_binary(manager))
                        if ok != b"OK":
                            log.warn("Failed to remove object location for "
                                     "dead plasma manager.")

            # If the task is scheduled on a dead local scheduler, mark the
            # task as lost.
            key = binary_to_object_id(hex_to_binary(task_id))
            ok = self.state._execute_command(
                key, "RAY.TASK_TABLE_UPDATE",
                hex_to_binary(task_id),
                ray.experimental.state.TASK_STATUS_LOST, NIL_ID,
                task["ExecutionDependenciesString"],
                task["SpillbackCount"])
            if ok != b"OK":
                log.warn("Failed to update lost task for dead scheduler.")
            num_tasks_updated += 1

        if num_tasks_updated > 0:
            log.warn("Marked {} tasks as lost.".format(num_tasks_updated))
Example #8
    def profile_table(self):
        profile_table_keys = self._keys(
            ray.gcs_utils.TablePrefix_PROFILE_string + "*")
        component_identifiers_binary = [
            key[len(ray.gcs_utils.TablePrefix_PROFILE_string):]
            for key in profile_table_keys
        ]

        return {
            binary_to_hex(component_id):
            self._profile_table(binary_to_object_id(component_id))
            for component_id in component_identifiers_binary
        }
Example #9
    def profile_table(self):
        profile_table_keys = self._keys(
            ray.gcs_utils.TablePrefix_PROFILE_string + "*")
        batch_identifiers_binary = [
            key[len(ray.gcs_utils.TablePrefix_PROFILE_string):]
            for key in profile_table_keys
        ]

        result = defaultdict(list)
        for batch_id in batch_identifiers_binary:
            profile_data = self._profile_table(binary_to_object_id(batch_id))
            # Note that if keys are being evicted from Redis, then it is
            # possible that the batch will be evicted before we get it.
            if len(profile_data) > 0:
                component_id = profile_data[0]["component_id"]
                result[component_id].extend(profile_data)

        return dict(result)
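This profile_table variant groups profile events by component instead of returning one entry per Redis key: each batch fetched from the profile table is appended to the list for its component_id, and empty (evicted) batches are skipped. Below is a small self-contained sketch of that grouping step, with illustrative batch contents.

# Self-contained sketch of the grouping step above. The batch contents are
# illustrative; real batches come from self._profile_table(...).
from collections import defaultdict

batches = [
    [{"component_id": b"worker-1", "event_type": "task"}],
    [{"component_id": b"worker-2", "event_type": "task"}],
    [{"component_id": b"worker-1", "event_type": "get"}],
]

result = defaultdict(list)
for profile_data in batches:
    # Skip batches that were evicted (empty) before they could be read.
    if len(profile_data) > 0:
        result[profile_data[0]["component_id"]].extend(profile_data)

print({key: len(events) for key, events in result.items()})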
Example #10
    def cleanup_task_table(self):
        """Clean up global state for failed local schedulers.

        This marks any tasks that were scheduled on dead local schedulers as
        TASK_STATUS_LOST. A local scheduler is deemed dead if it is in
        self.dead_local_schedulers.
        """
        tasks = self.state.task_table()
        num_tasks_updated = 0
        for task_id, task in tasks.items():
            # See if the corresponding local scheduler is alive.
            if task["LocalSchedulerID"] in self.dead_local_schedulers:
                # If the task is scheduled on a dead local scheduler, mark
                # the task as lost.
                key = binary_to_object_id(hex_to_binary(task_id))
                ok = self.state._execute_command(
                    key, "RAY.TASK_TABLE_UPDATE", hex_to_binary(task_id),
                    ray.experimental.state.TASK_STATUS_LOST, NIL_ID)
                if ok != b"OK":
                    log.warn("Failed to update lost task for dead scheduler.")
                num_tasks_updated += 1
        if num_tasks_updated > 0:
            log.warn("Marked {} tasks as lost.".format(num_tasks_updated))
Example #11
    def _task_table(self, task_id_binary):
        """Fetch and parse the task table information for a single object task ID.

        Args:
            task_id_binary: A string of bytes with the task ID to get
                information about.

        Returns:
            A dictionary with information about the task ID in question.
        """
        task_table_response = self.redis_client.execute_command(
            "RAY.TASK_TABLE_GET", task_id_binary)
        if task_table_response is None:
            raise Exception(
                "There is no entry for task ID {} in the task table.".format(
                    binary_to_hex(task_id_binary)))
        task_table_message = TaskReply.GetRootAsTaskReply(
            task_table_response, 0)
        task_spec = task_table_message.TaskSpec()
        task_spec_message = TaskInfo.GetRootAsTaskInfo(task_spec, 0)
        args = []
        for i in range(task_spec_message.ArgsLength()):
            arg = task_spec_message.Args(i)
            if len(arg.ObjectId()) != 0:
                args.append(binary_to_object_id(arg.ObjectId()))
            else:
                args.append(pickle.loads(arg.Data()))
        assert task_spec_message.RequiredResourcesLength() == 2
        required_resources = {
            "CPUs": task_spec_message.RequiredResources(0),
            "GPUs": task_spec_message.RequiredResources(1)
        }
        task_spec_info = {
            "DriverID": binary_to_hex(task_spec_message.DriverId()),
            "TaskID": binary_to_hex(task_spec_message.TaskId()),
            "ParentTaskID": binary_to_hex(task_spec_message.ParentTaskId()),
            "ParentCounter": task_spec_message.ParentCounter(),
            "ActorID": binary_to_hex(task_spec_message.ActorId()),
            "ActorCounter": task_spec_message.ActorCounter(),
            "FunctionID": binary_to_hex(task_spec_message.FunctionId()),
            "Args": args,
            "ReturnObjectIDs": [
                binary_to_object_id(task_spec_message.Returns(i))
                for i in range(task_spec_message.ReturnsLength())
            ],
            "RequiredResources": required_resources
        }

        return {
            "State": task_state_mapping[task_table_message.State()],
            "LocalSchedulerID": binary_to_hex(
                task_table_message.LocalSchedulerId()),
            "TaskSpec": task_spec_info
        }
Example #12
 def ToShardIndex(index):
     return binary_to_object_id(index).redis_shard_hash() % len(
         self.state.redis_clients)
Example #13
 def to_shard_index(id_bin):
     return binary_to_object_id(id_bin).redis_shard_hash() % len(
         self.state.redis_clients)