Example #1
0
    def cleanup_actors(self):
        """Recreate any live actors whose corresponding local scheduler died.

        For every actor that has not been removed and whose recorded local
        scheduler is in self.dead_local_schedulers, we choose a new local
        scheduler, broadcast a notification to create that actor there, and
        update the actor's "local_scheduler_id" field in Redis.

        Raises:
            AssertionError: If the newly chosen local scheduler is the same
                as the dead one.
        """
        actor_info = self.state.actors()
        for actor_id, info in actor_info.items():
            # Skip actors that were removed or whose scheduler is still alive.
            if (info["removed"] or info["local_scheduler_id"]
                    not in self.dead_local_schedulers):
                continue

            # Choose a new local scheduler to run the actor.
            local_scheduler_id = ray.utils.select_local_scheduler(
                info["driver_id"], self.state.local_schedulers(),
                info["num_gpus"], self.redis)
            # The new local scheduler should not be the same as the old
            # local scheduler. TODO(rkn): This should not be an assert, it
            # should be something more benign.
            assert (binary_to_hex(local_scheduler_id) !=
                    info["local_scheduler_id"])
            # Announce to all of the local schedulers that the actor should
            # be recreated on this new local scheduler.
            ray.utils.publish_actor_creation(
                hex_to_binary(actor_id), hex_to_binary(info["driver_id"]),
                local_scheduler_id, True, self.redis)
            log.info(
                "Actor {} for driver {} was on dead local scheduler "
                "{}. It is being recreated on local scheduler {}".format(
                    actor_id, info["driver_id"],
                    info["local_scheduler_id"],
                    binary_to_hex(local_scheduler_id)))
            # Update the actor info in Redis. The new ID is stored in its
            # binary form, exactly as returned by select_local_scheduler.
            self.redis.hset(b"Actor:" + hex_to_binary(actor_id),
                            "local_scheduler_id", local_scheduler_id)
Example #2
0
    def cleanup_task_table(self):
        """Clean up global state for failed local schedulers.

        This marks any tasks that were scheduled on dead local schedulers as
        TASK_STATUS_LOST. A local scheduler is deemed dead if it is in
        self.dead_local_schedulers. For actor tasks, the location entries of
        the task's dummy return object are also removed from the object
        table.
        """
        tasks = self.state.task_table()
        num_tasks_updated = 0
        for task_id, task in tasks.items():
            # See if the corresponding local scheduler is alive.
            if task["LocalSchedulerID"] not in self.dead_local_schedulers:
                continue

            # Remove dummy objects returned by actor tasks from any plasma
            # manager. Although the objects may still exist in that object
            # store, this deletion makes them effectively unreachable by any
            # local scheduler connected to a different store.
            # TODO(swang): Actually remove the objects from the object store,
            # so that the reconstructed actor can reuse the same object store.
            if hex_to_binary(task["TaskSpec"]["ActorID"]) != NIL_ACTOR_ID:
                # The dummy object is the last return object of the task.
                dummy_object_id = task["TaskSpec"]["ReturnObjectIDs"][-1]
                obj = self.state.object_table(dummy_object_id)
                manager_ids = obj["ManagerIDs"]
                if manager_ids is not None:
                    # The dummy object should exist on at most one plasma
                    # manager, the manager associated with the local scheduler
                    # that died.
                    assert len(manager_ids) <= 1
                    # Remove the dummy object from the plasma manager
                    # associated with the dead local scheduler, if any.
                    for manager in manager_ids:
                        ok = self.state._execute_command(
                            dummy_object_id, "RAY.OBJECT_TABLE_REMOVE",
                            dummy_object_id.id(), hex_to_binary(manager))
                        if ok != b"OK":
                            log.warning("Failed to remove object location "
                                        "for dead plasma manager.")

            # If the task is scheduled on a dead local scheduler, mark the
            # task as lost.
            key = binary_to_object_id(hex_to_binary(task_id))
            ok = self.state._execute_command(
                key, "RAY.TASK_TABLE_UPDATE",
                hex_to_binary(task_id),
                ray.experimental.state.TASK_STATUS_LOST, NIL_ID,
                task["ExecutionDependenciesString"],
                task["SpillbackCount"])
            if ok != b"OK":
                log.warning("Failed to update lost task for dead scheduler.")
            else:
                # Only count updates that succeeded, so the summary below
                # reports tasks that really were marked as lost.
                num_tasks_updated += 1

        if num_tasks_updated > 0:
            log.warning("Marked {} tasks as lost.".format(num_tasks_updated))
Example #3
0
    def cleanup_task_table(self):
        """Clean up global state for failed local schedulers.

        This marks any tasks that were scheduled on dead local schedulers as
        TASK_STATUS_LOST. A local scheduler is deemed dead if it is in
        self.dead_local_schedulers. For actor tasks, the location entries of
        the task's dummy return object are also removed from the object
        table.
        """
        tasks = self.state.task_table()
        num_tasks_updated = 0
        for task_id, task in tasks.items():
            # See if the corresponding local scheduler is alive.
            if task["LocalSchedulerID"] not in self.dead_local_schedulers:
                continue

            # Remove dummy objects returned by actor tasks from any plasma
            # manager. Although the objects may still exist in that object
            # store, this deletion makes them effectively unreachable by any
            # local scheduler connected to a different store.
            # TODO(swang): Actually remove the objects from the object store,
            # so that the reconstructed actor can reuse the same object store.
            if hex_to_binary(task["TaskSpec"]["ActorID"]) != NIL_ACTOR_ID:
                # The dummy object is the last return object of the task.
                dummy_object_id = task["TaskSpec"]["ReturnObjectIDs"][-1]
                obj = self.state.object_table(dummy_object_id)
                manager_ids = obj["ManagerIDs"]
                if manager_ids is not None:
                    # The dummy object should exist on at most one plasma
                    # manager, the manager associated with the local scheduler
                    # that died.
                    assert len(manager_ids) <= 1
                    # Remove the dummy object from the plasma manager
                    # associated with the dead local scheduler, if any.
                    for manager in manager_ids:
                        ok = self.state._execute_command(
                            dummy_object_id, "RAY.OBJECT_TABLE_REMOVE",
                            dummy_object_id.id(), hex_to_binary(manager))
                        if ok != b"OK":
                            log.warning("Failed to remove object location "
                                        "for dead plasma manager.")

            # If the task is scheduled on a dead local scheduler, mark the
            # task as lost.
            key = binary_to_object_id(hex_to_binary(task_id))
            ok = self.state._execute_command(
                key, "RAY.TASK_TABLE_UPDATE",
                hex_to_binary(task_id),
                ray.experimental.state.TASK_STATUS_LOST, NIL_ID,
                task["ExecutionDependenciesString"],
                task["SpillbackCount"])
            if ok != b"OK":
                log.warning("Failed to update lost task for dead scheduler.")
            else:
                # Only count updates that succeeded, so the summary below
                # reports tasks that really were marked as lost.
                num_tasks_updated += 1

        if num_tasks_updated > 0:
            log.warning("Marked {} tasks as lost.".format(num_tasks_updated))
Example #4
0
    def _object_table(self, object_id):
        """Look up a single object's entry in the object table.

        Args:
            object_id: Either a ray.ObjectID or a hex string identifying the
                object.

        Returns:
            A dictionary with the keys "DataSize" and "Manager", read from
            the first entry of the lookup result, or an empty dictionary if
            the lookup returned nothing.
        """
        # Hex strings are converted so that the rest of the method can rely
        # on having an ObjectID.
        if isinstance(object_id, ray.ObjectID):
            oid = object_id
        else:
            oid = ray.ObjectID(hex_to_binary(object_id))

        reply = self._execute_command(
            oid, "RAY.TABLE_LOOKUP", ray.gcs_utils.TablePrefix.OBJECT, "",
            oid.binary())
        if reply is None:
            return {}

        table_entry = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
            reply, 0)
        assert table_entry.EntriesLength() > 0

        # Only the first entry is inspected.
        first = ray.gcs_utils.ObjectTableData.GetRootAsObjectTableData(
            table_entry.Entries(0), 0)
        return {
            "DataSize": first.ObjectSize(),
            "Manager": first.Manager(),
        }
Example #5
0
    def task_table(self, task_id=None):
        """Return task table information for one task or for every task.

        Args:
            task_id: Hex string of the task ID to look up. When None, the
                whole task table is returned.

        Returns:
            The result of self._task_table for the given task or, when
            task_id is None, a dictionary mapping each task ID (hex string)
            to its self._task_table result.
        """
        self._check_connected()

        # Single-task lookup.
        if task_id is not None:
            return self._task_table(ray.TaskID(hex_to_binary(task_id)))

        # Full table: the binary task IDs are recovered by stripping the
        # table prefix from every matching key.
        prefix = ray.gcs_utils.TablePrefix_RAYLET_TASK_string
        prefix_len = len(prefix)
        id_binaries = [key[prefix_len:] for key in self._keys(prefix + "*")]

        table = {}
        for id_binary in id_binaries:
            table[binary_to_hex(id_binary)] = self._task_table(
                ray.TaskID(id_binary))
        return table
Example #6
0
    def _object_table(self, object_id):
        """Look up a single object's entry in the object table.

        Args:
            object_id: Either a ray.ObjectID or a hex string identifying the
                object.

        Returns:
            A dictionary with the keys "DataSize" and "Manager", read from
            the first entry of the lookup result, or an empty dictionary if
            the lookup returned nothing.
        """
        # Hex strings are converted so that the rest of the method can rely
        # on having an ObjectID.
        if isinstance(object_id, ray.ObjectID):
            oid = object_id
        else:
            oid = ray.ObjectID(hex_to_binary(object_id))

        reply = self._execute_command(
            oid, "RAY.TABLE_LOOKUP", ray.gcs_utils.TablePrefix.OBJECT, "",
            oid.binary())
        if reply is None:
            return {}

        table_entry = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
            reply, 0)
        assert table_entry.EntriesLength() > 0

        # Only the first entry is inspected.
        first = ray.gcs_utils.ObjectTableData.GetRootAsObjectTableData(
            table_entry.Entries(0), 0)
        return {
            "DataSize": first.ObjectSize(),
            "Manager": first.Manager(),
        }
Example #7
0
    def _xray_clean_up_entries_for_driver(self, driver_id):
        """Delete the redis task and object entries that belong to a driver.

        The task table is scanned for tasks whose spec names this driver,
        and the object table for objects whose computed task ID is one of
        those tasks. The corresponding keys are grouped by redis shard and
        deleted shard by shard.

        Args:
            driver_id: The driver id, in the form accepted by binary_to_hex.
        """
        task_key_prefix = (
            ray.gcs_utils.TablePrefix_RAYLET_TASK_string.encode("ascii"))
        object_key_prefix = (
            ray.gcs_utils.TablePrefix_OBJECT_string.encode("ascii"))

        # Binary IDs of every task whose spec names this driver.
        all_tasks = self.state.task_table()
        driver_id_hex = binary_to_hex(driver_id)
        task_id_bins = {
            hex_to_binary(task_id_hex)
            for task_id_hex, task_info in all_tasks.items()
            if driver_id_hex == task_info["TaskSpec"]["DriverID"]
        }

        # An object belongs to the driver when the task ID computed from it
        # is one of the driver's tasks.
        object_id_bins = {
            object_id.id()
            for object_id in self.state.object_table()
            if ray.raylet.compute_task_id(object_id).id() in task_id_bins
        }

        # Group the redis keys to delete by the shard that holds them; task
        # keys are queued before object keys.
        num_shards = len(self.state.redis_clients)
        sharded_keys = [[] for _ in range(num_shards)]
        for prefix, id_bins in ((task_key_prefix, task_id_bins),
                                (object_key_prefix, object_id_bins)):
            for id_bin in id_bins:
                shard_hash = binary_to_object_id(id_bin).redis_shard_hash()
                shard = shard_hash % len(self.state.redis_clients)
                sharded_keys[shard].append(prefix + id_bin)

        # Best effort: a shortfall is logged, not raised.
        for shard_index, keys in enumerate(sharded_keys):
            if not keys:
                continue
            shard_client = self.state.redis_clients[shard_index]
            num_deleted = shard_client.delete(*keys)
            logger.info("Removed {} dead redis entries of the driver from"
                        " redis shard {}.".format(num_deleted, shard_index))
            if num_deleted != len(keys):
                logger.warning("Failed to remove {} relevant redis entries"
                               " from redis shard {}.".format(
                                   len(keys) - num_deleted, shard_index))
Example #8
0
    def cleanup_object_table(self):
        """Clean up global state for failed plasma managers.

        This removes dead plasma managers from any location entries in the
        object table. A plasma manager is deemed dead if it is in
        self.dead_plasma_managers. Failed removals are logged and are not
        included in the final count.
        """
        # TODO(swang): Also kill the associated plasma store, since it's no
        # longer reachable without a plasma manager.
        objects = self.state.object_table()
        num_objects_removed = 0
        for object_id, obj in objects.items():
            manager_ids = obj["ManagerIDs"]
            if manager_ids is None:
                continue
            for manager in manager_ids:
                if manager not in self.dead_plasma_managers:
                    continue
                # The object was on a dead plasma manager, so remove that
                # location entry.
                ok = self.state._execute_command(
                    object_id, "RAY.OBJECT_TABLE_REMOVE", object_id.id(),
                    hex_to_binary(manager))
                if ok != b"OK":
                    log.warning("Failed to remove object location for dead "
                                "plasma manager.")
                else:
                    # Only count removals that succeeded, so the summary
                    # below is accurate.
                    num_objects_removed += 1
        if num_objects_removed > 0:
            log.warning(
                "Marked {} objects as lost.".format(num_objects_removed))
Example #9
0
    def cleanup_object_table(self):
        """Clean up global state for failed plasma managers.

        This removes dead plasma managers from any location entries in the
        object table. A plasma manager is deemed dead if it is in
        self.dead_plasma_managers. Failed removals are logged and are not
        included in the final count.
        """
        # TODO(swang): Also kill the associated plasma store, since it's no
        # longer reachable without a plasma manager.
        objects = self.state.object_table()
        num_objects_removed = 0
        for object_id, obj in objects.items():
            manager_ids = obj["ManagerIDs"]
            if manager_ids is None:
                continue
            for manager in manager_ids:
                if manager not in self.dead_plasma_managers:
                    continue
                # The object was on a dead plasma manager, so remove that
                # location entry.
                ok = self.state._execute_command(
                    object_id, "RAY.OBJECT_TABLE_REMOVE", object_id.id(),
                    hex_to_binary(manager))
                if ok != b"OK":
                    log.warning("Failed to remove object location for dead "
                                "plasma manager.")
                else:
                    # Only count removals that succeeded, so the summary
                    # below is accurate.
                    num_objects_removed += 1
        if num_objects_removed > 0:
            log.warning(
                "Marked {} objects as lost.".format(num_objects_removed))
Example #10
0
    def actor_table(self, actor_id=None):
        """Fetch and parse the actor table information for one or all actors.

        Args:
            actor_id: A hex string of the actor ID to fetch information about.
                If this is None (the default), then the whole actor table is
                fetched.

        Returns:
            For a single actor ID, the result of self._gen_actor_info for
            that actor, or an empty dictionary if the accessor has no
            information about it. When actor_id is None, a dictionary
            mapping each actor ID (hex string) to its self._gen_actor_info
            result.
        """
        self._check_connected()

        if actor_id is not None:
            actor_id = ray.ActorID(hex_to_binary(actor_id))
            actor_info = self.global_state_accessor.get_actor_info(actor_id)
            if actor_info is None:
                return {}
            actor_table_data = gcs_utils.ActorTableData.FromString(actor_info)
            return self._gen_actor_info(actor_table_data)

        # No ID given: parse every serialized entry of the actor table.
        results = {}
        for serialized in self.global_state_accessor.get_actor_table():
            actor_table_data = gcs_utils.ActorTableData.FromString(serialized)
            results[binary_to_hex(actor_table_data.actor_id)] = (
                self._gen_actor_info(actor_table_data))
        return results
Example #11
0
    def placement_group_table(self, placement_group_id=None):
        """Return information about one placement group or about all of them.

        Args:
            placement_group_id: An ID object exposing ``hex()``. When None,
                the entire placement group table is returned.

        Returns:
            For a single ID, the result of self._gen_placement_group_info,
            or an empty dictionary if the accessor has no information about
            it. When placement_group_id is None, a dictionary mapping each
            placement group ID (hex string) to its
            self._gen_placement_group_info result.
        """
        self._check_connected()
        accessor = self.global_state_accessor

        # Whole-table request.
        if placement_group_id is None:
            table = {}
            for serialized in accessor.get_placement_group_table():
                table_data = gcs_utils.PlacementGroupTableData.FromString(
                    serialized)
                group_id_hex = binary_to_hex(table_data.placement_group_id)
                table[group_id_hex] = self._gen_placement_group_info(
                    table_data)
            return table

        # Single-group request: the ID is rebuilt from its hex form.
        group_id = ray.PlacementGroupID(
            hex_to_binary(placement_group_id.hex()))
        serialized = accessor.get_placement_group_info(group_id)
        if serialized is None:
            return {}
        table_data = gcs_utils.PlacementGroupTableData.FromString(serialized)
        return self._gen_placement_group_info(table_data)
Example #12
0
    def task_table(self, task_id=None):
        """Return task table information for one task or for every task.

        Args:
            task_id: Hex string of the task ID to look up. When None, the
                whole task table is returned.

        Returns:
            The result of self._task_table for the given task or, when
            task_id is None, a dictionary mapping each task ID (hex string)
            to its self._task_table result.
        """
        self._check_connected()

        # Single-task lookup.
        if task_id is not None:
            return self._task_table(ray.TaskID(hex_to_binary(task_id)))

        # Full table: the binary task IDs are recovered by stripping the
        # table prefix from every matching key.
        prefix = gcs_utils.TablePrefix_RAYLET_TASK_string
        prefix_len = len(prefix)
        id_binaries = [key[prefix_len:] for key in self._keys(prefix + "*")]

        table = {}
        for id_binary in id_binaries:
            table[binary_to_hex(id_binary)] = self._task_table(
                ray.TaskID(id_binary))
        return table
Example #13
0
    def _xray_clean_up_entries_for_driver(self, driver_id):
        """Delete the redis task and object entries that belong to a driver.

        The task table is scanned for tasks whose spec names this driver,
        and the object table for objects whose computed task ID is one of
        those tasks. The corresponding keys are grouped by redis shard and
        deleted shard by shard.

        Args:
            driver_id: The driver id, in the form accepted by binary_to_hex.
        """
        task_key_prefix = (
            ray.gcs_utils.TablePrefix_RAYLET_TASK_string.encode("ascii"))
        object_key_prefix = (
            ray.gcs_utils.TablePrefix_OBJECT_string.encode("ascii"))

        # Binary IDs of every task whose spec names this driver.
        all_tasks = self.state.task_table()
        driver_id_hex = binary_to_hex(driver_id)
        task_id_bins = {
            hex_to_binary(task_id_hex)
            for task_id_hex, task_info in all_tasks.items()
            if driver_id_hex == task_info["TaskSpec"]["DriverID"]
        }

        # An object belongs to the driver when the task ID computed from it
        # is one of the driver's tasks.
        object_id_bins = {
            object_id.id()
            for object_id in self.state.object_table()
            if ray.raylet.compute_task_id(object_id).id() in task_id_bins
        }

        # Group the redis keys to delete by the shard that holds them; task
        # keys are queued before object keys.
        num_shards = len(self.state.redis_clients)
        sharded_keys = [[] for _ in range(num_shards)]
        for prefix, id_bins in ((task_key_prefix, task_id_bins),
                                (object_key_prefix, object_id_bins)):
            for id_bin in id_bins:
                shard_hash = binary_to_object_id(id_bin).redis_shard_hash()
                shard = shard_hash % len(self.state.redis_clients)
                sharded_keys[shard].append(prefix + id_bin)

        # Best effort: a shortfall is logged, not raised.
        for shard_index, keys in enumerate(sharded_keys):
            if not keys:
                continue
            shard_client = self.state.redis_clients[shard_index]
            num_deleted = shard_client.delete(*keys)
            logger.info("Removed {} dead redis entries of the driver from"
                        " redis shard {}.".format(num_deleted, shard_index))
            if num_deleted != len(keys):
                logger.warning("Failed to remove {} relevant redis entries"
                               " from redis shard {}.".format(
                                   len(keys) - num_deleted, shard_index))
Example #14
0
    def object_table(self, object_ref=None):
        """Return object table information for one object ref or for all.

        Args:
            object_ref: Hex form of the object ref to look up (it is passed
                through hex_to_binary). When None, the entire object table
                is returned.

        Returns:
            For a single ref, the result of self._gen_object_info, or an
            empty dictionary if the accessor has no information about it.
            When object_ref is None, a dictionary mapping each object ID
            (hex string) to its self._gen_object_info result.
        """
        self._check_connected()
        accessor = self.global_state_accessor

        # Whole-table request.
        if object_ref is None:
            table = {}
            for serialized in accessor.get_object_table():
                location_info = gcs_utils.ObjectLocationInfo.FromString(
                    serialized)
                table[binary_to_hex(location_info.object_id)] = (
                    self._gen_object_info(location_info))
            return table

        # Single-object request.
        ref = ray.ObjectRef(hex_to_binary(object_ref))
        serialized = accessor.get_object_info(ref)
        if serialized is None:
            return {}
        location_info = gcs_utils.ObjectLocationInfo.FromString(serialized)
        return self._gen_object_info(location_info)
Example #15
0
    def _object_table(self, object_id):
        """Look up a single object's entry in the object table.

        Args:
            object_id: Either a ray.ObjectID or a hex string identifying the
                object.

        Returns:
            A dictionary with the keys "DataSize" and "Manager", read from
            the first entry of the lookup result, or an empty dictionary if
            the lookup returned nothing.
        """
        # Hex strings are converted so that the rest of the method can rely
        # on having an ObjectID.
        if isinstance(object_id, ray.ObjectID):
            oid = object_id
        else:
            oid = ray.ObjectID(hex_to_binary(object_id))

        reply = self._execute_command(
            oid, "RAY.TABLE_LOOKUP", gcs_utils.TablePrefix.Value("OBJECT"),
            "", oid.binary())
        if reply is None:
            return {}

        parsed_reply = gcs_utils.GcsEntry.FromString(reply)
        assert len(parsed_reply.entries) > 0

        # Only the first entry is inspected.
        first = gcs_utils.ObjectTableData.FromString(parsed_reply.entries[0])
        return {
            "DataSize": first.object_size,
            "Manager": first.manager,
        }
Example #16
0
    def actor_table(self, actor_id=None):
        """Return actor table information for one actor or for every actor.

        Args:
            actor_id: Hex string of the actor ID to look up. When None, the
                whole actor table is returned.

        Returns:
            The result of self._actor_table for the given actor or, when
            actor_id is None, a dictionary mapping each actor ID (hex
            string) to its self._actor_table result.
        """
        self._check_connected()

        # Single-actor lookup.
        if actor_id is not None:
            return self._actor_table(ray.ActorID(hex_to_binary(actor_id)))

        # Full table: collect all matching keys up front, then strip the
        # table prefix to recover each binary actor ID.
        prefix = gcs_utils.TablePrefix_ACTOR_string
        matching_keys = list(self.redis_client.scan_iter(match=prefix + "*"))
        prefix_len = len(prefix)
        id_binaries = [key[prefix_len:] for key in matching_keys]

        table = {}
        for id_binary in id_binaries:
            table[binary_to_hex(id_binary)] = self._actor_table(
                ray.ActorID(id_binary))
        return table
Example #17
0
def select_local_scheduler(local_schedulers, num_gpus, worker):
    """Pick a local scheduler that can host a new actor.

    Candidates are visited in a random order. A candidate is skipped if it
    reports fewer than one CPU or fewer GPUs than requested. When GPUs are
    requested, the candidate is only chosen if the GPU reservation succeeds.

    Args:
        local_schedulers: A list of dictionaries of information about the
            local schedulers. The "NumCPUs", "NumGPUs" and "DBClientID"
            entries are read.
        num_gpus (int): The number of GPUs that must be reserved for this
            actor.
        worker: Object whose task_driver_id provides the driver ID; it is
            also passed on to attempt_to_reserve_gpus.

    Returns:
        The ID of the local scheduler that has been chosen.

    Raises:
        Exception: An exception is raised if no local scheduler can be found
            with sufficient resources.
    """
    driver_id = worker.task_driver_id.id()

    # Visit the candidates in a random order.
    local_schedulers = np.random.permutation(local_schedulers)
    chosen_id = None
    for candidate in local_schedulers:
        if candidate["NumCPUs"] < 1 or candidate["NumGPUs"] < num_gpus:
            continue
        # A reservation is only attempted when GPUs were requested.
        if num_gpus == 0 or attempt_to_reserve_gpus(
                num_gpus, driver_id, candidate, worker):
            chosen_id = hex_to_binary(candidate["DBClientID"])
            break

    if chosen_id is None:
        raise Exception("Could not find a node with enough GPUs or other "
                        "resources to create this actor. The local scheduler "
                        "information is {}.".format(local_schedulers))

    return chosen_id
Example #18
0
def select_local_scheduler(local_schedulers, num_gpus, worker):
  """Pick a local scheduler for a new actor, reserving GPUs if required.

  Args:
    local_schedulers: A list of dictionaries of information about the local
      schedulers. The "DBClientID" entry of the chosen one is read.
    num_gpus (int): The number of GPUs that must be reserved for this actor.
    worker: Object whose task_driver_id provides the driver ID; it is also
      passed on to attempt_to_reserve_gpus.

  Returns:
    A tuple of the ID of the local scheduler that has been chosen and a list of
      the gpu_ids that are reserved for the actor.

  Raises:
    Exception: An exception is raised if no local scheduler can be found with
      sufficient resources.
  """
  driver_id = worker.task_driver_id.id()

  # Without a GPU requirement, a local scheduler is picked at random and no
  # GPUs are reserved.
  if num_gpus == 0:
    picked = random.choice(local_schedulers)
    return hex_to_binary(picked["DBClientID"]), []

  # Otherwise take the first local scheduler on which the full reservation
  # succeeds.
  local_scheduler_id = None
  for candidate in local_schedulers:
    gpus_acquired = attempt_to_reserve_gpus(num_gpus, driver_id, candidate,
                                            worker)
    if len(gpus_acquired) != num_gpus:
      # We should have either acquired as many GPUs as we need or none.
      assert len(gpus_acquired) == 0
      continue
    local_scheduler_id = hex_to_binary(candidate["DBClientID"])
    break

  if local_scheduler_id is None:
    raise Exception("Could not find a node with enough GPUs to create this "
                    "actor. The local scheduler information is {}."
                    .format(local_schedulers))
  return local_scheduler_id, gpus_acquired
Example #19
0
    def __setstate__(self, state):
        """Restore the trial from a serialized state dictionary.

        "resources" is converted with json_to_resources, a RUNNING status is
        reset to PENDING, and every key in self._nonjson_fields is decoded
        from its hex-encoded pickle. The state dictionary is modified in
        place before it is merged into the instance.

        Args:
            state: The state dictionary to restore from.
        """
        state["resources"] = json_to_resources(state["resources"])

        # A trial saved as RUNNING is restored as PENDING.
        if state["status"] == Trial.RUNNING:
            state["status"] = Trial.PENDING

        # NOTE: cloudpickle.loads can execute arbitrary code, so the state
        # must come from a trusted source.
        for field in self._nonjson_fields:
            encoded = state[field]
            state[field] = cloudpickle.loads(hex_to_binary(encoded))

        self.__dict__.update(state)
        validate_trainable(self.trainable_name)
        # Create logdir if it does not exist.
        self.init_logdir()
Example #20
0
    def _check():
        """Assert that the job summary and job detail endpoints look right.

        Queries ``{webui_url}/jobs?view=summary``, expects exactly one job,
        then queries that job's detail page and checks the job info, its
        single actor and its single worker for the expected keys and values.
        """
        # Summary view: exactly one job with the expected fields.
        summary_resp = requests.get(f"{webui_url}/jobs?view=summary")
        summary_resp.raise_for_status()
        summary_body = summary_resp.json()
        assert summary_body["result"] is True, summary_resp.text
        job_summary = summary_body["data"]["summary"]
        assert len(job_summary) == 1, summary_resp.text
        one_job = job_summary[0]
        assert "jobId" in one_job
        job_id = one_job["jobId"]
        # The reported ID must be usable to build a JobID.
        assert ray._raylet.JobID(hex_to_binary(job_id))
        assert "driverIpAddress" in one_job
        assert one_job["driverIpAddress"] == ip
        assert "driverPid" in one_job
        assert one_job["driverPid"] == str(os.getpid())
        assert "config" in one_job
        assert type(one_job["config"]) is dict
        assert "isDead" in one_job
        assert one_job["isDead"] is False
        assert "timestamp" in one_job
        summary_keys = one_job.keys()

        # Detail view of that job.
        detail_resp = requests.get(f"{webui_url}/jobs/{job_id}")
        detail_resp.raise_for_status()
        detail_body = detail_resp.json()
        assert detail_body["result"] is True, detail_resp.text
        job_detail = detail_body["data"]["detail"]
        assert "jobInfo" in job_detail
        # Every key of the summary must also be present in the job info.
        assert len(summary_keys - job_detail["jobInfo"].keys()) == 0

        # Exactly one actor, looked up by the enclosing scope's actor_id.
        assert "jobActors" in job_detail
        job_actors = job_detail["jobActors"]
        assert len(job_actors) == 1, detail_resp.text
        one_job_actor = job_actors[actor_id]
        assert "taskSpec" in one_job_actor
        task_spec = one_job_actor["taskSpec"]
        assert type(task_spec) is dict
        assert "functionDescriptor" in task_spec
        assert type(task_spec["functionDescriptor"]) is dict
        assert "pid" in one_job_actor
        assert one_job_actor["pid"] == actor_pid
        expected_actor_keys = (
            "name", "timestamp", "address", "actorId", "jobId", "state",
        )
        for actor_key in expected_actor_keys:
            assert actor_key in one_job_actor

        # Exactly one worker.
        assert "jobWorkers" in job_detail
        job_workers = job_detail["jobWorkers"]
        assert len(job_workers) == 1, detail_resp.text
        one_job_worker = job_workers[0]
        expected_worker_keys = (
            "cmdline", "pid", "cpuTimes", "memoryInfo", "cpuPercent",
            "coreWorkerStats", "language", "jobId",
        )
        for worker_key in expected_worker_keys:
            assert worker_key in one_job_worker
Example #21
0
    def __setstate__(self, state):
        """Restore the trial from a serialized state dictionary.

        The "__logger_started__" entry is removed from state, "resources" is
        converted with json_to_resources, and the hex-encoded pickled fields
        are decoded before the state is merged into the instance. The logger
        is initialized again if it had been started.

        Args:
            state: The state dictionary to restore from; modified in place.
        """
        was_logging = state.pop("__logger_started__")
        state["resources"] = json_to_resources(state["resources"])

        # NOTE: cloudpickle.loads can execute arbitrary code, so the state
        # must come from a trusted source.
        pickled_fields = (
            "_checkpoint", "config", "custom_loggers", "sync_function",
        )
        for field in pickled_fields:
            encoded = state[field]
            state[field] = cloudpickle.loads(hex_to_binary(encoded))

        self.__dict__.update(state)
        Trial._registration_check(self.trainable_name)
        if was_logging:
            self.init_logger()
Example #22
0
    def __setstate__(self, state):
        """Restore the trial from a serialized state dictionary.

        The "__logger_started__" entry is removed from state, "resources" is
        converted with json_to_resources, a RUNNING status is reset to
        PENDING, and every key in self._nonjson_fields is decoded from its
        hex-encoded pickle. The logger is initialized again if it had been
        started.

        Args:
            state: The state dictionary to restore from; modified in place.
        """
        was_logging = state.pop("__logger_started__")
        state["resources"] = json_to_resources(state["resources"])

        # A trial saved as RUNNING is restored as PENDING.
        if state["status"] == Trial.RUNNING:
            state["status"] = Trial.PENDING

        # NOTE: cloudpickle.loads can execute arbitrary code, so the state
        # must come from a trusted source.
        for field in self._nonjson_fields:
            encoded = state[field]
            state[field] = cloudpickle.loads(hex_to_binary(encoded))

        self.__dict__.update(state)
        validate_trainable(self.trainable_name)
        if was_logging:
            self.init_logger()
Example #23
0
    def __setstate__(self, state):
        """Restore the trial from a serialized state dictionary.

        The "__logger_started__" entry is removed from state, "resources" is
        converted with json_to_resources, a RUNNING status is reset to
        PENDING, and every key in self._nonjson_fields is decoded from its
        hex-encoded pickle. The logger is initialized again if it had been
        started.

        Args:
            state: The state dictionary to restore from; modified in place.
        """
        was_logging = state.pop("__logger_started__")
        state["resources"] = json_to_resources(state["resources"])

        # A trial saved as RUNNING is restored as PENDING.
        if state["status"] == Trial.RUNNING:
            state["status"] = Trial.PENDING

        # NOTE: cloudpickle.loads can execute arbitrary code, so the state
        # must come from a trusted source.
        for field in self._nonjson_fields:
            encoded = state[field]
            state[field] = cloudpickle.loads(hex_to_binary(encoded))

        self.__dict__.update(state)
        Trial._registration_check(self.trainable_name)
        if was_logging:
            self.init_logger()
Example #24
0
    def __setstate__(self, state):
        """Restore the trial from a serialized state dictionary.

        The "__logger_started__" entry is removed from state, "resources" is
        converted with json_to_resources, and the hex-encoded pickled fields
        are decoded before the state is merged into the instance. The logger
        is initialized again if it had been started.

        Args:
            state: The state dictionary to restore from; modified in place.
        """
        was_logging = state.pop("__logger_started__")
        state["resources"] = json_to_resources(state["resources"])

        # NOTE: cloudpickle.loads can execute arbitrary code, so the state
        # must come from a trusted source.
        pickled_fields = (
            "_checkpoint", "config", "custom_loggers", "sync_function",
            "last_result",
        )
        for field in pickled_fields:
            encoded = state[field]
            state[field] = cloudpickle.loads(hex_to_binary(encoded))

        self.__dict__.update(state)
        Trial._registration_check(self.trainable_name)
        if was_logging:
            self.init_logger()
Example #25
0
def index():
    """Handle plasma object uploads (POST) and downloads (GET) over HTTP.

    POST: reads the ``object_id`` and ``value`` uploads from
    ``request.files``, copies the value into a plasma buffer created for
    that object ID, seals the object and returns the raw object ID.

    GET with an ``object_ids`` query argument: parses the comma-separated
    hex object IDs and sends back the buffer of the first one as
    ``application/octet-stream``.

    TODO (dsuo): Maybe care about batching
    TODO (dsuo): REST is nice conceptually, but probably too much overhead
    TODO (dsuo): Move logic to C++ (keep as head node process)

    NOTE: We do an extra serialize during get and extra deserialize during
        put.
    """
    if request.method == 'POST':
        # The object ID arrives as raw bytes in an uploaded file part.
        raw_object_id = request.files['object_id'].read()
        object_id = ray.pyarrow.plasma.ObjectID(raw_object_id)

        # NOTE (dsuo): we should use readinto in the future if possible.
        # Otherwise, we create a throwaway buffer when we read the whole
        # stream of data. Might look something like this:
        #
        #   request.files['value'].readinto(buf)
        #
        # Unfortunately, SpooledTemporaryFile request.files['value']
        # doesn't implement. See here: https://bugs.python.org/issue32600.
        data = request.files['value'].read()

        # Get a memoryview buffer of type unsigned bytes
        buf = memoryview(plasma_client.create(object_id, len(data))).cast("B")

        # Copy data into plasma buffer
        buf[:] = data

        plasma_client.seal(object_id)
        # NOTE(review): 402 is HTTP "Payment Required"; presumably a 2xx
        # status was intended here -- confirm against the client code.
        return raw_object_id, 402

    elif request.method == 'GET' and 'object_ids' in request.args:
        # The query argument is a comma-separated list of hex object IDs.
        object_ids = [
            ray.pyarrow.plasma.ObjectID(hex_to_binary(object_id))
            for object_id in request.args['object_ids'].split(",")
        ]

        # Fetch remote objects
        # TODO (dsuo): maybe care about batching
        # NOTE: this is a really flaky test for "simple value"
        # NOTE: we don't support retrieving multiple objectIDs at a time
        # Only the first requested buffer is used; the rest are ignored.
        data = plasma_client.get_buffers(object_ids)[0]

        # Return an appropriate return code?
        return send_file(io.BytesIO(data.to_pybytes()),
                         mimetype="application/octet-stream")
    else:
        # Any other request gets a literal string response.
        return '''
Example #26
0
    def cleanup_task_table(self):
        """Mark tasks that were scheduled on dead local schedulers as lost.

        Every task table entry whose "LocalSchedulerID" is contained in
        self.dead_local_schedulers is updated to TASK_STATUS_LOST. A warning
        is logged for each update that does not return b"OK", and a summary
        warning is logged when at least one task was processed.
        """
        lost_count = 0
        for hex_task_id, entry in self.state.task_table().items():
            if entry["LocalSchedulerID"] not in self.dead_local_schedulers:
                # The task's local scheduler is still alive.
                continue

            binary_task_id = hex_to_binary(hex_task_id)
            reply = self.state._execute_command(
                binary_to_object_id(binary_task_id),
                "RAY.TASK_TABLE_UPDATE",
                binary_task_id,
                ray.experimental.state.TASK_STATUS_LOST,
                NIL_ID)
            if reply != b"OK":
                log.warn("Failed to update lost task for dead scheduler.")
            # The task is counted whether or not the update succeeded.
            lost_count += 1

        if lost_count > 0:
            log.warn("Marked {} tasks as lost.".format(lost_count))
Example #27
0
    def cleanup_actors(self):
        """Recreate live actors whose local scheduler has died.

        For each actor that is not marked "removed" and whose
        "local_scheduler_id" is in self.dead_local_schedulers, a new local
        scheduler is selected, the actor creation is published for it, and
        the actor's "local_scheduler_id" entry in Redis is overwritten.
        """
        for actor_id, info in self.state.actors().items():
            # Removed actors are never recreated.
            if info["removed"]:
                continue
            old_scheduler_hex = info["local_scheduler_id"]
            if old_scheduler_hex not in self.dead_local_schedulers:
                continue

            # Pick the local scheduler that will host the actor from now on.
            driver_hex = info["driver_id"]
            new_scheduler_id = ray.utils.select_local_scheduler(
                driver_hex, self.state.local_schedulers(), info["num_gpus"],
                self.redis)
            import sys
            sys.stdout.flush()

            # The replacement must differ from the dead scheduler.
            # TODO(rkn): This should not be an assert, it should be
            # something more benign.
            new_scheduler_hex = binary_to_hex(new_scheduler_id)
            assert new_scheduler_hex != old_scheduler_hex

            # Tell all local schedulers to recreate the actor on the new one.
            actor_binary = hex_to_binary(actor_id)
            ray.utils.publish_actor_creation(
                actor_binary, hex_to_binary(driver_hex), new_scheduler_id,
                True, self.redis)

            template = (
                "Actor {} for driver {} was on dead local scheduler "
                "{}. It is being recreated on local scheduler {}")
            log.info(template.format(actor_id, driver_hex,
                                     old_scheduler_hex, new_scheduler_hex))

            # Record the new placement in Redis.
            self.redis.hset(b"Actor:" + actor_binary,
                            "local_scheduler_id", new_scheduler_id)
Example #28
0
    def __setstate__(self, state):
        """Restore the instance from a saved state dict.

        Args:
            state: Saved attribute dict; mutated in place. Must contain
                "resources", "status" and every key listed in
                ``self._nonjson_fields`` (stored hex-encoded).
        """
        state["resources"] = json_to_resources(state["resources"])

        # A trial saved while RUNNING comes back as PENDING.
        if state["status"] == Trial.RUNNING:
            state["status"] = Trial.PENDING

        for field in self._nonjson_fields:
            payload = hex_to_binary(state[field])
            state[field] = cloudpickle.loads(payload)

        self.__dict__.update(state)
        validate_trainable(self.trainable_name)

        # Avoid creating logdir in client mode for returned trial results,
        # since the dir might not be creatable locally. TODO(ekl) this is
        # kind of a hack.
        if ray.util.client.ray.is_connected():
            return
        self.init_logdir()  # Create logdir if it does not exist
Example #29
0
    def placement_group_table(self, placement_group_id=None):
        """Fetch and parse the table information of one placement group.

        Args:
            placement_group_id: ID object of the placement group to look
                up; its ``hex()`` value is used to build the lookup key.

        Returns:
            The result of ``_gen_placement_group_info`` for the group, or
            an empty dict when the accessor has no data for it.

        Raises:
            NotImplementedError: If ``placement_group_id`` is None.
        """
        self._check_connected()

        if placement_group_id is None:
            raise NotImplementedError(
                "Get all placement group is not implemented yet.")

        lookup_id = ray.PlacementGroupID(
            hex_to_binary(placement_group_id.hex()))
        serialized = self.global_state_accessor.get_placement_group_info(
            lookup_id)
        if serialized is None:
            return {}

        table_data = gcs_utils.PlacementGroupTableData.FromString(serialized)
        return self._gen_placement_group_info(table_data)
Example #30
0
def get_placement_group(placement_group_name: str):
    """Get a placement group object with a global name.

    Args:
        placement_group_name: Global name of the placement group. Must be
            a non-empty string.

    Returns:
        The PlacementGroup registered under the given name.

    Raises:
        ValueError: If the name is empty, or if no placement group with
            that name can be found.
    """
    if not placement_group_name:
        raise ValueError(
            "Please supply a non-empty value to get_placement_group")
    worker = ray.worker.global_worker
    worker.check_connected()
    placement_group_info = ray.state.state.get_placement_group_by_name(
        placement_group_name)
    if placement_group_info is None:
        # The lookup is for a placement group, so the message names it as
        # one (it used to say "actor").
        raise ValueError(
            "Failed to look up placement group with name: "
            f"{placement_group_name}")
    return PlacementGroup(
        PlacementGroupID(
            hex_to_binary(placement_group_info["placement_group_id"])))
Example #31
0
    def _job_table(self, job_id):
        """Fetch and parse the job table information for a single job ID.

        Args:
            job_id: A job ID or hex string to get information about.

        Returns:
            A dictionary with information about the job ID in question. It
            is empty when the lookup returns no message; otherwise it holds
            "JobID", "NodeManagerAddress" and "DriverPid", plus "StartTime"
            and/or "StopTime" depending on which entries were found.
        """
        # Allow the argument to be either a JobID or a hex string.
        if not isinstance(job_id, ray.JobID):
            assert isinstance(job_id, str)
            job_id = ray.JobID(hex_to_binary(job_id))

        # These do not change per entry, so compute them once.
        job_id_binary = job_id.binary()
        job_id_hex = job_id.hex()

        # Return information about a single job ID.
        message = self.redis_client.execute_command(
            "RAY.TABLE_LOOKUP", gcs_utils.TablePrefix.Value("JOB"), "",
            job_id_binary)

        if message is None:
            return {}

        gcs_entry = gcs_utils.GcsEntry.FromString(message)

        assert len(gcs_entry.entries) > 0

        job_info = {}

        # Later entries overwrite the shared keys of earlier ones; a dead
        # entry contributes the stop time, a live one the start time.
        for serialized_entry in gcs_entry.entries:
            entry = gcs_utils.JobTableData.FromString(serialized_entry)
            assert entry.job_id == job_id_binary
            job_info["JobID"] = job_id_hex
            job_info["NodeManagerAddress"] = entry.node_manager_address
            job_info["DriverPid"] = entry.driver_pid
            if entry.is_dead:
                job_info["StopTime"] = entry.timestamp
            else:
                job_info["StartTime"] = entry.timestamp

        return job_info
Example #32
0
    def task_table(self, task_id=None):
        """Fetch and parse the task table information for one or more tasks.

        Args:
            task_id: A hex string of the task ID to fetch information about.
                If this is None, then every task table entry is fetched.

        Returns:
            The parsed entry for ``task_id`` when one is given; otherwise a
            dict mapping each hex task ID to its parsed entry.
        """
        self._check_connected()

        if task_id is not None:
            return self._task_table(hex_to_binary(task_id))

        # No ID given: walk every key stored under the task prefix.
        prefix_length = len(TASK_PREFIX)
        table = {}
        for redis_key in self.redis_client.keys(TASK_PREFIX + "*"):
            binary_id = redis_key[prefix_length:]
            entry = self._task_table(binary_id)
            table[binary_to_hex(binary_id)] = entry
        return table
Example #33
0
    def _object_table(self, object_id):
        """Fetch and parse the object table information for one object ID.

        Args:
            object_id: An ObjectID, or the hex string of one.

        Returns:
            A dictionary with the keys "ManagerIDs", "TaskID", "IsPut",
            "DataSize" and "Hash". "ManagerIDs" is None when the object
            table lookup returns None.
        """
        # Hex strings are converted so both input forms behave the same.
        object_id_type = ray.local_scheduler.ObjectID
        if not isinstance(object_id, object_id_type):
            object_id = object_id_type(hex_to_binary(object_id))

        # Where the object lives.
        locations = self._execute_command(
            object_id, "RAY.OBJECT_TABLE_LOOKUP", object_id.id())
        manager_ids = None
        if locations is not None:
            manager_ids = [binary_to_hex(location) for location in locations]

        # Which task produced the object, and its metadata.
        reply_bytes = self._execute_command(
            object_id, "RAY.RESULT_TABLE_LOOKUP", object_id.id())
        reply = ResultTableReply.GetRootAsResultTableReply(reply_bytes, 0)

        return {
            "ManagerIDs": manager_ids,
            "TaskID": binary_to_hex(reply.TaskId()),
            "IsPut": bool(reply.IsPut()),
            "DataSize": reply.DataSize(),
            "Hash": binary_to_hex(reply.Hash()),
        }
Example #34
0
    def node_resource_table(self, node_id=None):
        """Fetch and parse the resource table information of one node.

        Args:
            node_id: Hex string of the node ID to fetch information about.

        Returns:
            A dict mapping each resource name to its capacity, or an empty
            dict when the accessor returns no data for the node.
        """
        self._check_connected()

        target_node = ray.NodeID(hex_to_binary(node_id))
        serialized = self.global_state_accessor.get_node_resource_info(
            target_node)
        if serialized is None:
            return {}

        resource_map = gcs_utils.ResourceMap.FromString(serialized)
        capacities = {}
        for resource_name, resource in resource_map.items.items():
            capacities[resource_name] = resource.resource_capacity
        return capacities
Example #35
0
    def task_table(self, task_id=None):
        """Fetch and parse the task table information for one or more tasks.

        Args:
            task_id: A hex string of the task ID to fetch information about.
                If this is None, then every task table entry is fetched.

        Returns:
            The parsed entry for ``task_id`` when one is given; otherwise a
            dict mapping each hex task ID to its parsed entry.
        """
        self._check_connected()
        make_id = ray.local_scheduler.ObjectID

        if task_id is not None:
            return self._task_table(make_id(hex_to_binary(task_id)))

        # No ID given: collect the entry behind every task-prefixed key.
        prefix_length = len(TASK_PREFIX)
        table = {}
        for stored_key in self._keys(TASK_PREFIX + "*"):
            raw_id = stored_key[prefix_length:]
            entry = self._task_table(make_id(raw_id))
            table[binary_to_hex(raw_id)] = entry
        return table
Example #36
0
    def _object_table(self, object_id):
        """Look up the object table and result table entries of an object.

        Args:
            object_id: An ObjectID, or the hex string of one.

        Returns:
            A dictionary with the keys "ManagerIDs", "TaskID", "IsPut",
            "DataSize" and "Hash". "ManagerIDs" is None when the object
            table lookup returns None.
        """
        # Accept hex strings by wrapping them into an ObjectID first.
        if not isinstance(object_id, ray.local_scheduler.ObjectID):
            binary_id = hex_to_binary(object_id)
            object_id = ray.local_scheduler.ObjectID(binary_id)

        found = self._execute_command(
            object_id, "RAY.OBJECT_TABLE_LOOKUP", object_id.id())
        manager_ids = (
            None if found is None
            else [binary_to_hex(manager) for manager in found])

        raw_reply = self._execute_command(
            object_id, "RAY.RESULT_TABLE_LOOKUP", object_id.id())
        parsed = ResultTableReply.GetRootAsResultTableReply(raw_reply, 0)

        # Keys are inserted in the same order as the documented contract.
        result = {"ManagerIDs": manager_ids}
        result["TaskID"] = binary_to_hex(parsed.TaskId())
        result["IsPut"] = bool(parsed.IsPut())
        result["DataSize"] = parsed.DataSize()
        result["Hash"] = binary_to_hex(parsed.Hash())
        return result
Example #37
0
 def _from_cloudpickle(self, obj):
     """Unpickle the hex-encoded payload stored under ``obj["value"]``."""
     payload = hex_to_binary(obj["value"])
     return cloudpickle.loads(payload)
Example #38
0
 def _load_trial_info(self, trial_info):
     """Decode the pickled "config" and "result" entries in place.

     Args:
         trial_info: Dict whose "config" and "result" values are
             hex-encoded pickles; both are replaced by the loaded objects.
     """
     for field in ("config", "result"):
         payload = hex_to_binary(trial_info[field])
         trial_info[field] = cloudpickle.loads(payload)
Example #39
0
 def _load_trial_info(self, trial_info):
     """Replace the encoded "config" and "result" entries with objects.

     Args:
         trial_info: Dict holding hex-encoded pickles under "config" and
             "result"; it is updated in place.
     """
     config_bytes = hex_to_binary(trial_info["config"])
     trial_info["config"] = cloudpickle.loads(config_bytes)

     result_bytes = hex_to_binary(trial_info["result"])
     trial_info["result"] = cloudpickle.loads(result_bytes)
Example #40
0
File: actor.py Project: the-sea/ray
 def hex_to_object_id(hex_id):
     """Build a local scheduler ObjectID from its hex string form."""
     binary_id = hex_to_binary(hex_id)
     return ray.local_scheduler.ObjectID(binary_id)
Example #41
0
 def object_hook(self, obj):
     """Decode dicts tagged as pickled functions; pass others through.

     Args:
         obj: A dict. When its "_type" entry equals "function", its
             "value" entry is treated as a hex-encoded pickle.

     Returns:
         The unpickled object for tagged dicts, otherwise ``obj`` itself.
     """
     if obj.get("_type") != "function":
         return obj
     return cloudpickle.loads(hex_to_binary(obj["value"]))