Beispiel #1
0
 def default(self, obj):
     """Convert values the stock JSON encoder cannot serialize.

     Args:
         obj: The value the encoder could not handle natively.

     Returns:
         ``binary_to_hex(obj)`` for ``bytes`` and ``obj.mutable()`` for
         ``Immutable`` instances.

     Raises:
         TypeError: Raised by ``json.JSONEncoder.default`` for any other
             type.
     """
     if isinstance(obj, bytes):
         converted = binary_to_hex(obj)
     elif isinstance(obj, Immutable):
         converted = obj.mutable()
     else:
         # Unsupported type: defer to the base class, which raises.
         converted = json.JSONEncoder.default(self, obj)
     return converted
Beispiel #2
0
def _get_event(msg="empty message", job_id=None, source_type=None):
    """Build an event dictionary filled with mostly random values.

    Args:
        msg: Text stored under ``"message"``.
        job_id: Value stored under ``custom_fields["job_id"]``. When
            ``None``, the ``hex()`` of a ``ray.JobID`` built from a random
            integer between 1 and 100 is used.
        source_type: Value stored under ``"source_type"``. When ``None``, a
            random key of ``event_pb2.Event.SourceType`` is used.

    Returns:
        A dict with the keys ``event_id``, ``source_type``, ``host_name``,
        ``pid``, ``label``, ``message``, ``time_stamp``, ``severity`` and
        ``custom_fields``.
    """
    # The random draws happen in the same order as the keys of the returned
    # dictionary so that a seeded generator produces the same values.
    event_id = binary_to_hex(np.random.bytes(18))
    if source_type is None:
        source_type = random.choice(event_pb2.Event.SourceType.keys())
    pid = random.randint(1, 65536)
    time_stamp = time.time()
    if job_id is None:
        job_id = ray.JobID.from_int(random.randint(1, 100)).hex()

    custom_fields = {
        "job_id": job_id,
        "node_id": "",
        "task_id": "",
    }
    return {
        "event_id": event_id,
        "source_type": source_type,
        "host_name": "po-dev.inc.alipay.net",
        "pid": pid,
        "label": "",
        "message": msg,
        "time_stamp": time_stamp,
        "severity": "INFO",
        "custom_fields": custom_fields,
    }
Beispiel #3
0
    def workers(self):
        """Return information about every alive worker of type ``WORKER``.

        Returns:
            A dict keyed by the ``binary_to_hex`` form of each worker ID.
            Every value holds ``"node_ip_address"`` and
            ``"plasma_store_socket"``, plus ``"stderr_file"`` and
            ``"stdout_file"`` when the worker info contains them.
        """
        self._check_connected()

        workers_by_id = {}
        # Each entry of the worker table is a serialized WorkerTableData.
        for serialized in self.global_state_accessor.get_worker_table():
            entry = gcs_utils.WorkerTableData.FromString(serialized)
            if not entry.is_alive:
                continue
            if entry.worker_type != gcs_utils.WORKER:
                continue

            worker_id = binary_to_hex(entry.worker_address.worker_id)
            info = entry.worker_info
            details = {
                "node_ip_address": decode(info[b"node_ip_address"]),
                "plasma_store_socket": decode(info[b"plasma_store_socket"]),
            }
            # The log file locations are optional.
            if b"stderr_file" in info:
                details["stderr_file"] = decode(info[b"stderr_file"])
            if b"stdout_file" in info:
                details["stdout_file"] = decode(info[b"stdout_file"])
            workers_by_id[worker_id] = details
        return workers_by_id
Beispiel #4
0
    def placement_group_table(self, placement_group_id=None):
        """Look up one placement group, or every placement group.

        Args:
            placement_group_id: Object exposing ``hex()`` that identifies the
                placement group to fetch. When ``None``, the whole placement
                group table is fetched.

        Returns:
            For a single ID: the result of ``_gen_placement_group_info`` for
            that group, or ``{}`` when the accessor returns ``None``.
            Otherwise: a dict mapping the ``binary_to_hex`` form of each
            placement group ID to its generated info.
        """
        self._check_connected()
        accessor = self.global_state_accessor

        if placement_group_id is None:
            table = {}
            for serialized in accessor.get_placement_group_table():
                entry = gcs_utils.PlacementGroupTableData.FromString(
                    serialized)
                hex_id = binary_to_hex(entry.placement_group_id)
                table[hex_id] = self._gen_placement_group_info(entry)
            return table

        # Rebuild the ID object from its hex form before querying.
        group_id = ray.PlacementGroupID(
            hex_to_binary(placement_group_id.hex()))
        serialized = accessor.get_placement_group_info(group_id)
        if serialized is None:
            return {}
        entry = gcs_utils.PlacementGroupTableData.FromString(serialized)
        return self._gen_placement_group_info(entry)
Beispiel #5
0
    def profile_table(self):
        """Collect all profile events, grouped by component.

        Returns:
            A plain dict mapping the ``binary_to_hex`` form of each component
            ID to the list of its event dicts. Every event dict has the keys
            ``event_type``, ``component_id``, ``node_ip_address``,
            ``component_type``, ``start_time``, ``end_time`` and
            ``extra_data``.
        """
        self._check_connected()

        events_by_component = defaultdict(list)
        # Each entry of the profile table is a serialized ProfileTableData.
        for serialized in self.global_state_accessor.get_profile_table():
            profile = gcs_utils.ProfileTableData.FromString(serialized)

            kind = profile.component_type
            hex_id = binary_to_hex(profile.component_id)
            ip_address = profile.node_ip_address

            for event in profile.profile_events:
                # Extra data that is not valid JSON is replaced by an empty
                # dict.
                try:
                    extra = json.loads(event.extra_data)
                except ValueError:
                    extra = {}

                events_by_component[hex_id].append({
                    "event_type": event.event_type,
                    "component_id": hex_id,
                    "node_ip_address": ip_address,
                    "component_type": kind,
                    "start_time": event.start_time,
                    "end_time": event.end_time,
                    "extra_data": extra,
                })

        return dict(events_by_component)
Beispiel #6
0
    def actor_table(self, actor_id):
        """Look up one actor, or every actor, in the actor table.

        Args:
            actor_id: Hex string of the actor ID to look up. When ``None``,
                the whole actor table is fetched.

        Returns:
            For a single ID: the result of ``_gen_actor_info`` for that
            actor, or ``{}`` when the accessor returns ``None``. Otherwise:
            a dict mapping the ``binary_to_hex`` form of each actor ID to
            its generated info.
        """
        self._check_connected()
        accessor = self.global_state_accessor

        if actor_id is None:
            table = {}
            for serialized in accessor.get_actor_table():
                entry = gcs_utils.ActorTableData.FromString(serialized)
                hex_id = binary_to_hex(entry.actor_id)
                table[hex_id] = self._gen_actor_info(entry)
            return table

        serialized = accessor.get_actor_info(
            ray.ActorID(hex_to_binary(actor_id)))
        if serialized is None:
            return {}
        entry = gcs_utils.ActorTableData.FromString(serialized)
        return self._gen_actor_info(entry)
Beispiel #7
0
    def object_table(self, object_ref=None):
        """Look up one object, or every object, in the object table.

        Args:
            object_ref: Hex string identifying the object to look up. When
                ``None``, the whole object table is fetched.

        Returns:
            For a single reference: the result of ``_gen_object_info`` for
            that object, or ``{}`` when the accessor returns ``None``.
            Otherwise: a dict mapping the ``binary_to_hex`` form of each
            object ID to its generated info.
        """
        self._check_connected()
        accessor = self.global_state_accessor

        if object_ref is None:
            table = {}
            for serialized in accessor.get_object_table():
                entry = gcs_utils.ObjectLocationInfo.FromString(serialized)
                hex_id = binary_to_hex(entry.object_id)
                table[hex_id] = self._gen_object_info(entry)
            return table

        serialized = accessor.get_object_info(
            ray.ObjectRef(hex_to_binary(object_ref)))
        if serialized is None:
            return {}
        entry = gcs_utils.ObjectLocationInfo.FromString(serialized)
        return self._gen_object_info(entry)
Beispiel #8
0
    def _gen_placement_group_info(self, placement_group_info):
        """Convert a placement group table entry into a plain dictionary.

        Args:
            placement_group_info: Placement group table entry to read from;
                must not be ``None``.

        Returns:
            A dict with the keys ``"placement_group_id"`` (the ID passed
            through ``binary_to_hex``), ``"name"``, ``"bundles"`` (bundle
            index -> unit resources), ``"strategy"``, ``"state"`` and
            ``"stats"``.

        Raises:
            ValueError: If the entry's strategy is not one of PACK,
                STRICT_PACK, STRICT_SPREAD or SPREAD.
        """
        # This should be imported here, otherwise, it will error doc build.
        from ray.core.generated.common_pb2 import PlacementStrategy

        def get_state(state):
            # Any state other than PENDING or CREATED is reported as
            # "REMOVED".
            if state == gcs_utils.PlacementGroupTableData.PENDING:
                return "PENDING"
            elif state == gcs_utils.PlacementGroupTableData.CREATED:
                return "CREATED"
            else:
                return "REMOVED"

        def get_strategy(strategy):
            if strategy == PlacementStrategy.PACK:
                return "PACK"
            elif strategy == PlacementStrategy.STRICT_PACK:
                return "STRICT_PACK"
            elif strategy == PlacementStrategy.STRICT_SPREAD:
                return "STRICT_SPREAD"
            elif strategy == PlacementStrategy.SPREAD:
                return "SPREAD"
            else:
                # Report the offending value, not the enum class itself.
                raise ValueError(f"Invalid strategy returned: {strategy}")

        # Check before the first attribute access; placed after it, the
        # assertion could never fire.
        assert placement_group_info is not None
        stats = placement_group_info.stats
        return {
            "placement_group_id":
            binary_to_hex(placement_group_info.placement_group_id),
            "name":
            placement_group_info.name,
            "bundles": {
                # The value here needs to be converted to a dict,
                # otherwise the payload becomes unserializable.
                bundle.bundle_id.bundle_index:
                MessageToDict(bundle)["unitResources"]
                for bundle in placement_group_info.bundles
            },
            "strategy":
            get_strategy(placement_group_info.strategy),
            "state":
            get_state(placement_group_info.state),
            "stats": {
                # Latencies are stored in microseconds and reported in
                # milliseconds.
                "end_to_end_creation_latency_ms":
                (stats.end_to_end_creation_latency_us / 1000.0),
                "scheduling_latency_ms":
                (stats.scheduling_latency_us / 1000.0),
                "scheduling_attempt":
                stats.scheduling_attempt,
                "highest_retry_delay_ms":
                stats.highest_retry_delay_ms,
                # Translate the numeric enum value into its symbolic name.
                "scheduling_state":
                gcs_pb2.PlacementGroupStats.SchedulingState.DESCRIPTOR.
                values_by_number[  # noqa: E501
                    stats.scheduling_state].name,
            },
        }
Beispiel #9
0
    def _gen_actor_info(self, actor_table_data):
        """Flatten an actor table entry into a plain dictionary.

        Args:
            actor_table_data: Actor table entry to read the fields from.

        Returns:
            A dict with the actor's IDs (passed through ``binary_to_hex``),
            class name, name, address and owner address, symbolic state
            name, restart count, timestamps, death cause and pid.
        """
        address = actor_table_data.address
        owner = actor_table_data.owner_address
        # Translate the numeric enum value into its symbolic name.
        state_descriptor = gcs_pb2.ActorTableData.ActorState.DESCRIPTOR
        state_name = state_descriptor.values_by_number[
            actor_table_data.state].name

        return {
            "ActorID": binary_to_hex(actor_table_data.actor_id),
            "ActorClassName": actor_table_data.class_name,
            "IsDetached": actor_table_data.is_detached,
            "Name": actor_table_data.name,
            "JobID": binary_to_hex(actor_table_data.job_id),
            "Address": {
                "IPAddress": address.ip_address,
                "Port": address.port,
                "NodeID": binary_to_hex(address.raylet_id),
            },
            "OwnerAddress": {
                "IPAddress": owner.ip_address,
                "Port": owner.port,
                "NodeID": binary_to_hex(owner.raylet_id),
            },
            "State": state_name,
            "NumRestarts": actor_table_data.num_restarts,
            "Timestamp": actor_table_data.timestamp,
            "StartTime": actor_table_data.start_time,
            "EndTime": actor_table_data.end_time,
            "DeathCause": actor_table_data.death_cause,
            "Pid": actor_table_data.pid,
        }
Beispiel #10
0
 def _decode_keys(d):
     for k, v in d.items():
         if isinstance(v, dict):
             d[k] = _decode_keys(v)
         if isinstance(v, list):
             new_list = []
             for i in v:
                 if isinstance(i, dict):
                     new_list.append(_decode_keys(i))
                 else:
                     new_list.append(i)
             d[k] = new_list
         else:
             if k in decode_keys:
                 d[k] = binary_to_hex(b64decode(v))
             else:
                 d[k] = v
     return d
Beispiel #11
0
    def _gen_placement_group_info(self, placement_group_info):
        """Convert a placement group table entry into a plain dictionary.

        Args:
            placement_group_info: Placement group table entry to read from;
                must not be ``None``.

        Returns:
            A dict with the keys ``"placement_group_id"`` (the ID passed
            through ``binary_to_hex``), ``"name"``, ``"bundles"`` (bundle
            index -> unit resources), ``"strategy"`` and ``"state"``.

        Raises:
            ValueError: If the entry's strategy is not one of PACK,
                STRICT_PACK, STRICT_SPREAD or SPREAD.
        """
        # This should be imported here, otherwise, it will error doc build.
        from ray.core.generated.common_pb2 import PlacementStrategy

        def get_state(state):
            # Any state other than PENDING or CREATED is reported as
            # "REMOVED".
            if state == ray.gcs_utils.PlacementGroupTableData.PENDING:
                return "PENDING"
            elif state == ray.gcs_utils.PlacementGroupTableData.CREATED:
                return "CREATED"
            else:
                return "REMOVED"

        def get_strategy(strategy):
            if strategy == PlacementStrategy.PACK:
                return "PACK"
            elif strategy == PlacementStrategy.STRICT_PACK:
                return "STRICT_PACK"
            elif strategy == PlacementStrategy.STRICT_SPREAD:
                return "STRICT_SPREAD"
            elif strategy == PlacementStrategy.SPREAD:
                return "SPREAD"
            else:
                # Report the offending value, not the enum class itself.
                raise ValueError(f"Invalid strategy returned: {strategy}")

        assert placement_group_info is not None
        return {
            "placement_group_id":
            binary_to_hex(placement_group_info.placement_group_id),
            "name":
            placement_group_info.name,
            "bundles": {
                # The value here needs to be converted to a dict,
                # otherwise the payload becomes unserializable.
                bundle.bundle_id.bundle_index:
                MessageToDict(bundle)["unitResources"]
                for bundle in placement_group_info.bundles
            },
            "strategy":
            get_strategy(placement_group_info.strategy),
            "state":
            get_state(placement_group_info.state),
        }
Beispiel #12
0
    def __getstate__(self):
        """Return a snapshot of this object's attributes for pickling.

        Every field listed in ``_nonjson_fields`` is replaced by its
        cloudpickle dump passed through ``binary_to_hex``, and a fixed set
        of fields is reset.

        Returns:
            A deep copy of the adjusted attribute dictionary.
        """
        snapshot = dict(self.__dict__)

        for field in self._nonjson_fields:
            pickled = cloudpickle.dumps(snapshot.get(field))
            snapshot[field] = binary_to_hex(pickled)

        snapshot.update({
            "runner": None,
            "location": Location(),
            # Avoid waiting for events that will never occur on resume.
            "restoring_from": None,
            "saving_to": None,
            "_state_json": None,
            "_state_valid": False,
            "_default_result_or_future": None,
        })

        return copy.deepcopy(snapshot)
Beispiel #13
0
    async def list_tasks(self, *, option: ListApiOptions) -> dict:
        """Gather task information from every registered raylet.

        Args:
            option: Supplies the per-request ``timeout`` and the ``limit``
                on the number of returned entries.

        Returns:
            A dict mapping task ID to the task's data dict, filtered through
            ``filter_fields`` with ``TaskState``. At most ``option.limit``
            entries are returned, taken in ascending task ID order.
        """
        node_ids = self._client.get_all_registered_raylet_ids()
        requests = [
            self._client.get_task_info(node_id, timeout=option.timeout)
            for node_id in node_ids
        ]
        replies = await asyncio.gather(*requests)

        # Collect the running task IDs across all replies first, since every
        # owned task is checked against the full set.
        running_ids = {
            binary_to_hex(task_id)
            for reply in replies
            for task_id in reply.running_task_ids
        }

        entries = []
        for reply in replies:
            logger.info(reply)
            for task in reply.owned_task_info_entries:
                entry = self._message_to_dict(
                    message=task,
                    fields_to_decode=["task_id"],
                )
                if entry["task_id"] in running_ids:
                    running = TaskStatus.DESCRIPTOR.values_by_number[
                        TaskStatus.RUNNING]
                    entry["scheduling_state"] = running.name
                entries.append(filter_fields(entry, TaskState))

        # Sort to make the output deterministic.
        entries.sort(key=lambda item: item["task_id"])
        limited = islice(entries, option.limit)
        return {item["task_id"]: item for item in limited}
Beispiel #14
0
    async def list_tasks(self, *, option: ListApiOptions) -> ListApiResponse:
        """Gather task information from every registered raylet.

        Raylets that answer with ``DataSourceUnavailable`` are counted and
        skipped. Any other exception returned by a raylet is re-raised.

        Args:
            option: Supplies the per-request ``timeout``, the ``filters``
                applied to the collected entries and the ``limit`` on the
                number of returned entries.

        Returns:
            A ``ListApiResponse`` whose ``result`` holds at most
            ``option.limit`` task dicts sorted by task ID, and whose
            ``partial_failure_warning`` is set when some raylets did not
            respond.

        Raises:
            DataSourceUnavailable: If every raylet was unresponsive.
        """
        raylet_ids = self._client.get_all_registered_raylet_ids()
        requests = [
            self._client.get_task_info(node_id, timeout=option.timeout)
            for node_id in raylet_ids
        ]
        replies = await asyncio.gather(*requests, return_exceptions=True)

        num_unresponsive = 0
        running_ids = set()
        good_replies = []
        for reply in replies:
            if isinstance(reply, DataSourceUnavailable):
                num_unresponsive += 1
                continue
            if isinstance(reply, Exception):
                raise reply

            good_replies.append(reply)
            running_ids.update(
                binary_to_hex(task_id) for task_id in reply.running_task_ids)

        warning = None
        num_raylets = len(raylet_ids)
        if num_raylets > 0 and num_unresponsive > 0:
            warning_msg = NODE_QUERY_FAILURE_WARNING.format(
                type="raylet",
                total=num_raylets,
                network_failures=num_unresponsive,
                log_command="raylet.out",
            )
            # No data at all is an error; partial data only gets a warning.
            if num_unresponsive == num_raylets:
                raise DataSourceUnavailable(warning_msg)
            warning = (
                f"The returned data may contain incomplete result. {warning_msg}"
            )

        entries = []
        for reply in good_replies:
            assert not isinstance(reply, Exception)
            for task in reply.owned_task_info_entries:
                entry = self._message_to_dict(
                    message=task,
                    fields_to_decode=["task_id"],
                )
                if entry["task_id"] in running_ids:
                    running = TaskStatus.DESCRIPTOR.values_by_number[
                        TaskStatus.RUNNING
                    ]
                    entry["scheduling_state"] = running.name
                entries.append(entry)

        entries = self._filter(entries, option.filters, TaskState)
        # Sort to make the output deterministic.
        entries.sort(key=lambda item: item["task_id"])
        limited = list(islice(entries, option.limit))
        return ListApiResponse(
            result=limited,
            partial_failure_warning=warning,
        )
Beispiel #15
0
 def _to_cloudpickle(self, obj):
     """Wrap ``obj`` as a tagged cloudpickle payload.

     Args:
         obj: Object to serialize with ``cloudpickle.dumps``.

     Returns:
         A dict whose ``"_type"`` is ``"CLOUDPICKLE_FALLBACK"`` and whose
         ``"value"`` is the pickled bytes passed through ``binary_to_hex``.
     """
     pickled = cloudpickle.dumps(obj)
     payload = {"_type": "CLOUDPICKLE_FALLBACK"}
     payload["value"] = binary_to_hex(pickled)
     return payload