Example 1
File: job_head.py Project: rlan/ray
    async def submit_job(self, req) -> aiohttp.web.Response:
        job_description_data = dict(await req.json())
        # Validate the job description data.
        try:
            JobDescription(**job_description_data)
        except Exception as ex:
            return dashboard_utils.rest_response(
                success=False, message=f"Failed to submit job: {ex}")

        # TODO(fyrestone): Choose a random agent to start the driver
        # for this job.
        node_id, ports = next(iter(DataSource.agents.items()))
        ip = DataSource.node_id_to_ip[node_id]
        address = f"{ip}:{ports[1]}"
        options = (("grpc.enable_http_proxy", 0), )
        channel = aiogrpc.insecure_channel(address, options=options)
        stub = job_agent_pb2_grpc.JobAgentServiceStub(channel)
        request = job_agent_pb2.InitializeJobEnvRequest(
            job_description=json.dumps(job_description_data))
        # TODO(fyrestone): It's better not to wait for the InitializeJobEnv RPC.
        reply = await stub.InitializeJobEnv(request)
        # TODO(fyrestone): We should reply with a job id for the submitted job.
        if reply.status == agent_manager_pb2.AGENT_RPC_STATUS_OK:
            logger.info("Succeeded to submit job.")
            return dashboard_utils.rest_response(
                success=True, message="Job submitted.")
        else:
            logger.info("Failed to submit job.")
            return dashboard_utils.rest_response(
                success=False,
                message=f"Failed to submit job: {reply.error_message}")
Example 2
    async def kill_actor(self, req) -> aiohttp.web.Response:
        try:
            actor_id = req.query["actorId"]
            ip_address = req.query["ipAddress"]
            port = req.query["port"]
        except KeyError:
            return rest_response(success=False, message="Bad Request")
        try:
            options = (("grpc.enable_http_proxy", 0), )
            channel = aiogrpc.insecure_channel(f"{ip_address}:{port}",
                                               options=options)
            stub = core_worker_pb2_grpc.CoreWorkerServiceStub(channel)

            await stub.KillActor(
                core_worker_pb2.KillActorRequest(
                    intended_actor_id=ray._private.utils.hex_to_binary(
                        actor_id)))

        except aiogrpc.AioRpcError:
            # This always raises an exception because the worker is killed
            # and the channel is closed on the worker side before this
            # handler returns; the actor is still deleted correctly.
            pass

        return rest_response(success=True,
                             message=f"Killed actor with id {actor_id}")
Example 3
 async def get_all_nodes(self, req) -> aiohttp.web.Response:
     view = req.query.get("view")
     if view == "summary":
         all_node_summary = await DataOrganizer.get_all_node_summary()
         return dashboard_utils.rest_response(
             success=True,
             message="Node summary fetched.",
             summary=all_node_summary)
     elif view == "details":
         all_node_details = await DataOrganizer.get_all_node_details()
         return dashboard_utils.rest_response(
             success=True,
             message="All node details fetched",
             clients=all_node_details,
         )
     elif view is not None and view.lower() == "hostNameList".lower():
         alive_hostnames = set()
         for node in DataSource.nodes.values():
             if node["state"] == "ALIVE":
                 alive_hostnames.add(node["nodeManagerHostname"])
         return dashboard_utils.rest_response(
             success=True,
             message="Node hostname list fetched.",
             host_name_list=list(alive_hostnames))
     else:
         return dashboard_utils.rest_response(
             success=False, message=f"Unknown view {view}")
Example 4
 async def set_tune_experiment(self, req) -> aiohttp.web.Response:
     experiment = req.query["experiment"]
     err, experiment = self.set_experiment(experiment)
     if err:
         return rest_response(success=False, error=err)
     return rest_response(success=True,
                          message="Successfully set experiment",
                          **experiment)
Example 5
File: job_head.py Project: rlan/ray
 async def get_all_jobs(self, req) -> aiohttp.web.Response:
     view = req.query.get("view")
     if view == "summary":
         return dashboard_utils.rest_response(
             success=True,
             message="All job summary fetched.",
             summary=list(DataSource.jobs.values()))
     else:
         return dashboard_utils.rest_response(
             success=False, message="Unknown view {}".format(view))
Example 6
 async def set_fetch_memory_info(self, req) -> aiohttp.web.Response:
     should_fetch = req.query["shouldFetch"]
     if should_fetch == "true":
         self._collect_memory_info = True
     elif should_fetch == "false":
         self._collect_memory_info = False
     else:
         return dashboard_utils.rest_response(
             success=False,
             message=f"Unknown argument to set_fetch {should_fetch}")
     return dashboard_utils.rest_response(
         success=True,
         message=f"Successfully set fetching to {should_fetch}")
Example 7
File: job_head.py Project: rlan/ray
 async def get_job(self, req) -> aiohttp.web.Response:
     job_id = req.match_info.get("job_id")
     view = req.query.get("view")
     if view is None:
         job_detail = {
             "jobInfo": DataSource.jobs.get(job_id, {}),
             "jobActors": DataSource.job_actors.get(job_id, {}),
             "jobWorkers": DataSource.job_workers.get(job_id, []),
         }
         await GlobalSignals.job_info_fetched.send(job_detail)
         return dashboard_utils.rest_response(
             success=True, message="Job detail fetched.", detail=job_detail)
     else:
         return dashboard_utils.rest_response(
             success=False, message="Unknown view {}".format(view))
Example 8
    async def get_cluster_status(self, req):
        """Returns status information about the cluster.

        Currently contains two fields:
            autoscaling_status (str): a status message from the autoscaler.
            autoscaling_error (str): an error message from the autoscaler if
                anything has gone wrong during autoscaling.

        These fields are both read from the GCS; it's expected that the
        autoscaler writes them there.
        """

        aioredis_client = self._dashboard_head.aioredis_client
        legacy_status = await aioredis_client.hget(
            DEBUG_AUTOSCALING_STATUS_LEGACY, "value")
        formatted_status_string = await aioredis_client.hget(
            DEBUG_AUTOSCALING_STATUS, "value")
        formatted_status = json.loads(formatted_status_string.decode()
                                      ) if formatted_status_string else {}
        error = await aioredis_client.hget(DEBUG_AUTOSCALING_ERROR, "value")
        return dashboard_utils.rest_response(
            success=True,
            message="Got cluster status.",
            autoscaling_status=legacy_status.decode()
            if legacy_status else None,
            autoscaling_error=error.decode() if error else None,
            cluster_status=formatted_status if formatted_status else None,
        )
Example 9
    async def get_cluster_status(self, req):
        """Returns status information about the cluster.

        Currently contains two fields:
            autoscaling_status (str): a status message from the autoscaler.
            autoscaling_error (str): an error message from the autoscaler if
                anything has gone wrong during autoscaling.

        These fields are both read from the GCS; it's expected that the
        autoscaler writes them there.
        """

        assert ray.experimental.internal_kv._internal_kv_initialized()
        legacy_status = internal_kv._internal_kv_get(
            DEBUG_AUTOSCALING_STATUS_LEGACY)
        formatted_status_string = internal_kv._internal_kv_get(
            DEBUG_AUTOSCALING_STATUS)
        formatted_status = json.loads(formatted_status_string.decode()
                                      ) if formatted_status_string else {}
        error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)
        return dashboard_utils.rest_response(
            success=True,
            message="Got cluster status.",
            autoscaling_status=legacy_status.decode()
            if legacy_status else None,
            autoscaling_error=error.decode() if error else None,
            cluster_status=formatted_status if formatted_status else None,
        )
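
Examples 8 and 9 appear to be two revisions of the same handler: the first reads the autoscaler status and error strings out of Redis through the aioredis client, while the second reads them from Ray's internal key-value store (internal_kv), which is backed by the GCS. The response payload built from those values is identical in both versions.
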
Example 10
    async def get_event(self, req) -> aiohttp.web.Response:
        job_id = req.query.get("job_id")
        if job_id is None:
            all_events = {
                job_id: list(job_events.values())
                for job_id, job_events in DataSource.events.items()
            }
            return dashboard_utils.rest_response(
                success=True, message="All events fetched.", events=all_events)

        job_events = DataSource.events.get(job_id, {})
        return dashboard_utils.rest_response(
            success=True,
            message="Job events fetched.",
            job_id=job_id,
            events=list(job_events.values()))
Example 11
 async def get_availability(self, req) -> aiohttp.web.Response:
     availability = {
         "available": ExperimentAnalysis is not None,
         "trials_available": self._trials_available
     }
     return rest_response(success=True,
                          message="Fetched tune availability",
                          result=availability)
Example 12
 async def get_errors(self, req) -> aiohttp.web.Response:
     ip = req.query["ip"]
     pid = str(req.query.get("pid", ""))
     node_errors = DataSource.ip_and_pid_to_errors.get(ip, {})
     if pid:
         node_errors = {str(pid): node_errors.get(pid, [])}
     return dashboard_utils.rest_response(
         success=True, message="Fetched errors.", errors=node_errors)
Example 13
 async def dump(self, req) -> aiohttp.web.Response:
     key = req.query.get("key")
     if key is None:
         all_data = {
             k: dict(v)
             for k, v in DataSource.__dict__.items()
             if not k.startswith("_")
         }
         return dashboard_utils.rest_response(
             success=True,
             message="Fetch all data from datacenter success.",
             **all_data)
     else:
         data = dict(DataSource.__dict__.get(key))
         return dashboard_utils.rest_response(
             success=True,
             message=f"Fetch {key} from datacenter success.",
             **{key: data})
Example 14
    async def kill_actor_gcs(self, req) -> aiohttp.web.Response:
        actor_id = req.query.get("actor_id")
        force_kill = req.query.get("force_kill", False) in ("true", "True")
        no_restart = req.query.get("no_restart", False) in ("true", "True")
        if not actor_id:
            return dashboard_utils.rest_response(
                success=False, message="actor_id is required.")

        request = gcs_service_pb2.KillActorViaGcsRequest()
        request.actor_id = bytes.fromhex(actor_id)
        request.force_kill = force_kill
        request.no_restart = no_restart
        await self._gcs_actor_info_stub.KillActorViaGcs(request, timeout=5)

        message = (f"Force killed actor with id {actor_id}" if force_kill else
                   f"Requested actor with id {actor_id} to terminate. " +
                   "It will exit once running tasks complete")

        return dashboard_utils.rest_response(success=True, message=message)
Example 15
 async def get_actor_groups(self, req) -> aiohttp.web.Response:
     actors = await DataOrganizer.get_all_actors()
     actor_creation_tasks = await DataOrganizer.get_actor_creation_tasks()
     # actor_creation_tasks have some common interface with actors,
     # and they get processed and shown in tandem in the logical view
     # hence we merge them together before constructing actor groups.
     actors.update(actor_creation_tasks)
     actor_groups = actor_utils.construct_actor_groups(actors)
     return rest_response(success=True,
                          message="Fetched actor groups.",
                          actor_groups=actor_groups)
Example 16
 async def launch_profiling(self, req) -> aiohttp.web.Response:
     ip = req.query["ip"]
     pid = int(req.query["pid"])
     duration = int(req.query["duration"])
     reporter_stub = self._stubs[ip]
     reply = await reporter_stub.GetProfilingStats(
         reporter_pb2.GetProfilingStatsRequest(pid=pid, duration=duration))
     profiling_info = (json.loads(reply.profiling_stats)
                       if reply.profiling_stats else reply.std_out)
     return dashboard_utils.rest_response(success=True,
                                          message="Profiling success.",
                                          profiling_info=profiling_info)
Example 17
    async def get_ray_config(self, req) -> aiohttp.web.Response:
        if self._ray_config is None:
            try:
                config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
                with open(config_path) as f:
                    cfg = yaml.safe_load(f)
            except FileNotFoundError:
                return dashboard_utils.rest_response(
                    success=False,
                    message=f"No config found at {config_path}.",
                )
            except yaml.YAMLError:
                return dashboard_utils.rest_response(
                    success=False,
                    message="Invalid config, could not load YAML.")

            payload = {
                "min_workers": cfg.get("min_workers", "unspecified"),
                "max_workers": cfg.get("max_workers", "unspecified")
            }

            try:
                payload["head_type"] = cfg["head_node"]["InstanceType"]
            except KeyError:
                payload["head_type"] = "unknown"

            try:
                payload["worker_type"] = cfg["worker_nodes"]["InstanceType"]
            except KeyError:
                payload["worker_type"] = "unknown"

            self._ray_config = payload

        return dashboard_utils.rest_response(
            success=True,
            message="Fetched ray config.",
            **self._ray_config,
        )
Example 18
    async def get_memory_table(self, req) -> aiohttp.web.Response:
        group_by = req.query.get("group_by")
        sort_by = req.query.get("sort_by")
        kwargs = {}
        if group_by:
            kwargs["group_by"] = GroupByType(group_by)
        if sort_by:
            kwargs["sort_by"] = SortingType(sort_by)

        memory_table = await DataOrganizer.get_memory_table(**kwargs)
        return dashboard_utils.rest_response(
            success=True,
            message="Fetched memory table",
            memory_table=memory_table.as_dict())
Example 19
 async def snapshot(self, req):
     job_data = await self.get_job_info()
     actor_data = await self.get_actor_info()
     serve_data = await self.get_serve_info()
     session_name = await self.get_session_name()
     snapshot = {
         "jobs": job_data,
         "actors": actor_data,
         "deployments": serve_data,
         "session_name": session_name,
         "ray_version": ray.__version__,
         "ray_commit": ray.__commit__
     }
     return dashboard_utils.rest_response(
         success=True, message="hello", snapshot=snapshot)
Example 20
def test_immutable_types():
    d = {str(i): i for i in range(1000)}
    d["list"] = list(range(1000))
    d["list"][0] = {str(i): i for i in range(1000)}
    d["dict"] = {str(i): i for i in range(1000)}
    immutable_dict = dashboard_utils.make_immutable(d)
    assert type(immutable_dict) == dashboard_utils.ImmutableDict
    assert immutable_dict == dashboard_utils.ImmutableDict(d)
    assert immutable_dict == d
    assert dashboard_utils.ImmutableDict(immutable_dict) == immutable_dict
    assert dashboard_utils.ImmutableList(
        immutable_dict["list"]) == immutable_dict["list"]
    assert "512" in d
    assert "512" in d["list"][0]
    assert "512" in d["dict"]

    # Test type conversion
    assert type(dict(immutable_dict)["list"]) == dashboard_utils.ImmutableList
    assert type(list(
        immutable_dict["list"])[0]) == dashboard_utils.ImmutableDict

    # Test json dumps / loads
    json_str = json.dumps(immutable_dict, cls=dashboard_utils.CustomEncoder)
    deserialized_immutable_dict = json.loads(json_str)
    assert type(deserialized_immutable_dict) == dict
    assert type(deserialized_immutable_dict["list"]) == list
    assert immutable_dict.mutable() == deserialized_immutable_dict
    dashboard_utils.rest_response(True, "OK", data=immutable_dict)
    dashboard_utils.rest_response(True, "OK", **immutable_dict)

    # Test copy
    copy_of_immutable = copy.copy(immutable_dict)
    assert copy_of_immutable == immutable_dict
    deepcopy_of_immutable = copy.deepcopy(immutable_dict)
    assert deepcopy_of_immutable == immutable_dict

    # Test get default immutable
    immutable_default_value = immutable_dict.get("not exist list", [1, 2])
    assert type(immutable_default_value) == dashboard_utils.ImmutableList

    # Test recursive immutable
    assert type(immutable_dict["list"]) == dashboard_utils.ImmutableList
    assert type(immutable_dict["dict"]) == dashboard_utils.ImmutableDict
    assert type(immutable_dict["list"][0]) == dashboard_utils.ImmutableDict

    # Test exception
    with pytest.raises(TypeError):
        dashboard_utils.ImmutableList((1, 2))

    with pytest.raises(TypeError):
        dashboard_utils.ImmutableDict([1, 2])

    with pytest.raises(TypeError):
        immutable_dict["list"] = []

    with pytest.raises(AttributeError):
        immutable_dict.update({1: 3})

    with pytest.raises(TypeError):
        immutable_dict["list"][0] = 0

    with pytest.raises(AttributeError):
        immutable_dict["list"].extend([1, 2])

    with pytest.raises(AttributeError):
        immutable_dict["list"].insert(1, 2)

    d2 = dashboard_utils.ImmutableDict({1: np.zeros([3, 5])})
    with pytest.raises(TypeError):
        print(d2[1])

    d3 = dashboard_utils.ImmutableList([1, np.zeros([3, 5])])
    with pytest.raises(TypeError):
        print(d3[1])
Example 21
 async def get_notified_agents(self, req) -> aiohttp.web.Response:
     return dashboard_utils.rest_response(
         success=True,
         message="Fetch notified agents success.",
         **self._notified_agents)
Example 22
 async def get_all_actors(self, req) -> aiohttp.web.Response:
     return dashboard_utils.rest_response(success=True,
                                          message="All actors fetched.",
                                          actors=DataSource.actors)
Example 23
 async def test_aiohttp_cache_lru(self, req) -> aiohttp.web.Response:
     value = req.query.get("value")
     return dashboard_utils.rest_response(success=True,
                                          message="OK",
                                          value=value,
                                          timestamp=time.time())
Example 24
 async def tune_info(self, req) -> aiohttp.web.Response:
     stats = self.get_stats()
     return rest_response(success=True,
                          message="Fetched tune info",
                          result=stats)
Example 25
 async def get_node(self, req) -> aiohttp.web.Response:
     node_id = req.match_info.get("node_id")
     node_info = await DataOrganizer.get_node_info(node_id)
     return dashboard_utils.rest_response(success=True,
                                          message="Node details fetched.",
                                          detail=node_info)
Example 26
 async def enable_tensorboard(self, req) -> aiohttp.web.Response:
     self._enable_tensorboard()
     if not self._tensor_board_dir:
         return rest_response(success=False,
                              message="Error enabling tensorboard")
     return rest_response(success=True, message="Enabled tensorboard")