def __init__(self, dashboard_agent):
    """Set up node identity, network history and the exported gauges.

    Args:
        dashboard_agent: Forwarded to the base-class initializer; its
            ``metrics_export_port`` is handed to ``MetricsAgent``.
    """
    super().__init__(dashboard_agent)

    # (count from the default call, count with logical=False).
    cpus_default = psutil.cpu_count()
    cpus_non_logical = psutil.cpu_count(logical=False)
    self._cpu_counts = (cpus_default, cpus_non_logical)

    self._ip = ray._private.services.get_node_ip_address()
    self._hostname = socket.gethostname()
    self._workers = set()

    # Each history entry is (time, (sent, recv)); seeded with zeros.
    self._network_stats_hist = [(0, (0.0, 0.0))]

    self._metrics_agent = MetricsAgent(dashboard_agent.metrics_export_port)

    # Reporter prefix immediately followed by this agent's node id.
    # NOTE(review): self._dashboard_agent is presumably set by the base
    # class initializer above -- confirm.
    self._key = (
        f"{reporter_consts.REPORTER_PREFIX}{self._dashboard_agent.node_id}")

    # Gauges to record and export, keyed by metric name. Every spec is
    # (name, description, unit, tag keys); the name doubles as dict key.
    gauge_specs = (
        ("node_cpu", "Total CPU usage on a ray node", "percentage", ["ip"]),
        ("node_mem", "Total memory usage on a ray node", "mb", ["ip"]),
        ("raylet_cpu", "CPU usage of the raylet on a node.", "percentage",
         ["ip", "pid"]),
        ("raylet_mem", "Memory usage of the raylet on a node", "mb",
         ["ip", "pid"]),
    )
    self._gauges = {
        metric: Gauge(metric, summary, unit, tag_keys)
        for metric, summary, unit, tag_keys in gauge_specs
    }
def test_gauge():
    """Check that a Gauge reports back the fields it was built with."""
    tags = [tag_key_module.TagKey(label) for label in map(str, range(10))]
    name, description, units = "name", "description", "units"
    gauge = Gauge(name, description, units, tags)

    expected_fields = (
        ("name", name),
        ("description", description),
        ("units", units),
        ("tags", tags),
    )
    # Gauge.__dict__ is invoked as a method here; it is queried afresh for
    # every field so each assertion performs its own lookup.
    for field, expected in expected_fields:
        assert gauge.__dict__()[field] == expected
def __init__(self, dashboard_agent):
    """Set up node identity, network history and the exported gauges.

    Args:
        dashboard_agent: Forwarded to the base-class initializer; its
            ``metrics_export_port`` is handed to ``MetricsAgent``.
    """
    super().__init__(dashboard_agent)

    # (count from the default call, count with logical=False).
    cpus_default = psutil.cpu_count()
    cpus_non_logical = psutil.cpu_count(logical=False)
    self._cpu_counts = (cpus_default, cpus_non_logical)

    self._ip = ray._private.services.get_node_ip_address()
    self._hostname = socket.gethostname()
    self._workers = set()

    # Each history entry is (time, (sent, recv)); seeded with zeros.
    self._network_stats_hist = [(0, (0.0, 0.0))]

    self._metrics_agent = MetricsAgent(dashboard_agent.metrics_export_port)

    # Reporter prefix immediately followed by this agent's node id.
    # NOTE(review): self._dashboard_agent is presumably set by the base
    # class initializer above -- confirm.
    self._key = (
        f"{reporter_consts.REPORTER_PREFIX}{self._dashboard_agent.node_id}")

    # Gauges to record and export, keyed by metric name. Every spec is
    # (name, description, unit, tag keys); the name doubles as dict key.
    # NOTE(review): "node_gpus_available" is declared with the unit
    # "percentage" although its description reads like a count, and
    # "raylet_mem" uses "mb" while the node memory gauges use "bytes" --
    # confirm against the code that records these values.
    gauge_specs = (
        ("node_cpu_utilization", "Total CPU usage on a ray node",
         "percentage", ["ip"]),
        ("node_cpu_count", "Total CPUs available on a ray node",
         "cores", ["ip"]),
        ("node_mem_used", "Memory usage on a ray node",
         "bytes", ["ip"]),
        ("node_mem_available", "Memory available on a ray node",
         "bytes", ["ip"]),
        ("node_mem_total", "Total memory on a ray node",
         "bytes", ["ip"]),
        ("node_gpus_available", "Total GPUs available on a ray node",
         "percentage", ["ip"]),
        ("node_gpus_utilization", "Total GPUs usage on a ray node",
         "percentage", ["ip"]),
        ("node_gram_used", "Total GPU RAM usage on a ray node",
         "bytes", ["ip"]),
        ("node_gram_available", "Total GPU RAM available on a ray node",
         "bytes", ["ip"]),
        ("node_disk_usage", "Total disk usage (bytes) on a ray node",
         "bytes", ["ip"]),
        ("node_disk_utilization_percentage",
         "Total disk utilization (percentage) on a ray node",
         "percentage", ["ip"]),
        ("node_network_sent", "Total network sent",
         "bytes", ["ip"]),
        ("node_network_received", "Total network received",
         "bytes", ["ip"]),
        ("node_network_send_speed", "Network send speed",
         "bytes/sec", ["ip"]),
        ("node_network_receive_speed", "Network receive speed",
         "bytes/sec", ["ip"]),
        ("raylet_cpu", "CPU usage of the raylet on a node.",
         "percentage", ["ip", "pid"]),
        ("raylet_mem", "Memory usage of the raylet on a node",
         "mb", ["ip", "pid"]),
    )
    self._gauges = {
        metric: Gauge(metric, summary, unit, tag_keys)
        for metric, summary, unit, tag_keys in gauge_specs
    }
if isinstance(o, dict): D = {k: recursive_asdict(v) for k, v in o.items()} return D return o def jsonify_asdict(o): return json.dumps(dashboard_utils.to_google_style(recursive_asdict(o))) # A list of gauges to record and export metrics. METRICS_GAUGES = { "node_cpu_utilization": Gauge("node_cpu_utilization", "Total CPU usage on a ray node", "percentage", ["ip"]), "node_cpu_count": Gauge("node_cpu_count", "Total CPUs available on a ray node", "cores", ["ip"]), "node_mem_used": Gauge("node_mem_used", "Memory usage on a ray node", "bytes", ["ip"]), "node_mem_available": Gauge("node_mem_available", "Memory available on a ray node", "bytes", ["ip"]), "node_mem_total": Gauge("node_mem_total", "Total memory on a ray node", "bytes", ["ip"]), "node_gpus_available": Gauge("node_gpus_available", "Total GPUs available on a ray node", "percentage", ["ip"]), "node_gpus_utilization": Gauge("node_gpus_utilization", "Total GPUs usage on a ray node",