def _record_stats(stats):
    """Convert one node's stats snapshot into a list of gauge records.

    Args:
        stats: Mapping providing the keys read below: "ip", "cpu", "cpus"
            (2-item sequence), "mem" (4-item sequence), "gpus" (sized
            iterable of mappings), "disk" (mapping whose values expose
            ``.used`` and ``.free``), "network", "network_speed" (both
            indexable by 0 and 1) and "raylet" (mapping or a falsy value).

    Returns:
        A list of ``Record`` objects ordered as: GPU records (only when at
        least one GPU is listed), raylet records (only when raylet stats
        are present), then the node-level CPU, memory, disk and network
        records.
    """
    ip = stats["ip"]

    def node_record(gauge_name, value):
        # Every node-level gauge carries the node IP as its only tag.
        return Record(
            gauge=METRICS_GAUGES[gauge_name], value=value, tags={"ip": ip})

    records_reported = []

    # -- CPU per node --
    cpu_usage = float(stats["cpu"])
    cpu_count, _ = stats["cpus"]

    # -- Mem per node --
    mem_total, mem_available, _, mem_used = stats["mem"]

    # -- GPU per node --
    gpus = stats["gpus"]
    gpus_available = len(gpus)
    if gpus_available:
        # Utilization and memory figures are summed over all GPUs.
        gpus_utilization, gram_used, gram_total = 0, 0, 0
        for gpu in gpus:
            gpus_utilization += gpu["utilization_gpu"]
            gram_used += gpu["memory_used"]
            gram_total += gpu["memory_total"]
        records_reported.extend([
            node_record("node_gpus_available", gpus_available),
            node_record("node_gpus_utilization", gpus_utilization),
            node_record("node_gram_used", gram_used),
            node_record("node_gram_available", gram_total - gram_used),
        ])

    # -- Disk per node --
    used, free = 0, 0
    for entry in stats["disk"].values():
        used += entry.used
        free += entry.free
    disk_total = used + free
    # Guard against an empty disk mapping (or all-zero entries), which
    # would otherwise raise ZeroDivisionError; report 0% in that case.
    if disk_total:
        disk_utilization = float(used / disk_total) * 100
    else:
        disk_utilization = 0.0

    # -- Network totals and speed (send/receive) per node --
    network_stats = stats["network"]
    network_speed_stats = stats["network_speed"]

    raylet_stats = stats["raylet"]
    if raylet_stats:
        raylet_tags = {"ip": ip, "pid": str(raylet_stats["pid"])}
        # -- raylet CPU --
        # NOTE(review): cpu_percent is scaled by 100 here; confirm the
        # unit produced upstream.
        raylet_cpu_usage = float(raylet_stats["cpu_percent"]) * 100
        # -- raylet mem --
        # rss divided by 1e6 (presumably bytes -> MB; verify upstream).
        raylet_mem_usage = float(raylet_stats["memory_info"].rss) / 1e6
        records_reported.extend([
            Record(
                gauge=METRICS_GAUGES["raylet_cpu"],
                value=raylet_cpu_usage,
                tags=dict(raylet_tags)),
            Record(
                gauge=METRICS_GAUGES["raylet_mem"],
                value=raylet_mem_usage,
                tags=dict(raylet_tags)),
        ])

    records_reported.extend([
        node_record("node_cpu_utilization", cpu_usage),
        node_record("node_cpu_count", cpu_count),
        node_record("node_mem_used", mem_used),
        node_record("node_mem_available", mem_available),
        node_record("node_mem_total", mem_total),
        node_record("node_disk_usage", used),
        node_record("node_disk_utilization_percentage", disk_utilization),
        node_record("node_network_sent", network_stats[0]),
        node_record("node_network_received", network_stats[1]),
        node_record("node_network_send_speed", network_speed_stats[0]),
        node_record("node_network_receive_speed", network_speed_stats[1]),
    ])
    return records_reported
def _record_stats(self, stats, cluster_stats):
    """Convert node and cluster stats into a list of gauge records.

    Args:
        stats: Mapping providing the keys read below: "ip", "cpu", "cpus"
            (2-item sequence), "mem" (4-item sequence), "gpus" (sized
            iterable of mappings), "disk" (mapping whose values expose
            ``.used`` and ``.free``), "network", "network_speed" (both
            indexable by 0 and 1) and "raylet" (mapping or a falsy value).
        cluster_stats: Mapping that may contain "autoscaler_report" with
            "active_nodes" (mapping of node type to count),
            "failed_nodes" (iterable of 2-tuples whose second item is the
            node type) and "pending_nodes" (iterable of 3-tuples whose
            second item is the node type).

    Returns:
        A list of ``Record`` objects ordered as: cluster node-count
        records (only when an autoscaler report is present and
        ``self._is_head_node`` is truthy), GPU records (only when at least
        one GPU is listed), raylet records (only when raylet stats are
        present), then the node-level CPU, memory, disk and network
        records.
    """
    # Function-scope import: the file's import block is not visible here.
    from collections import Counter

    ip = stats["ip"]

    def node_record(gauge_name, value):
        # Every node-level gauge carries the node IP as its only tag.
        return Record(
            gauge=METRICS_GAUGES[gauge_name], value=value, tags={"ip": ip})

    def per_type_records(gauge_name, counts):
        # One record per node type, tagged with that node type.
        return [
            Record(
                gauge=METRICS_GAUGES[gauge_name],
                value=count,
                tags={"node_type": node_type})
            for node_type, count in counts.items()
        ]

    records_reported = []

    # -- Instance count of cluster --
    # Only report cluster stats on head node
    if "autoscaler_report" in cluster_stats and self._is_head_node:
        report = cluster_stats["autoscaler_report"]
        records_reported.extend(
            per_type_records("cluster_active_nodes", report["active_nodes"]))

        # Tally failed nodes per node type (Counter keeps first-seen order).
        failed_counts = Counter(
            node_type for _node_ip, node_type in report["failed_nodes"])
        records_reported.extend(
            per_type_records("cluster_failed_nodes", failed_counts))

        # Tally pending nodes per node type.
        pending_counts = Counter(
            node_type
            for _node_ip, node_type, _status in report["pending_nodes"])
        records_reported.extend(
            per_type_records("cluster_pending_nodes", pending_counts))

    # -- CPU per node --
    cpu_usage = float(stats["cpu"])
    cpu_count, _ = stats["cpus"]

    # -- Mem per node --
    mem_total, mem_available, _, mem_used = stats["mem"]

    # -- GPU per node --
    gpus = stats["gpus"]
    gpus_available = len(gpus)
    if gpus_available:
        # Utilization and memory figures are summed over all GPUs.
        gpus_utilization, gram_used, gram_total = 0, 0, 0
        for gpu in gpus:
            gpus_utilization += gpu["utilization_gpu"]
            gram_used += gpu["memory_used"]
            gram_total += gpu["memory_total"]
        records_reported.extend([
            node_record("node_gpus_available", gpus_available),
            node_record("node_gpus_utilization", gpus_utilization),
            node_record("node_gram_used", gram_used),
            node_record("node_gram_available", gram_total - gram_used),
        ])

    # -- Disk per node --
    used, free = 0, 0
    for entry in stats["disk"].values():
        used += entry.used
        free += entry.free
    disk_total = used + free
    # Guard against an empty disk mapping (or all-zero entries), which
    # would otherwise raise ZeroDivisionError; report 0% in that case.
    if disk_total:
        disk_utilization = float(used / disk_total) * 100
    else:
        disk_utilization = 0.0

    # -- Network totals and speed (send/receive) per node --
    network_stats = stats["network"]
    network_speed_stats = stats["network_speed"]

    raylet_stats = stats["raylet"]
    if raylet_stats:
        raylet_tags = {"ip": ip, "pid": str(raylet_stats["pid"])}
        # -- raylet CPU --
        # NOTE(review): cpu_percent is scaled by 100 here; confirm the
        # unit produced upstream.
        raylet_cpu_usage = float(raylet_stats["cpu_percent"]) * 100
        # -- raylet mem --
        # rss divided by 1e6 (presumably bytes -> MB; verify upstream).
        raylet_mem_usage = float(raylet_stats["memory_info"].rss) / 1e6
        records_reported.extend([
            Record(
                gauge=METRICS_GAUGES["raylet_cpu"],
                value=raylet_cpu_usage,
                tags=dict(raylet_tags)),
            Record(
                gauge=METRICS_GAUGES["raylet_mem"],
                value=raylet_mem_usage,
                tags=dict(raylet_tags)),
        ])

    records_reported.extend([
        node_record("node_cpu_utilization", cpu_usage),
        node_record("node_cpu_count", cpu_count),
        node_record("node_mem_used", mem_used),
        node_record("node_mem_available", mem_available),
        node_record("node_mem_total", mem_total),
        node_record("node_disk_usage", used),
        node_record("node_disk_free", free),
        node_record("node_disk_utilization_percentage", disk_utilization),
        node_record("node_network_sent", network_stats[0]),
        node_record("node_network_received", network_stats[1]),
        node_record("node_network_send_speed", network_speed_stats[0]),
        node_record("node_network_receive_speed", network_speed_stats[1]),
    ])
    return records_reported