Example #1
0
    def _record_stats(stats):
        """Translate one node's raw stats into a list of metric records.

        Args:
            stats: Mapping read with the following keys:
                "ip": value used as the "ip" tag on every record.
                "cpu": value convertible to float (CPU utilization).
                "cpus": two-item sequence; the first item is reported as
                    the CPU count.
                "mem": four-item sequence unpacked as
                    (total, available, <ignored>, used).
                "gpus": sized iterable of mappings with the keys
                    "utilization_gpu", "memory_used" and "memory_total".
                "disk": mapping whose values expose ``used`` and ``free``.
                "network": indexable; [0] is sent, [1] is received.
                "network_speed": indexable; [0] is send, [1] is receive.
                "raylet": falsy, or a mapping with "pid", "cpu_percent"
                    and "memory_info" (an object exposing ``rss``).

        Returns:
            A list of ``Record`` objects. GPU records (only when at least
            one GPU is listed) come first, then raylet records (only when
            raylet stats are present), then the per-node CPU, memory, disk
            and network records.
        """
        ip = stats["ip"]

        def node_record(gauge_name, value, extra_tags=None):
            # Every record emitted here is tagged with the node ip.
            tags = {"ip": ip}
            if extra_tags:
                tags.update(extra_tags)
            return Record(
                gauge=METRICS_GAUGES[gauge_name], value=value, tags=tags)

        records_reported = []
        # Records that are always emitted; appended last to keep the
        # original output order (GPU, raylet, then these).
        node_records = []

        # -- CPU per node --
        node_records.append(
            node_record("node_cpu_utilization", float(stats["cpu"])))
        cpu_count, _ = stats["cpus"]
        node_records.append(node_record("node_cpu_count", cpu_count))

        # -- Mem per node --
        mem_total, mem_available, _, mem_used = stats["mem"]
        node_records.append(node_record("node_mem_used", mem_used))
        node_records.append(node_record("node_mem_available", mem_available))
        node_records.append(node_record("node_mem_total", mem_total))

        # -- GPU per node --
        gpus = stats["gpus"]
        gpus_available = len(gpus)

        if gpus_available:
            # Utilization and memory are summed over all GPUs of the node.
            gpus_utilization, gram_used, gram_total = 0, 0, 0
            for gpu in gpus:
                gpus_utilization += gpu["utilization_gpu"]
                gram_used += gpu["memory_used"]
                gram_total += gpu["memory_total"]

            gram_available = gram_total - gram_used

            records_reported.extend([
                node_record("node_gpus_available", gpus_available),
                node_record("node_gpus_utilization", gpus_utilization),
                node_record("node_gram_used", gram_used),
                node_record("node_gram_available", gram_available),
            ])

        # -- Disk per node --
        used, free = 0, 0
        for entry in stats["disk"].values():
            used += entry.used
            free += entry.free
        disk_total = used + free
        # With no disk entries (or only zero-sized ones) the ratio is
        # undefined; report 0.0 rather than raising ZeroDivisionError and
        # losing every other metric of this node.
        disk_utilization = (
            float(used / disk_total) * 100 if disk_total else 0.0)
        node_records.append(node_record("node_disk_usage", used))
        node_records.append(
            node_record("node_disk_utilization_percentage",
                        disk_utilization))

        # -- Network (sent/received) stats per node --
        network_stats = stats["network"]
        node_records.append(
            node_record("node_network_sent", network_stats[0]))
        node_records.append(
            node_record("node_network_received", network_stats[1]))

        # -- Network speed (send/receive) per node --
        network_speed_stats = stats["network_speed"]
        node_records.append(
            node_record("node_network_send_speed", network_speed_stats[0]))
        node_records.append(
            node_record("node_network_receive_speed",
                        network_speed_stats[1]))

        raylet_stats = stats["raylet"]
        if raylet_stats:
            pid_tag = {"pid": str(raylet_stats["pid"])}
            # -- raylet CPU --
            # NOTE(review): the value is scaled by 100 here; confirm the
            # upstream "cpu_percent" is a fraction and not already percent.
            raylet_cpu_usage = float(raylet_stats["cpu_percent"]) * 100
            raylet_cpu_record = node_record(
                "raylet_cpu", raylet_cpu_usage, pid_tag)

            # -- raylet mem --
            # rss is divided by 1e6 (presumably bytes -> MB; confirm).
            raylet_mem_usage = float(raylet_stats["memory_info"].rss) / 1e6
            raylet_mem_record = node_record(
                "raylet_mem", raylet_mem_usage, pid_tag)
            records_reported.extend([raylet_cpu_record, raylet_mem_record])

        records_reported.extend(node_records)
        return records_reported
Example #2
0
    def _record_stats(self, stats, cluster_stats):
        """Translate node and cluster stats into a list of metric records.

        Args:
            stats: Mapping read with the following keys:
                "ip": value used as the "ip" tag on every node record.
                "cpu": value convertible to float (CPU utilization).
                "cpus": two-item sequence; the first item is reported as
                    the CPU count.
                "mem": four-item sequence unpacked as
                    (total, available, <ignored>, used).
                "gpus": sized iterable of mappings with the keys
                    "utilization_gpu", "memory_used" and "memory_total".
                "disk": mapping whose values expose ``used`` and ``free``.
                "network": indexable; [0] is sent, [1] is received.
                "network_speed": indexable; [0] is send, [1] is receive.
                "raylet": falsy, or a mapping with "pid", "cpu_percent"
                    and "memory_info" (an object exposing ``rss``).
            cluster_stats: Mapping. When it contains "autoscaler_report"
                and ``self._is_head_node`` is truthy, that report's
                "active_nodes" (mapping of node type to count),
                "failed_nodes" (iterable of 2-item entries whose second
                item is the node type) and "pending_nodes" (iterable of
                3-item entries whose second item is the node type) are
                reported per node type.

        Returns:
            A list of ``Record`` objects: cluster records (if reported),
            GPU records (only when at least one GPU is listed), raylet
            records (only when raylet stats are present), then the
            per-node CPU, memory, disk and network records.
        """
        records_reported = []
        ip = stats["ip"]

        def node_record(gauge_name, value, extra_tags=None):
            # Every per-node record is tagged with the node ip.
            tags = {"ip": ip}
            if extra_tags:
                tags.update(extra_tags)
            return Record(
                gauge=METRICS_GAUGES[gauge_name], value=value, tags=tags)

        def cluster_records(gauge_name, counts_by_type):
            # One record per node type, tagged with that node type.
            return [
                Record(gauge=METRICS_GAUGES[gauge_name],
                       value=count,
                       tags={"node_type": node_type})
                for node_type, count in counts_by_type.items()
            ]

        # -- Instance count of cluster --
        # Only report cluster stats on head node
        if "autoscaler_report" in cluster_stats and self._is_head_node:
            autoscaler_report = cluster_stats["autoscaler_report"]

            records_reported.extend(
                cluster_records("cluster_active_nodes",
                                autoscaler_report["active_nodes"]))

            # Failed nodes arrive as (ip, node_type); count per type.
            failed_nodes_dict = {}
            for _, node_type in autoscaler_report["failed_nodes"]:
                failed_nodes_dict[node_type] = (
                    failed_nodes_dict.get(node_type, 0) + 1)
            records_reported.extend(
                cluster_records("cluster_failed_nodes", failed_nodes_dict))

            # Pending nodes arrive as (ip, node_type, status_message);
            # count per type.
            pending_nodes_dict = {}
            for _, node_type, _ in autoscaler_report["pending_nodes"]:
                pending_nodes_dict[node_type] = (
                    pending_nodes_dict.get(node_type, 0) + 1)
            records_reported.extend(
                cluster_records("cluster_pending_nodes", pending_nodes_dict))

        # Records that are always emitted; appended last to keep the
        # original output order (cluster, GPU, raylet, then these).
        node_records = []

        # -- CPU per node --
        node_records.append(
            node_record("node_cpu_utilization", float(stats["cpu"])))
        cpu_count, _ = stats["cpus"]
        node_records.append(node_record("node_cpu_count", cpu_count))

        # -- Mem per node --
        mem_total, mem_available, _, mem_used = stats["mem"]
        node_records.append(node_record("node_mem_used", mem_used))
        node_records.append(node_record("node_mem_available", mem_available))
        node_records.append(node_record("node_mem_total", mem_total))

        # -- GPU per node --
        gpus = stats["gpus"]
        gpus_available = len(gpus)

        if gpus_available:
            # Utilization and memory are summed over all GPUs of the node.
            gpus_utilization, gram_used, gram_total = 0, 0, 0
            for gpu in gpus:
                gpus_utilization += gpu["utilization_gpu"]
                gram_used += gpu["memory_used"]
                gram_total += gpu["memory_total"]

            gram_available = gram_total - gram_used

            records_reported.extend([
                node_record("node_gpus_available", gpus_available),
                node_record("node_gpus_utilization", gpus_utilization),
                node_record("node_gram_used", gram_used),
                node_record("node_gram_available", gram_available),
            ])

        # -- Disk per node --
        used, free = 0, 0
        for entry in stats["disk"].values():
            used += entry.used
            free += entry.free
        disk_total = used + free
        # With no disk entries (or only zero-sized ones) the ratio is
        # undefined; report 0.0 rather than raising ZeroDivisionError and
        # losing every other metric of this node.
        disk_utilization = (
            float(used / disk_total) * 100 if disk_total else 0.0)
        node_records.append(node_record("node_disk_usage", used))
        node_records.append(node_record("node_disk_free", free))
        node_records.append(
            node_record("node_disk_utilization_percentage",
                        disk_utilization))

        # -- Network (sent/received) stats per node --
        network_stats = stats["network"]
        node_records.append(
            node_record("node_network_sent", network_stats[0]))
        node_records.append(
            node_record("node_network_received", network_stats[1]))

        # -- Network speed (send/receive) per node --
        network_speed_stats = stats["network_speed"]
        node_records.append(
            node_record("node_network_send_speed", network_speed_stats[0]))
        node_records.append(
            node_record("node_network_receive_speed",
                        network_speed_stats[1]))

        raylet_stats = stats["raylet"]
        if raylet_stats:
            pid_tag = {"pid": str(raylet_stats["pid"])}
            # -- raylet CPU --
            # NOTE(review): the value is scaled by 100 here; confirm the
            # upstream "cpu_percent" is a fraction and not already percent.
            raylet_cpu_usage = float(raylet_stats["cpu_percent"]) * 100
            raylet_cpu_record = node_record(
                "raylet_cpu", raylet_cpu_usage, pid_tag)

            # -- raylet mem --
            # rss is divided by 1e6 (presumably bytes -> MB; confirm).
            raylet_mem_usage = float(raylet_stats["memory_info"].rss) / 1e6
            raylet_mem_record = node_record(
                "raylet_mem", raylet_mem_usage, pid_tag)
            records_reported.extend([raylet_cpu_record, raylet_mem_record])

        records_reported.extend(node_records)
        return records_reported