def setUp(self):
    """Build a fresh ZombieCollector for each test case.

    Prometheus forbids registering two metrics with the same name, and
    metric names here are derived from the collector name, so a
    timestamp-based suffix keeps every test's collector name unique.
    """
    suffix = str(time.time()).replace(".", "_")
    _, self.collector = collector.instantiate_collector(
        "test_zombie_collector" + suffix,
        0.5,
        collector.ZombieCollector,
        collector.AtomicRef(),
        collector.AtomicRef())
def main(args):
    """Entry point: start every metric collector thread and serve the
    /metrics and /healthz endpoints over HTTP via twisted.

    Args:
        args: parsed CLI arguments; uses .interval, .threshold,
              .interface and .port.
    """
    register_stack_trace_dump()
    burninate_gc_collector()
    config_environ()

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    decay_time = datetime.timedelta(seconds=args.interval * 2)

    # AtomicRefs shared between collector threads:
    # GpuCollector <-> ContainerCollector (npu info)
    npu_info_ref = collector.AtomicRef(decay_time)
    # GpuCollector <-> ContainerCollector (nvidia info)
    nvidia_info_ref = collector.AtomicRef(decay_time)
    # ContainerCollector -> ZombieCollector (docker stats)
    stats_info_ref = collector.AtomicRef(decay_time)
    # GpuCollector <-> ZombieCollector (zombie info)
    zombie_info_ref = collector.AtomicRef(decay_time)
    # DCGMCollector -> ContainerCollector (dcgm info)
    dcgm_info_ref = collector.AtomicRef(decay_time)

    interval = args.interval
    # Every collector except container_collector spends little time calling
    # external commands, so each sleeps the full interval to align with the
    # prometheus scrape interval. The container_collector loop's 99th
    # percentile latency is around 20s, so it sleeps ~18s less (clamped at
    # zero) to adapt to the scrape interval.
    collector_args = [
        ("npu_collector", interval, decay_time, collector.NpuCollector,
         npu_info_ref, zombie_info_ref, args.threshold),
        ("docker_daemon_collector", interval, decay_time,
         collector.DockerCollector),
        ("gpu_collector", interval, decay_time, collector.GpuCollector,
         nvidia_info_ref, zombie_info_ref, args.threshold),
        ("container_collector", max(0, interval - 18), decay_time,
         collector.ContainerCollector, nvidia_info_ref, stats_info_ref,
         args.interface, npu_info_ref, dcgm_info_ref),
        ("zombie_collector", interval, decay_time, collector.ZombieCollector,
         stats_info_ref, zombie_info_ref),
        ("process_collector", interval, decay_time,
         collector.ProcessCollector),
        ("dcgm_collector", interval, decay_time, collector.DCGMCollector,
         dcgm_info_ref),
    ]

    refs = [collector.make_collector(*spec) for spec in collector_args]

    REGISTRY.register(CustomCollector(refs))

    root = Resource()
    root.putChild(b"metrics", MetricsResource())
    root.putChild(b"healthz", HealthResource())
    factory = Site(root)
    reactor.listenTCP(int(args.port), factory)
    reactor.run()
def main(args):
    """Entry point: start metric collector threads and serve /metrics and
    /healthz over HTTP via twisted.

    Args:
        args: parsed CLI arguments; uses .log, .interval, .interface
              and .port.
    """
    register_stack_trace_dump()
    burninate_gc_collector()
    config_environ()

    # Clean up .prom files left behind by older textfile-collector based
    # deployments, now that metrics are served over HTTP.
    try_remove_old_prom_file(args.log + "/gpu_exporter.prom")
    try_remove_old_prom_file(args.log + "/job_exporter.prom")
    try_remove_old_prom_file(args.log + "/docker.prom")
    try_remove_old_prom_file(args.log + "/time.prom")
    try_remove_old_prom_file(args.log + "/configured_gpu.prom")

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # used to exchange gpu info between GpuCollector and ContainerCollector
    gpu_info_ref = collector.AtomicRef()
    # used to exchange docker stats info between ContainerCollector and ZombieCollector
    stats_info_ref = collector.AtomicRef()

    interval = args.interval
    # Because all collectors except container_collector spend little time
    # calling external commands to get metrics, they sleep the full interval
    # to align with the prometheus scrape interval. The 99th percentile
    # latency of the container_collector loop is around 20s, so it sleeps
    # ~18s less to adapt to the scrape interval.
    collector_args = [
        ("docker_daemon_collector", interval, collector.DockerCollector),
        ("gpu_collector", interval / 2, collector.GpuCollector,
         gpu_info_ref),
        # FIX: clamp at zero so a configured interval <= 18s cannot yield a
        # negative sleep interval for container_collector.
        ("container_collector", max(0, interval - 18),
         collector.ContainerCollector, gpu_info_ref, stats_info_ref,
         args.interface),
        ("zombie_collector", interval, collector.ZombieCollector,
         stats_info_ref),
    ]

    refs = list(map(lambda x: collector.make_collector(*x), collector_args))

    REGISTRY.register(CustomCollector(refs))

    root = Resource()
    root.putChild(b"metrics", MetricsResource())
    root.putChild(b"healthz", HealthResource())
    factory = Site(root)
    reactor.listenTCP(int(args.port), factory)
    reactor.run()
def main(args):
    """Entry point: start metric collector threads and serve /metrics over
    HTTP via a blocking WSGI server.

    Args:
        args: parsed CLI arguments; uses .log, .interval, .interface
              and .port.
    """
    config_environ()

    # Clean up .prom files left behind by older textfile-collector based
    # deployments, now that metrics are served over HTTP.
    try_remove_old_prom_file(args.log + "/gpu_exporter.prom")
    try_remove_old_prom_file(args.log + "/job_exporter.prom")
    try_remove_old_prom_file(args.log + "/docker.prom")
    try_remove_old_prom_file(args.log + "/time.prom")
    try_remove_old_prom_file(args.log + "/configured_gpu.prom")

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # used to exchange gpu info between GpuCollector and ContainerCollector
    gpu_info_ref = collector.AtomicRef()
    # used to exchange docker stats info between ContainerCollector and ZombieCollector
    stats_info_ref = collector.AtomicRef()

    interval = args.interval
    # Because all collectors except container_collector spend little time
    # calling external commands to get metrics, they sleep the full interval
    # to align with the prometheus scrape interval. The 99th percentile
    # latency of the container_collector loop is around 20s, so it sleeps
    # ~18s less to adapt to the scrape interval.
    collector_args = [
        ("docker_daemon_collector", interval, collector.DockerCollector),
        ("gpu_collector", interval, collector.GpuCollector, gpu_info_ref),
        # FIX: clamp at zero so a configured interval <= 18s cannot yield a
        # negative sleep interval for container_collector.
        ("container_collector", max(0, interval - 18),
         collector.ContainerCollector, gpu_info_ref, stats_info_ref,
         args.interface),
        ("zombie_collector", interval, collector.ZombieCollector,
         stats_info_ref),
    ]

    refs = list(map(lambda x: collector.make_collector(*x), collector_args))

    REGISTRY.register(CustomCollector(refs))

    app = make_wsgi_app(REGISTRY)
    httpd = make_server("", int(args.port), app)
    httpd.serve_forever()
def test_expiration(self):
    """AtomicRef with a 10s decay serves the stored value until it expires.

    Note expiry is evaluated per-call from the supplied timestamp rather
    than being destructive: a get() past the deadline returns None, yet a
    later get() with an earlier timestamp still returns the value.

    FIX: assertEquals is a deprecated alias (removed in Python 3.12);
    use assertEqual instead.
    """
    ref = collector.AtomicRef(datetime.timedelta(seconds=10))
    now = datetime.datetime.now()
    delta = datetime.timedelta(seconds=1)

    ref.set(1, now)
    self.assertEqual(1, ref.get(now))
    self.assertEqual(1, ref.get(now - delta))
    self.assertEqual(1, ref.get(now + delta))
    self.assertEqual(1, ref.get(now + delta * 10))
    # Past the 10s decay window the value is withheld...
    self.assertEqual(None, ref.get(now + delta * 11))
    # ...but an earlier timestamp still sees it (non-destructive expiry).
    self.assertEqual(1, ref.get(now + delta * 10))

    # Setting a newer value moves the expiry deadline forward.
    ref.set(2, now + delta)
    self.assertEqual(2, ref.get(now))
    self.assertEqual(2, ref.get(now + delta * 10))
    self.assertEqual(2, ref.get(now + delta * 11))
    self.assertEqual(None, ref.get(now + delta * 12))
format= "%(asctime)s - %(levelname)s - %(threadName)s - %(filename)s:%(lineno)s - %(message)s", level="DEBUG") import collector import datetime from prometheus_client import Histogram cmd_histogram = Histogram("cmd_dcgmi_latency_seconds", "Command call latency for nvidia-smi (seconds)", buckets=(1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0, float("inf"))) gauge_ref = collector.AtomicRef(datetime.timedelta(seconds=60)) metric_ref = collector.AtomicRef(datetime.timedelta(seconds=60)) dcgm_handler = dcgm.DCGMHandler(1, self.gauge_ref, metric_ref, cmd_histogram, 600) dcgm_handler.run() for _ in range(10): now = datetime.datetime.now() gauge = gauge_ref.get(now) metric = metric_ref.get(now) logger.info("gauge is %s", gauge) logger.info("metric is %s", metric) time.sleep(2)
return metric, gauge except: logger.exception("getting ipoib metric failed") return metric, gauge if __name__ == '__main__': logging.basicConfig( format= "%(asctime)s - %(levelname)s - %(threadName)s - %(filename)s:%(lineno)s - %(message)s", level="DEBUG") import collector import datetime infiniband_gauge_ref = collector.AtomicRef(datetime.timedelta(seconds=60)) infiniband_metric_ref = collector.AtomicRef(datetime.timedelta(seconds=60)) infiniband_handler = InfinibandHandler(1, infiniband_gauge_ref, infiniband_metric_ref) infiniband_handler.start() ipoib_gauge_ref = collector.AtomicRef(datetime.timedelta(seconds=60)) ipoib_metric_ref = collector.AtomicRef(datetime.timedelta(seconds=60)) ipoib_handler = IPoIBHandler(1, ipoib_gauge_ref, ipoib_metric_ref) ipoib_handler.start() for _ in range(10): now = datetime.datetime.now()