Example #1
0
def main(args):
    """Exporter entry point: wire up all metric collectors and serve
    /metrics and /healthz over Twisted on args.port."""
    register_stack_trace_dump()
    burninate_gc_collector()
    config_environ()

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # Metrics older than two scrape intervals are considered stale.
    stale_after = datetime.timedelta(seconds=args.interval * 2)

    # shared slot: gpu info flows from GpuCollector to ContainerCollector
    npu_ref = collector.AtomicRef(stale_after)
    nvidia_ref = collector.AtomicRef(stale_after)

    # shared slot: docker stats flow from ContainerCollector to ZombieCollector
    stats_ref = collector.AtomicRef(stale_after)

    # shared slot: zombie info flows from GpuCollector to ZombieCollector
    zombie_ref = collector.AtomicRef(stale_after)

    # shared slot: dcgm info flows from DCGMCollector to ContainerCollector
    dcgm_ref = collector.AtomicRef(stale_after)

    interval = args.interval
    # Every collector except container_collector spends little time calling
    # external commands, so they sleep the full interval to align with the
    # prometheus scrape interval. container_collector's loop has a 99th
    # percentile latency around 20s, so its sleep is shortened accordingly
    # (clamped at zero).
    specs = [
        ("npu_collector", interval, stale_after, collector.NpuCollector,
         npu_ref, zombie_ref, args.threshold),
        ("docker_daemon_collector", interval, stale_after,
         collector.DockerCollector),
        ("gpu_collector", interval, stale_after, collector.GpuCollector,
         nvidia_ref, zombie_ref, args.threshold),
        ("container_collector", max(0, interval - 18), stale_after,
         collector.ContainerCollector, nvidia_ref, stats_ref,
         args.interface, npu_ref, dcgm_ref),
        ("zombie_collector", interval, stale_after, collector.ZombieCollector,
         stats_ref, zombie_ref),
        ("process_collector", interval, stale_after,
         collector.ProcessCollector),
        ("dcgm_collector", interval, stale_after, collector.DCGMCollector,
         dcgm_ref),
    ]

    refs = [collector.make_collector(*spec) for spec in specs]

    REGISTRY.register(CustomCollector(refs))

    root = Resource()
    root.putChild(b"metrics", MetricsResource())
    root.putChild(b"healthz", HealthResource())
    site = Site(root)
    reactor.listenTCP(int(args.port), site)
    reactor.run()
Example #2
0
    def test_base_collector(self):
        """Spin up a real DockerCollector thread and poll its ref for
        metrics; being multi-threaded, this test may be sensitive to
        system load."""
        ref = collector.make_collector("test_docker_collector2", 0.5,
                                       collector.DockerCollector)

        metrics = None
        attempts = 0
        # Poll up to 10 times, pausing 100ms after each miss, to give the
        # collector thread time to produce its first metrics.
        while attempts < 10:
            metrics = ref.get()
            if metrics is not None:
                break
            time.sleep(0.1)
            attempts += 1

        self.assert_metrics(metrics)
Example #3
0
def main(args):
    """Exporter entry point: clean up stale .prom files, wire up the
    collectors, and serve /metrics and /healthz over Twisted on args.port."""
    register_stack_trace_dump()
    burninate_gc_collector()
    config_environ()

    # Remove leftover textfile-collector output from previous runs so stale
    # values are not scraped.
    for prom_name in ("gpu_exporter.prom", "job_exporter.prom",
                      "docker.prom", "time.prom", "configured_gpu.prom"):
        try_remove_old_prom_file(args.log + "/" + prom_name)

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # used to exchange gpu info between GpuCollector and ContainerCollector
    gpu_info_ref = collector.AtomicRef()

    # used to exchange docker stats info between ContainerCollector and ZombieCollector
    stats_info_ref = collector.AtomicRef()

    interval = args.interval
    # Because all collector except container_collector will spent little time in calling
    # external command to get metrics, so they need to sleep 30s to align with prometheus
    # scrape interval. The 99th latency of container_collector loop is around 20s, so it
    # should only sleep 10s to adapt to scrape interval.
    # Clamp the container_collector sleep at 0 so a small --interval
    # (e.g. 10) does not produce a negative sleep duration.
    collector_args = [
        ("docker_daemon_collector", interval, collector.DockerCollector),
        ("gpu_collector", interval / 2, collector.GpuCollector, gpu_info_ref),
        ("container_collector", max(0, interval - 18),
         collector.ContainerCollector,
         gpu_info_ref, stats_info_ref, args.interface),
        ("zombie_collector", interval, collector.ZombieCollector,
         stats_info_ref),
    ]

    refs = list(map(lambda x: collector.make_collector(*x), collector_args))

    REGISTRY.register(CustomCollector(refs))

    root = Resource()
    root.putChild(b"metrics", MetricsResource())
    root.putChild(b"healthz", HealthResource())
    factory = Site(root)
    reactor.listenTCP(int(args.port), factory)
    reactor.run()
Example #4
0
def main(args):
    """Exporter entry point: clean up stale .prom files, wire up the
    collectors, and serve prometheus metrics over WSGI on args.port."""
    config_environ()

    # Remove leftover textfile-collector output from previous runs so stale
    # values are not scraped.
    for prom_name in ("gpu_exporter.prom", "job_exporter.prom",
                      "docker.prom", "time.prom", "configured_gpu.prom"):
        try_remove_old_prom_file(args.log + "/" + prom_name)

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # used to exchange gpu info between GpuCollector and ContainerCollector
    gpu_info_ref = collector.AtomicRef()

    # used to exchange docker stats info between ContainerCollector and ZombieCollector
    stats_info_ref = collector.AtomicRef()

    interval = args.interval
    # Because all collector except container_collector will spent little time in calling
    # external command to get metrics, so they need to sleep 30s to align with prometheus
    # scrape interval. The 99th latency of container_collector loop is around 20s, so it
    # should only sleep 10s to adapt to scrape interval.
    # Clamp the container_collector sleep at 0 so a small --interval
    # (e.g. 10) does not produce a negative sleep duration.
    collector_args = [
        ("docker_daemon_collector", interval, collector.DockerCollector),
        ("gpu_collector", interval, collector.GpuCollector, gpu_info_ref),
        ("container_collector", max(0, interval - 18),
         collector.ContainerCollector,
         gpu_info_ref, stats_info_ref, args.interface),
        ("zombie_collector", interval, collector.ZombieCollector,
         stats_info_ref),
    ]

    refs = list(map(lambda x: collector.make_collector(*x), collector_args))

    REGISTRY.register(CustomCollector(refs))

    app = make_wsgi_app(REGISTRY)
    httpd = make_server("", int(args.port), app)
    httpd.serve_forever()