コード例 #1
0
    def setUp(self):
        """Create the ZombieCollector under test with a per-run unique name."""
        # Prometheus rejects duplicate metric names, and metric names are
        # derived from the collector name, so a timestamp suffix keeps the
        # name (and therefore the metrics) distinct between tests.
        unique_suffix = str(time.time()).replace(".", "_")
        collector_name = "test_zombie_collector" + unique_suffix

        # Only the second element of the returned pair is kept.
        _unused, self.collector = collector.instantiate_collector(
            collector_name,
            0.5,
            collector.ZombieCollector,
            collector.AtomicRef(),
            collector.AtomicRef())
コード例 #2
0
def main(args):
    """Create every metrics collector and expose the results over HTTP.

    Reads ``args.interval``, ``args.threshold``, ``args.interface`` and
    ``args.port``. After registering the collectors it listens on
    ``args.port``, serving the ``metrics`` and ``healthz`` resources, and
    then runs the reactor.
    """
    register_stack_trace_dump()
    burninate_gc_collector()
    config_environ()

    gpu_count = get_gpu_count("/gpu-config/gpu-configuration.json")
    configured_gpu_counter.set(gpu_count)

    interval = args.interval
    # Handed to every AtomicRef and collector below: two collection intervals.
    decay_time = datetime.timedelta(seconds=interval * 2)

    # Shared between NpuCollector and ContainerCollector.
    npu_info_ref = collector.AtomicRef(decay_time)
    # Shared between GpuCollector and ContainerCollector.
    nvidia_info_ref = collector.AtomicRef(decay_time)
    # Docker stats shared between ContainerCollector and ZombieCollector.
    stats_info_ref = collector.AtomicRef(decay_time)
    # Zombie info shared between NpuCollector, GpuCollector and ZombieCollector.
    zombie_info_ref = collector.AtomicRef(decay_time)
    # DCGM info shared between DCGMCollector and ContainerCollector.
    dcgm_info_ref = collector.AtomicRef(decay_time)

    # Every collector except container_collector spends little time calling
    # external commands, so each one sleeps the full interval to stay aligned
    # with the Prometheus scrape interval. The container_collector loop is
    # slow (99th-percentile latency around 20s), so it sleeps 18s less, but
    # never a negative amount.
    container_interval = max(0, interval - 18)

    collector_specs = [
        ("npu_collector", interval, decay_time, collector.NpuCollector,
         npu_info_ref, zombie_info_ref, args.threshold),
        ("docker_daemon_collector", interval, decay_time,
         collector.DockerCollector),
        ("gpu_collector", interval, decay_time, collector.GpuCollector,
         nvidia_info_ref, zombie_info_ref, args.threshold),
        ("container_collector", container_interval, decay_time,
         collector.ContainerCollector, nvidia_info_ref, stats_info_ref,
         args.interface, npu_info_ref, dcgm_info_ref),
        ("zombie_collector", interval, decay_time, collector.ZombieCollector,
         stats_info_ref, zombie_info_ref),
        ("process_collector", interval, decay_time,
         collector.ProcessCollector),
        ("dcgm_collector", interval, decay_time, collector.DCGMCollector,
         dcgm_info_ref),
    ]

    # One make_collector call per spec, in list order.
    refs = [collector.make_collector(*spec) for spec in collector_specs]

    REGISTRY.register(CustomCollector(refs))

    root = Resource()
    root.putChild(b"metrics", MetricsResource())
    root.putChild(b"healthz", HealthResource())
    site = Site(root)
    reactor.listenTCP(int(args.port), site)
    reactor.run()
コード例 #3
0
def main(args):
    """Create every metrics collector and expose the results over HTTP.

    Reads ``args.log``, ``args.interval``, ``args.interface`` and
    ``args.port``. Old ``.prom`` files under ``args.log`` are removed first;
    then the collectors are registered and the ``metrics`` and ``healthz``
    resources are served on ``args.port``.
    """
    register_stack_trace_dump()
    burninate_gc_collector()
    config_environ()

    # Remove the .prom files that may be left under the log directory.
    for prom_name in ("gpu_exporter.prom", "job_exporter.prom", "docker.prom",
                      "time.prom", "configured_gpu.prom"):
        try_remove_old_prom_file(args.log + "/" + prom_name)

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # GPU info shared between GpuCollector and ContainerCollector.
    gpu_info_ref = collector.AtomicRef()

    # Docker stats shared between ContainerCollector and ZombieCollector.
    stats_info_ref = collector.AtomicRef()

    interval = args.interval
    # Collectors other than container_collector spend little time calling
    # external commands, so they sleep (a fraction of) the full interval to
    # stay aligned with the Prometheus scrape interval. The
    # container_collector loop is slow (99th-percentile latency around 20s),
    # so it sleeps 18s less. Clamp at zero: without the clamp an interval
    # shorter than 18s would hand the collector a negative sleep time.
    container_interval = max(0, interval - 18)

    collector_args = [
        ("docker_daemon_collector", interval, collector.DockerCollector),
        # gpu_collector is given half the interval.
        ("gpu_collector", interval / 2, collector.GpuCollector, gpu_info_ref),
        ("container_collector", container_interval,
         collector.ContainerCollector, gpu_info_ref, stats_info_ref,
         args.interface),
        ("zombie_collector", interval, collector.ZombieCollector,
         stats_info_ref),
    ]

    # One make_collector call per entry, in list order.
    refs = [collector.make_collector(*spec) for spec in collector_args]

    REGISTRY.register(CustomCollector(refs))

    root = Resource()
    root.putChild(b"metrics", MetricsResource())
    root.putChild(b"healthz", HealthResource())
    factory = Site(root)
    reactor.listenTCP(int(args.port), factory)
    reactor.run()
コード例 #4
0
def main(args):
    """Create every metrics collector and serve the registry via WSGI.

    Reads ``args.log``, ``args.interval``, ``args.interface`` and
    ``args.port``. Old ``.prom`` files under ``args.log`` are removed first;
    then the collectors are registered and a WSGI server for ``REGISTRY`` is
    started on ``args.port`` via ``serve_forever()``.
    """
    config_environ()

    # Remove the .prom files that may be left under the log directory.
    for prom_name in ("gpu_exporter.prom", "job_exporter.prom", "docker.prom",
                      "time.prom", "configured_gpu.prom"):
        try_remove_old_prom_file(args.log + "/" + prom_name)

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # GPU info shared between GpuCollector and ContainerCollector.
    gpu_info_ref = collector.AtomicRef()

    # Docker stats shared between ContainerCollector and ZombieCollector.
    stats_info_ref = collector.AtomicRef()

    interval = args.interval
    # Collectors other than container_collector spend little time calling
    # external commands, so they sleep the full interval to stay aligned with
    # the Prometheus scrape interval. The container_collector loop is slow
    # (99th-percentile latency around 20s), so it sleeps 18s less. Clamp at
    # zero: without the clamp an interval shorter than 18s would hand the
    # collector a negative sleep time.
    container_interval = max(0, interval - 18)

    collector_args = [
        ("docker_daemon_collector", interval, collector.DockerCollector),
        ("gpu_collector", interval, collector.GpuCollector, gpu_info_ref),
        ("container_collector", container_interval,
         collector.ContainerCollector, gpu_info_ref, stats_info_ref,
         args.interface),
        ("zombie_collector", interval, collector.ZombieCollector,
         stats_info_ref),
    ]

    # One make_collector call per entry, in list order.
    refs = [collector.make_collector(*spec) for spec in collector_args]

    REGISTRY.register(CustomCollector(refs))

    # An empty host string is passed to make_server along with the port.
    app = make_wsgi_app(REGISTRY)
    httpd = make_server("", int(args.port), app)
    httpd.serve_forever()
コード例 #5
0
    def test_expiration(self):
        """Check AtomicRef.get at timestamps around a 10-second decay window.

        Uses assertEqual/assertIsNone rather than the deprecated assertEquals
        alias, which no longer exists in Python 3.12.
        """
        ref = collector.AtomicRef(datetime.timedelta(seconds=10))

        now = datetime.datetime.now()
        delta = datetime.timedelta(seconds=1)

        ref.set(1, now)

        # The value must be readable at, before, and up to exactly ten
        # seconds after the timestamp it was set with.
        self.assertEqual(1, ref.get(now))
        self.assertEqual(1, ref.get(now - delta))
        self.assertEqual(1, ref.get(now + delta))
        self.assertEqual(1, ref.get(now + delta * 10))
        # One second past the window, the read yields None ...
        self.assertIsNone(ref.get(now + delta * 11))
        # ... yet a later read that falls inside the window still succeeds.
        self.assertEqual(1, ref.get(now + delta * 10))

        # Setting again one second later shifts the window by one second.
        ref.set(2, now + delta)
        self.assertEqual(2, ref.get(now))
        self.assertEqual(2, ref.get(now + delta * 10))
        self.assertEqual(2, ref.get(now + delta * 11))
        self.assertIsNone(ref.get(now + delta * 12))
コード例 #6
0
        format=
        "%(asctime)s - %(levelname)s - %(threadName)s - %(filename)s:%(lineno)s - %(message)s",
        level="DEBUG")

    import collector
    import datetime

    from prometheus_client import Histogram

    # Manual run: build a DCGMHandler, run it, then log what the two refs
    # hold ten times, two seconds apart.
    cmd_histogram = Histogram("cmd_dcgmi_latency_seconds",
                              "Command call latency for nvidia-smi (seconds)",
                              buckets=(1.0, 2.0, 4.0, 8.0, 16.0, 32.0,
                                       64.0, 128.0, 256.0, 512.0, 1024.0,
                                       float("inf")))

    gauge_ref = collector.AtomicRef(datetime.timedelta(seconds=60))
    metric_ref = collector.AtomicRef(datetime.timedelta(seconds=60))

    # Hand the handler the local gauge_ref created above (previously
    # `self.gauge_ref`), so it receives the same object the loop below polls.
    dcgm_handler = dcgm.DCGMHandler(1, gauge_ref, metric_ref,
                                    cmd_histogram, 600)
    # NOTE(review): run() is called directly; if DCGMHandler is a
    # threading.Thread this executes in the calling thread -- confirm that
    # start() was not intended.
    dcgm_handler.run()

    for _ in range(10):
        now = datetime.datetime.now()

        gauge = gauge_ref.get(now)
        metric = metric_ref.get(now)

        logger.info("gauge is %s", gauge)
        logger.info("metric is %s", metric)
        time.sleep(2)
コード例 #7
0
            return metric, gauge
        except:
            logger.exception("getting ipoib metric failed")
            return metric, gauge


if __name__ == '__main__':
    logging.basicConfig(
        format=
        "%(asctime)s - %(levelname)s - %(threadName)s - %(filename)s:%(lineno)s - %(message)s",
        level="DEBUG")

    import collector
    import datetime

    infiniband_gauge_ref = collector.AtomicRef(datetime.timedelta(seconds=60))
    infiniband_metric_ref = collector.AtomicRef(datetime.timedelta(seconds=60))

    infiniband_handler = InfinibandHandler(1, infiniband_gauge_ref,
                                           infiniband_metric_ref)
    infiniband_handler.start()

    ipoib_gauge_ref = collector.AtomicRef(datetime.timedelta(seconds=60))
    ipoib_metric_ref = collector.AtomicRef(datetime.timedelta(seconds=60))

    ipoib_handler = IPoIBHandler(1, ipoib_gauge_ref, ipoib_metric_ref)
    ipoib_handler.start()

    for _ in range(10):
        now = datetime.datetime.now()