Example #1
0
def main(argv):
    """Run the job exporter loop forever, writing two metrics files.

    Every round fetches GPU info through a ``utils.Singleton`` wrapping
    ``gpu_exporter.collect_gpu_info``, writes the converted GPU metrics
    to ``<argv[0]>/gpu_exporter.prom``, gathers the ``network.iftop()``
    result, and writes the output of ``collect_job_metrics`` to
    ``<argv[0]>/job_exporter.prom``.  Any ``Exception`` raised inside a
    round is logged and the loop carries on after sleeping.

    Args:
        argv: indexable arguments; ``argv[0]`` is the output directory
            and ``argv[1]`` is the pause between rounds in seconds
            (converted with ``int``).
    """
    output_dir = argv[0]
    gpu_prom_file = output_dir + "/gpu_exporter.prom"
    job_prom_file = output_dir + "/job_exporter.prom"
    pause_seconds = int(argv[1])

    gpu_info_source = utils.Singleton(gpu_exporter.collect_gpu_info)

    # The same two recorders are handed to collect_job_metrics on every
    # round (presumably so zombie state survives between rounds --
    # confirm against ZombieRecorder).
    zombies_kind1 = ZombieRecorder()
    zombies_kind2 = ZombieRecorder()

    round_number = 0
    while True:
        try:
            banner = "job exporter running {0} iteration".format(
                str(round_number))
            logger.info(banner)
            round_number += 1

            latest_gpu_info = gpu_info_source.try_get()
            utils.export_metrics_to_file(
                gpu_prom_file,
                gpu_exporter.convert_gpu_info_to_metrics(latest_gpu_info))

            connections = network.iftop()
            logger.debug("iftop result is %s", connections)

            # join with docker stats metrics and docker inspect labels
            utils.export_metrics_to_file(
                job_prom_file,
                collect_job_metrics(latest_gpu_info, connections,
                                    zombies_kind1, zombies_kind2))
        except Exception:
            # Keep the exporter alive: log the traceback and try again
            # on the next round.
            logger.exception("exception in job exporter loop")

        time.sleep(pause_seconds)
Example #2
0
    def test_convert_gpu_info_to_metrics(self):
        """Check the metrics produced for a two-GPU info dictionary.

        Expects five metrics in total: one attached-GPU count plus a GPU
        utilization and a memory utilization metric for each of the two
        minor numbers.  Membership is checked with ``assertIn``, so the
        order of the returned metrics does not matter (this presumably
        relies on ``Metric`` defining equality -- confirm).
        """
        gpu_info = {
            '1': {'gpuUtil': u'98', 'gpuMemUtil': u'97'},
            '0': {'gpuUtil': u'100', 'gpuMemUtil': u'99'},
        }

        produced = gpu_exporter.convert_gpu_info_to_metrics(gpu_info)
        self.assertEqual(5, len(produced))

        expected = [
            Metric("nvidiasmi_attached_gpus", {}, 2),
            Metric("nvidiasmi_utilization_gpu", {"minor_number": "0"},
                   "100"),
            Metric("nvidiasmi_utilization_memory", {"minor_number": "0"},
                   "99"),
            Metric("nvidiasmi_utilization_gpu", {"minor_number": "1"},
                   "98"),
            Metric("nvidiasmi_utilization_memory", {"minor_number": "1"},
                   "97"),
        ]
        for wanted in expected:
            self.assertIn(wanted, produced)
Example #3
0
def main(argv):
    """Run the job exporter loop forever, writing GPU and job metrics.

    Every round fetches GPU info through a ``utils.Singleton`` wrapping
    ``gpu_exporter.collect_gpu_info``, writes the converted GPU metrics
    to ``<argv[0]>/gpu_exporter.prom`` and the output of
    ``collect_job_metrics`` to ``<argv[0]>/job_exporter.prom``.  Any
    ``Exception`` raised inside a round is logged and the loop carries
    on after sleeping.

    Args:
        argv: indexable arguments; ``argv[0]`` is the output directory
            and ``argv[1]`` is the pause between rounds in seconds
            (converted with ``int``).
    """
    output_dir = argv[0]
    gpu_prom_file = output_dir + "/gpu_exporter.prom"
    job_prom_file = output_dir + "/job_exporter.prom"
    pause_seconds = int(argv[1])

    gpu_info_source = utils.Singleton(gpu_exporter.collect_gpu_info)

    round_number = 0
    while True:
        try:
            banner = "job exporter running {0} iteration".format(
                str(round_number))
            logger.info(banner)
            round_number += 1

            latest_gpu_info = gpu_info_source.try_get()
            utils.export_metrics_to_file(
                gpu_prom_file,
                gpu_exporter.convert_gpu_info_to_metrics(latest_gpu_info))

            # join with docker stats metrics and docker inspect labels
            utils.export_metrics_to_file(
                job_prom_file, collect_job_metrics(latest_gpu_info))
        except Exception:
            # Keep the exporter alive: log the traceback and try again
            # on the next round.
            logger.exception("exception in job exporter loop")

        time.sleep(pause_seconds)
Example #4
0
def main(argv):
    """Run the job exporter loop forever, writing four metrics files.

    Each iteration:
      * fetches GPU info and docker daemon status through two
        ``utils.Singleton`` wrappers,
      * writes the docker status (when it is not ``None``) to
        ``<argv[0]>/docker.prom``,
      * writes converted GPU metrics to ``<argv[0]>/gpu_exporter.prom``,
      * collects the ``network.iftop()`` result and writes the output of
        ``collect_job_metrics`` to ``<argv[0]>/job_exporter.prom``,
      * always writes the elapsed iteration time to
        ``<argv[0]>/time.prom``, even when the iteration failed.

    Any ``Exception`` raised while collecting or exporting inside the
    ``try`` body is logged and the loop continues after sleeping.

    Args:
        argv: indexable arguments; ``argv[0]`` is the output directory
            and ``argv[1]`` is the sleep between iterations in seconds
            (converted with ``int``).
    """
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    docker_metrics_path = log_dir + "/docker.prom"
    time_metrics_path = log_dir + "/time.prom"
    time_sleep_s = int(argv[1])

    # Named `iteration` rather than `iter` to avoid shadowing the builtin.
    iteration = 0

    gpu_singleton = utils.Singleton(gpu_exporter.collect_gpu_info,
                                    name="gpu_singleton")
    docker_status_singleton = utils.Singleton(collect_docker_daemon_status,
                                              name="docker_singleton")

    # The same two recorders are passed to collect_job_metrics on every
    # iteration (presumably to keep zombie state across iterations --
    # confirm against ZombieRecorder).
    type1_zombies = ZombieRecorder()
    type2_zombies = ZombieRecorder()

    while True:
        start = datetime.datetime.now()
        try:
            logger.info("job exporter running %s iteration", iteration)
            iteration += 1
            gpu_infos = gpu_singleton.try_get()

            docker_status = docker_status_singleton.try_get()
            if docker_status is not None:
                utils.export_metrics_to_file(docker_metrics_path,
                                             [docker_status])

            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            all_conns = network.iftop()
            logger.debug("iftop result is %s", all_conns)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(gpu_infos, all_conns,
                                              type1_zombies, type2_zombies)
            utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception:
            # Keep the exporter alive: log the traceback and retry on the
            # next iteration.
            logger.exception("exception in job exporter loop")
        finally:
            elapsed = datetime.datetime.now() - start

            # timedelta.seconds is only the whole-seconds *component* of
            # the delta (it drops the days part and the sub-second
            # fraction); total_seconds() is the real duration.
            time_metrics = [
                Metric("job_exporter_iteration_seconds", {},
                       elapsed.total_seconds())
            ]
            utils.export_metrics_to_file(time_metrics_path, time_metrics)

        time.sleep(time_sleep_s)
Example #5
0
def main(argv):
    """Set up file logging, then run the job exporter loop forever.

    The root logger is set to INFO and given a rotating file handler
    writing to ``<argv[0]>/gpu_exporter.log`` (10 MiB per file, five
    backups).  Every round then fetches GPU info through a
    ``utils.Singleton`` wrapping ``gpu_exporter.collect_gpu_info`` and
    writes, when the respective result is not ``None``, the converted
    GPU metrics to ``<argv[0]>/gpu_exporter.prom`` and the output of
    ``collect_job_metrics`` to ``<argv[0]>/job_exporter.prom``.  Any
    ``Exception`` raised inside a round is logged and the loop carries
    on after sleeping.

    Args:
        argv: indexable arguments; ``argv[0]`` is the directory used for
            both the log file and the metrics files, ``argv[1]`` is the
            pause between rounds in seconds (converted with ``int``).
    """
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    sleep_seconds = int(argv[1])

    # Route INFO and above from the root logger into a size-capped,
    # rotating log file.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    file_handler = RotatingFileHandler(
        log_dir + "/gpu_exporter.log",
        maxBytes=1024 * 1024 * 10,
        backupCount=5)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter(
        "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s"))
    root_logger.addHandler(file_handler)

    gpu_info_source = utils.Singleton(gpu_exporter.collect_gpu_info)

    round_number = 0
    while True:
        try:
            banner = "job exporter running {0} iteration".format(
                str(round_number))
            logger.info(banner)
            round_number += 1

            latest_gpu_info = gpu_info_source.try_get()

            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(
                latest_gpu_info)
            if gpu_metrics is not None:
                utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(latest_gpu_info)
            if job_metrics is not None:
                utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception:
            # Keep the exporter alive: log the traceback and try again
            # on the next round.
            logger.exception("exception in job exporter loop")

        time.sleep(sleep_seconds)