コード例 #1
0
def main(argv):
    """Run the job exporter loop; this function never returns.

    Args:
        argv: sequence where ``argv[0]`` is the directory the ``.prom``
            files are written into and ``argv[1]`` is the pause between
            two iterations, in seconds (converted with ``int``).

    Each iteration fetches GPU info through a ``utils.Singleton`` wrapper,
    exports the derived GPU metrics, samples connections with
    ``network.iftop()``, and exports the job metrics built by
    ``collect_job_metrics``. Any ``Exception`` raised inside an iteration
    is logged and the loop carries on after the usual sleep.
    """
    output_dir = argv[0]
    gpu_prom_file = output_dir + "/gpu_exporter.prom"
    job_prom_file = output_dir + "/job_exporter.prom"
    pause_seconds = int(argv[1])

    round_no = 0

    gpu_info_source = utils.Singleton(gpu_exporter.collect_gpu_info)

    # The two recorders are created once and handed to every
    # collect_job_metrics call, so they persist across iterations.
    zombies_type1 = ZombieRecorder()
    zombies_type2 = ZombieRecorder()

    while True:
        try:
            banner = "job exporter running {0} iteration".format(
                str(round_no))
            logger.info(banner)
            round_no += 1

            gpu_infos = gpu_info_source.try_get()
            utils.export_metrics_to_file(
                gpu_prom_file,
                gpu_exporter.convert_gpu_info_to_metrics(gpu_infos))

            connections = network.iftop()
            logger.debug("iftop result is %s", connections)

            # join with docker stats metrics and docker inspect labels
            utils.export_metrics_to_file(
                job_prom_file,
                collect_job_metrics(gpu_infos, connections,
                                    zombies_type1, zombies_type2))
        except Exception:
            # Keep the exporter alive: log the traceback and try again
            # on the next round.
            logger.exception("exception in job exporter loop")

        time.sleep(pause_seconds)
コード例 #2
0
ファイル: job_exporter.py プロジェクト: luckyqsz/pai
def main(argv):
    """Run the job exporter loop; this function never returns.

    Args:
        argv: sequence where ``argv[0]`` is the directory the ``.prom``
            files are written into and ``argv[1]`` is the pause between
            two iterations, in seconds (converted with ``int``).

    Each iteration fetches GPU info through a ``utils.Singleton`` wrapper,
    exports the derived GPU metrics, then exports the job metrics built by
    ``collect_job_metrics``. Any ``Exception`` raised inside an iteration
    is logged and the loop carries on after the usual sleep.
    """
    output_dir = argv[0]
    gpu_prom_file = output_dir + "/gpu_exporter.prom"
    job_prom_file = output_dir + "/job_exporter.prom"
    pause_seconds = int(argv[1])

    round_no = 0

    gpu_info_source = utils.Singleton(gpu_exporter.collect_gpu_info)

    while True:
        try:
            banner = "job exporter running {0} iteration".format(
                str(round_no))
            logger.info(banner)
            round_no += 1

            gpu_infos = gpu_info_source.try_get()
            utils.export_metrics_to_file(
                gpu_prom_file,
                gpu_exporter.convert_gpu_info_to_metrics(gpu_infos))

            # join with docker stats metrics and docker inspect labels
            utils.export_metrics_to_file(job_prom_file,
                                         collect_job_metrics(gpu_infos))
        except Exception:
            # Keep the exporter alive: log the traceback and try again
            # on the next round.
            logger.exception("exception in job exporter loop")

        time.sleep(pause_seconds)
コード例 #3
0
ファイル: test_utils.py プロジェクト: yanli2017/pai
 def test_export_metrics_to_file(self):
     """Export two metrics to a temp file and check the first two lines.

     One metric carries a label and one has none; the expected lines
     are compared after stripping surrounding whitespace.
     """
     metrics = [
         Metric("foo", {"bar": 2}, "3"),
         Metric("bar", {}, "4"),
     ]
     with tempfile.NamedTemporaryFile() as tmp:
         utils.export_metrics_to_file(tmp.name, metrics)
         # Read back through the still-open handle before the file
         # is removed on exit from the ``with`` block.
         exported = [raw.strip() for raw in tmp.readlines()]
     self.assertEqual("foo{bar=\"2\"} 3", exported[0])
     self.assertEqual("bar 4", exported[1])
コード例 #4
0
ファイル: job_exporter.py プロジェクト: jiapei100/pai
def main(argv):
    """Run the job exporter loop; this function never returns.

    Args:
        argv: sequence where ``argv[0]`` is the directory the ``.prom``
            files are written into and ``argv[1]`` is the pause between
            two iterations, in seconds (converted with ``int``).

    Each iteration:
      1. fetches GPU info and the docker daemon status through their
         ``utils.Singleton`` wrappers,
      2. exports the docker status (only when it is not ``None``) and the
         GPU metrics,
      3. samples connections with ``network.iftop()``,
      4. exports the job metrics built by ``collect_job_metrics``,
      5. exports how long the iteration took as
         ``job_exporter_iteration_seconds``.

    Any ``Exception`` raised in steps 1-4 is logged and the loop carries
    on; the duration metric is written in a ``finally`` clause so it is
    emitted for failed iterations as well.
    """
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    docker_metrics_path = log_dir + "/docker.prom"
    time_metrics_path = log_dir + "/time.prom"
    time_sleep_s = int(argv[1])

    # Named ``iteration`` rather than ``iter`` to avoid shadowing the
    # builtin of the same name.
    iteration = 0

    gpu_singleton = utils.Singleton(gpu_exporter.collect_gpu_info,
                                    name="gpu_singleton")
    docker_status_singleton = utils.Singleton(collect_docker_daemon_status,
                                              name="docker_singleton")

    # Created once and passed to every collect_job_metrics call, so the
    # recorders persist across iterations.
    type1_zombies = ZombieRecorder()
    type2_zombies = ZombieRecorder()

    while True:
        start = datetime.datetime.now()
        try:
            logger.info(
                "job exporter running {0} iteration".format(str(iteration)))
            iteration += 1
            gpu_infos = gpu_singleton.try_get()

            docker_status = docker_status_singleton.try_get()
            if docker_status is not None:
                utils.export_metrics_to_file(docker_metrics_path,
                                             [docker_status])

            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            all_conns = network.iftop()
            logger.debug("iftop result is %s", all_conns)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(gpu_infos, all_conns,
                                              type1_zombies, type2_zombies)
            utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception:
            # Keep the exporter alive: log the traceback and try again
            # on the next round.
            logger.exception("exception in job exporter loop")
        finally:
            elapsed = datetime.datetime.now() - start

            # ``timedelta.seconds`` is only the seconds *component*: it
            # truncates sub-second durations to 0 and ignores whole days.
            # ``total_seconds()`` reports the full elapsed time.
            time_metrics = [
                Metric("job_exporter_iteration_seconds", {},
                       elapsed.total_seconds())
            ]
            # NOTE(review): an exception raised by this export is not
            # caught and will terminate the loop — confirm that is wanted.
            utils.export_metrics_to_file(time_metrics_path, time_metrics)

        time.sleep(time_sleep_s)
コード例 #5
0
ファイル: job_exporter.py プロジェクト: zweiustc/pai
def main(argv):
    """Configure file logging, then run the job exporter loop forever.

    Args:
        argv: sequence where ``argv[0]`` is the directory that receives
            both the ``.prom`` files and ``gpu_exporter.log``, and
            ``argv[1]`` is the pause between two iterations, in seconds
            (converted with ``int``).

    The root logger is set to INFO and given a rotating file handler
    (10 MiB per file, 5 backups). Each iteration fetches GPU info through
    a ``utils.Singleton`` wrapper and exports the GPU and job metrics,
    skipping either export when the corresponding metrics value is
    ``None``. Any ``Exception`` raised inside an iteration is logged and
    the loop carries on after the usual sleep.
    """
    log_dir = argv[0]
    gpu_prom_file = log_dir + "/gpu_exporter.prom"
    job_prom_file = log_dir + "/job_exporter.prom"
    pause_seconds = int(argv[1])

    # --- logging setup -------------------------------------------------
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    file_handler = RotatingFileHandler(
        log_dir + "/gpu_exporter.log",
        maxBytes=10 * 1024 * 1024,
        backupCount=5)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter(
        "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s"))
    root.addHandler(file_handler)

    # --- exporter loop -------------------------------------------------
    round_no = 0

    gpu_info_source = utils.Singleton(gpu_exporter.collect_gpu_info)

    while True:
        try:
            banner = "job exporter running {0} iteration".format(
                str(round_no))
            logger.info(banner)
            round_no += 1

            gpu_infos = gpu_info_source.try_get()

            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            if gpu_metrics is not None:
                utils.export_metrics_to_file(gpu_prom_file, gpu_metrics)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(gpu_infos)
            if job_metrics is not None:
                utils.export_metrics_to_file(job_prom_file, job_metrics)
        except Exception:
            # Keep the exporter alive: log the traceback and try again
            # on the next round.
            logger.exception("exception in job exporter loop")

        time.sleep(pause_seconds)
コード例 #6
0
ファイル: watchdog.py プロジェクト: yangmian7721/pai
def log_and_export_metrics(path, metrics):
    """Export ``metrics`` to ``path``, then log each metric at INFO level.

    Args:
        path: destination handed to ``utils.export_metrics_to_file``.
        metrics: iterable of metrics; it is iterated once more here after
            the export call in order to log every entry.
    """
    utils.export_metrics_to_file(path, metrics)

    for entry in metrics:
        logger.info(entry)