def main(argv):
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    docker_metrics_path = log_dir + "/docker.prom"
    time_metrics_path = log_dir + "/time.prom"
    time_sleep_s = int(argv[1])

    iter = 0

    gpu_singleton = utils.Singleton(gpu_exporter.collect_gpu_info, name="gpu_singleton")
    docker_status_singleton = utils.Singleton(collect_docker_daemon_status, name="docker_singleton")

    type1_zombies = ZombieRecorder()
    type2_zombies = ZombieRecorder()

    while True:
        start = datetime.datetime.now()
        try:
            logger.info("job exporter running {0} iteration".format(str(iter)))
            iter += 1

            gpu_infos = gpu_singleton.try_get()
            docker_status = docker_status_singleton.try_get()

            if docker_status is not None:
                utils.export_metrics_to_file(docker_metrics_path, [docker_status])

            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            all_conns = network.iftop()
            logger.debug("iftop result is %s", all_conns)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies)
            utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception as e:
            logger.exception("exception in job exporter loop")
        finally:
            end = datetime.datetime.now()
            time_metrics = [
                Metric("job_exporter_iteration_seconds", {}, (end - start).seconds)
            ]
            utils.export_metrics_to_file(time_metrics_path, time_metrics)

        time.sleep(time_sleep_s)
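# --- Illustrative sketch (not part of the exporter source) ------------------
# The loop above hands lists of Metric(name, labels, value) objects to
# utils.export_metrics_to_file. A hypothetical minimal implementation of that
# helper could write Prometheus text exposition format and rename the file
# into place, so a reader such as node_exporter's textfile collector never
# sees a partially written .prom file. The names MetricSketch and
# export_metrics_to_file_sketch are made up for illustration only.
import collections
import os

MetricSketch = collections.namedtuple("MetricSketch", ["name", "labels", "value"])


def export_metrics_to_file_sketch(path, metrics):
    lines = []
    for m in metrics:
        if m.labels:
            label_str = ",".join('%s="%s"' % (k, v) for k, v in sorted(m.labels.items()))
            lines.append("%s{%s} %s" % (m.name, label_str, m.value))
        else:
            lines.append("%s %s" % (m.name, m.value))

    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as f:
        f.write("\n".join(lines) + "\n")
    os.rename(tmp_path, path)  # atomic replace on POSIX


# Example:
#   export_metrics_to_file_sketch("/tmp/time.prom",
#       [MetricSketch("job_exporter_iteration_seconds", {}, 3)])
# writes the line: job_exporter_iteration_seconds 3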
def main(argv):
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    time_sleep_s = int(argv[1])

    iter = 0

    singleton = utils.Singleton(gpu_exporter.collect_gpu_info)

    type1_zombies = ZombieRecorder()
    type2_zombies = ZombieRecorder()

    while True:
        try:
            logger.info("job exporter running {0} iteration".format(str(iter)))
            iter += 1

            gpu_infos = singleton.try_get()
            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            all_conns = network.iftop()
            logger.debug("iftop result is %s", all_conns)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies)
            utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception as e:
            logger.exception("exception in job exporter loop")

        time.sleep(time_sleep_s)
def main(argv):
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    time_sleep_s = int(argv[1])

    iter = 0

    singleton = utils.Singleton(gpu_exporter.collect_gpu_info)

    while True:
        try:
            logger.info("job exporter running {0} iteration".format(str(iter)))
            iter += 1

            gpu_infos = singleton.try_get()
            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(gpu_infos)
            utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception as e:
            logger.exception("exception in job exporter loop")

        time.sleep(time_sleep_s)
def test_singleton_with_blocking_getter_allow_old_data(self):
    semaphore = threading.Semaphore(1)

    def blocking_getter():
        semaphore.acquire(blocking=True)
        semaphore.release()
        return 100

    singleton = utils.Singleton(blocking_getter, get_timeout_s=0.2, old_data_timeout_s=30)

    semaphore.acquire()
    for _ in xrange(3):
        self.assertIsNone(singleton.try_get())

    semaphore.release()  # let singleton cache one value
    self.assertEqual(100, singleton.try_get())

    for _ in xrange(3):
        semaphore.acquire()
        for _ in xrange(3):
            # singleton returns old value
            self.assertEqual(100, singleton.try_get())
        semaphore.release()
        self.assertEqual(100, singleton.try_get())
def test_singleton_with_blocking_getter_no_old_data(self):
    semaphore = threading.Semaphore(1)

    def blocking_getter():
        semaphore.acquire(blocking=True)
        semaphore.release()
        return 100

    singleton = utils.Singleton(blocking_getter, get_timeout_s=0.2)

    val, is_old = singleton.try_get()
    self.assertIsNotNone(val)
    self.assertFalse(is_old)

    for _ in xrange(3):
        semaphore.acquire()
        for _ in xrange(3):
            val, is_old = singleton.try_get()
            self.assertEqual(100, val)
            self.assertTrue(is_old)
        semaphore.release()
        val, is_old = singleton.try_get()
        self.assertEqual(100, val)
        self.assertFalse(is_old)
def test_singleton_normal(self):
    def getter():
        return 100

    singleton = utils.Singleton(getter)

    for _ in xrange(10):
        self.assertEqual(100, singleton.try_get())
def test_singleton_normal(self):
    def getter():
        return 100

    singleton = utils.Singleton(getter)

    for _ in xrange(10):
        val, is_old = singleton.try_get()
        self.assertEqual(100, val)
        self.assertFalse(is_old)
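# --- Illustrative sketch (not part of the exporter source) ------------------
# A hypothetical sketch of the behavior the tuple-returning tests above
# exercise: try_get() runs the getter on a background thread, waits up to
# get_timeout_s, and returns (value, is_old), where is_old is True when the
# getter did not finish in time and the previously cached value is returned
# instead. This is not the actual utils.Singleton implementation; the class
# name SingletonSketch is made up for illustration only.
import threading


class SingletonSketch(object):
    def __init__(self, getter, get_timeout_s=10):
        self.getter = getter
        self.get_timeout_s = get_timeout_s
        self.cached = None
        self.thread = None

    def _fetch(self):
        self.cached = self.getter()

    def try_get(self):
        # start a new fetch only when no earlier fetch is still running
        if self.thread is None or not self.thread.is_alive():
            self.thread = threading.Thread(target=self._fetch)
            self.thread.daemon = True
            self.thread.start()
        self.thread.join(self.get_timeout_s)
        # if the fetch is still running, fall back to the cached value
        is_old = self.thread.is_alive()
        return self.cached, is_old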
def main(argv):
    logDir = argv[0]
    gpuMetricsPath = logDir + "/gpu_exporter.prom"
    jobMetricsPath = logDir + "/job_exporter.prom"
    timeSleep = int(argv[1])

    rootLogger = logging.getLogger()
    rootLogger.setLevel(logging.INFO)

    fh = RotatingFileHandler(logDir + "/gpu_exporter.log",
                             maxBytes=1024 * 1024 * 10, backupCount=5)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter(
        "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s")
    fh.setFormatter(formatter)
    rootLogger.addHandler(fh)

    iter = 0

    singleton = utils.Singleton(gpu_exporter.collect_gpu_info)

    while True:
        try:
            logger.info("job exporter running {0} iteration".format(str(iter)))
            iter += 1

            gpuInfos = singleton.try_get()
            gpuMetrics = gpu_exporter.convert_gpu_info_to_metrics(gpuInfos)
            if gpuMetrics is not None:
                utils.export_metrics_to_file(gpuMetricsPath, gpuMetrics)

            # join with docker stats metrics and docker inspect labels
            jobMetrics = collect_job_metrics(gpuInfos)
            if jobMetrics is not None:
                utils.export_metrics_to_file(jobMetricsPath, jobMetrics)
        except Exception as e:
            logger.exception("exception in job exporter loop")

        time.sleep(timeSleep)