def __init__(self, config: ml_monitor.Config):
    """Prepare the metrics-logging thread state; the thread is not started here."""
    logging.debug("Creating logging thread...")
    # Configuration drives where serialized metrics end up.
    self.config = config
    self.metrics_log_file = self.config.logging_file
    # Collected metric samples, grouped per metric name.
    self.monitor_values = defaultdict(list)
    # Hooks invoked immediately before each serialization pass.
    self.pre_log_hooks: List[ml_monitor.BaseHook] = []
    # Background-thread bookkeeping.
    self.thread = None
    self.thread_running = False
def _parse_config(self): logging.debug("Parsing configuration file...") self.config_title = self.config.get( "title", list(filter(None, os.getcwd().split("/")))[-1]) self.files_location = self.config.get("files_location", "local") self.metrics_log_file = self.config.get("metrics_log_file") self.log_interval_sec = self.config.get("log_interval_sec")
def _sync(self):
    """Spawn and return the background thread that pulls metrics from Google Drive."""
    logging.debug("Starting Google Colab synchronization...")
    # Poll every 3 seconds when no interval is configured.
    fetch_interval_sec = self.config.fetch_interval_sec or 3
    logging.debug(f"Fetching interval set as {fetch_interval_sec} seconds.")
    fetcher = GDriveFetcher(
        self.gdrive,
        self.config,
        fetch_interval_sec=fetch_interval_sec,
    )
    fetcher.start()
    return fetcher
def fetch(self):
    """Download the remote metrics log into the local file, timing the transfer.

    Any failure is logged and stops the fetching thread instead of propagating.
    """
    try:
        logging.debug("Fetching metrics from Google Drive...")
        # Record the download duration in the Prometheus histogram.
        with prometheus.fetching_duration.time():
            self.gdrive.download(self.remote_metrics_log_file, self.local_log_file)
    except Exception as exc:
        logging.error(
            f"Exception raised while fetching files from Google Drive:\n{exc}"
        )
        logging.error("Stopping fetching thread due to the exception.")
        self.stop()
def start(self):
    """Run the Prometheus metrics collector, creating it lazily on first call.

    On failure the collector is stopped and the exception is re-raised.
    """
    try:
        # Lazily create the collector the first time start() is called.
        if self.metrics_collector is None:
            logging.debug("Starting Prometheus metrics collector..")
            self.metrics_collector = ml_monitor.prometheus.MetricsCollector(
                self.config
            )
        self.metrics_collector.run()
    except Exception as exc:
        logging.error(f"Exception raised, stopping metrics collecting.\n{exc}")
        self.stop()
        raise exc
def _create_log_file(self): logging.debug(f"Creating logging file {self.metrics_log_file}...") if not os.path.exists(self.metrics_log_file): try: logging.info(f"Creating log file {self.metrics_log_file}") os.makedirs(os.path.dirname(self.metrics_log_file), exist_ok=True) with open(self.metrics_log_file, "w") as f: json.dump({}, f) except Exception as e: raise Exception( f"Could not create log file {self.metrics_log_file}.\n {e}" )
def _load_config_file(self):
    """Load the configuration file, trying YAML first and falling back to JSON.

    Returns:
        The parsed configuration (typically a dict).

    Raises:
        Exception: if the file is neither valid YAML nor valid JSON.
    """
    logging.debug(f"Loading configuration file {self.config_file}...")
    with open(self.config_file, "r") as config_file:
        try:
            config = yaml.safe_load(config_file)
        except yaml.YAMLError:
            try:
                # safe_load already consumed the stream; without rewinding,
                # json.load would see an exhausted file and always fail.
                config_file.seek(0)
                config = json.load(config_file)
            except Exception as e:
                raise Exception(
                    f"Could not load configuration file {self.config_file}\n{e}"
                ) from e
    return config
def __init__(self, config_file=None):
    """Load, parse and materialise the module configuration.

    Args:
        config_file: Path to a YAML/JSON configuration file; when omitted,
            the config.yml bundled next to this module is used.
    """
    if config_file is None:
        config_file = os.path.join(os.path.dirname(__file__), "config.yml")
    logging.debug(f"Configuring module using {config_file}...")
    self.config_file = config_file
    self.config = self._load_config_file()
    # Fields below are placeholders filled in by _parse_config().
    for attr in ("config_title", "files_location",
                 "metrics_log_file", "log_interval_sec"):
        setattr(self, attr, None)
    self._parse_config()
    self._create_log_file()
def hook(self):
    """Collect Colab resource-utilization metrics and queue them for logging."""
    logging.debug("Registering resources utilization")
    GPUs = GPUtil.getGPUs()
    # RAM metrics are always available, even on GPU-less runtimes.
    utilization_metrics = {
        "colab_RAM_used_percentage": psutil.virtual_memory().percent,
        "colab_RAM_total_MB": psutil.virtual_memory().total / (1024 * 1024),
    }
    # Guard against runtimes without a GPU (mirrors the non-Colab hook);
    # the original indexed GPUs[0] unconditionally and raised IndexError.
    if len(GPUs):
        gpu = GPUs[0]
        utilization_metrics.update(
            {
                "colab_GPU_mem_free": gpu.memoryFree,
                "colab_GPU_mem_used": gpu.memoryUsed,
                "colab_GPU_mem_util_percentage": gpu.memoryUtil * 100,
                "colab_GPU_mem_total": gpu.memoryTotal,
            }
        )
    self.monitor("pull_metrics", utilization_metrics)
def log(self):
    """Serialize all collected metrics to the metrics log file, then reset them.

    Pre-log hooks run first so they can contribute metrics.  On any
    serialization failure the logging thread is cancelled.
    """
    logging.debug("Serializing metrics...")
    # Let every registered hook add its metrics before we write.
    for hook in self.pre_log_hooks:
        logging.debug("Applying hook")
        hook()
    self.monitor_values["title"] = self.config.title or "ml_monitor"
    try:
        with open(self.metrics_log_file, "w") as out:
            json.dump(self.monitor_values, out)
        self.clean()
    except Exception as exc:
        logging.error(f"Error while serializing metrics: {exc}")
        logging.error("Stopping serialization thread")
        self.thread.cancel()
        self.thread_running = False
def _resolve_log_file(self): logging.debug("Resolving log file location...") if self.config.gdrive_log_file is not None: return self.config.gdrive_log_file remote_metrics_log_file = self.config.remote_metrics_log_file file_location_parts = list(filter(None, remote_metrics_log_file.split("/"))) gdrive_loc = "" for part in file_location_parts[::-1]: gdrive_loc = f"/{part}{gdrive_loc}" if self.gdrive.get(gdrive_loc) is not None: logging.info( f"Google Drive log file location resolved as {gdrive_loc}." ) return gdrive_loc raise Exception( f"Could not resolve log file location from config: {remote_metrics_log_file}" )
def hook(self):
    """Collect local resource-utilization metrics and queue them for logging."""
    logging.debug("Registering resources utilization")
    GPUs = GPUtil.getGPUs()
    utilization_metrics = {
        "RAM_used_percentage": psutil.virtual_memory().percent,
        "CPU_usage": psutil.cpu_percent(),
    }
    # GPU metrics only when a GPU is actually present.
    if len(GPUs):
        gpu = GPUs[0]
        utilization_metrics.update(
            {
                "GPU_mem_free": gpu.memoryFree,
                "GPU_mem_used": gpu.memoryUsed,
                # Fixed metric name: was the misspelled "GPU_memb_util_percentage",
                # inconsistent with the other GPU_mem_* keys and with the Colab
                # hook's "colab_GPU_mem_util_percentage".
                "GPU_mem_util_percentage": gpu.memoryUtil * 100,
                "GPU_mem_total": gpu.memoryTotal,
            }
        )
    self.monitor("pull_metrics", utilization_metrics)
def start(self):
    """Start the periodic Google Drive fetching thread if not already running."""
    # Log message typo fixed: "featching" -> "fetching".
    logging.debug("Starting Google Drive fetching thread.")
    if not self.thread_running:
        # NOTE(review): threading.Timer fires only once after the interval;
        # _run_thread (defined elsewhere) presumably re-arms it — confirm.
        self.thread = threading.Timer(self.fetch_interval_sec, self._run_thread)
        self.thread.start()
        self.thread_running = True
def clean(self):
    """Discard every collected metric, leaving an empty store."""
    logging.debug("Removing monitored metrics")
    # Fresh defaultdict(list) so new samples can be appended immediately.
    self.monitor_values = defaultdict(list)
def monitor(self, name: str, value):
    """Record a single metric sample under *name*.

    Args:
        name: Metric name the sample is grouped under.
        value: The sample itself; appended to the metric's history.
    """
    logging.debug(f"Receive metric: {name} with value: {value}")
    # monitor_values is a defaultdict(list): unseen names start as [].
    self.monitor_values[name].append(value)