Ejemplo n.º 1
0
 def __init__(self, config: ml_monitor.Config):
     """Initialise an idle logger with empty metric storage.

     Args:
         config: Settings object. Its ``logging_file`` attribute is read
             here and stored as ``metrics_log_file``.
     """
     logging.debug("Creating logging thread...")

     # Metric storage and hook list both start out empty.
     self.pre_log_hooks: List[ml_monitor.BaseHook] = []
     self.monitor_values = defaultdict(list)

     # No thread object exists yet and nothing is marked as running.
     self.thread, self.thread_running = None, False

     self.config = config
     self.metrics_log_file = self.config.logging_file
Ejemplo n.º 2
0
 def _parse_config(self):
     logging.debug("Parsing configuration file...")
     self.config_title = self.config.get(
         "title",
         list(filter(None,
                     os.getcwd().split("/")))[-1])
     self.files_location = self.config.get("files_location", "local")
     self.metrics_log_file = self.config.get("metrics_log_file")
     self.log_interval_sec = self.config.get("log_interval_sec")
Ejemplo n.º 3
0
 def _sync(self):
     """Create and start a ``GDriveFetcher`` for this configuration.

     The fetch interval comes from ``self.config.fetch_interval_sec``; a
     falsy value (unset or zero) is replaced by 3.

     Returns:
         The ``GDriveFetcher`` instance after ``start()`` was called on it.
     """
     logging.debug("Starting Google Colab synchronization...")

     interval = self.config.fetch_interval_sec
     if not interval:
         interval = 3
     logging.debug(f"Fetching interval set as {interval} seconds.")

     fetcher = GDriveFetcher(
         self.gdrive,
         self.config,
         fetch_interval_sec=interval,
     )
     fetcher.start()
     return fetcher
Ejemplo n.º 4
0
 def fetch(self):
     """Download the remote metrics log into the local log file.

     The download runs inside the ``prometheus.fetching_duration`` timer.
     Any exception is logged and followed by ``self.stop()``; nothing is
     re-raised to the caller.
     """
     try:
         logging.debug("Fetching metrics from Google Drive...")
         with prometheus.fetching_duration.time():
             download = self.gdrive.download
             download(self.remote_metrics_log_file, self.local_log_file)
     except Exception as err:
         failure = (
             f"Exception raised while fetching files from Google Drive:\n{err}"
         )
         logging.error(failure)
         logging.error("Stopping fetching thread due to the exception.")
         self.stop()
Ejemplo n.º 5
0
 def start(self):
     """Create and run the Prometheus metrics collector if none exists yet.

     Does nothing when ``self.metrics_collector`` is already set.

     Raises:
         Exception: Whatever was raised while creating or running the
             collector, re-raised after logging it and calling
             ``self.stop()``.
     """
     try:
         if self.metrics_collector is not None:
             return
         logging.debug("Starting Prometheus metrics collector..")
         collector = ml_monitor.prometheus.MetricsCollector(self.config)
         self.metrics_collector = collector
         collector.run()
     except Exception as err:
         logging.error(
             f"Exception raised, stopping metrics collecting.\n{err}")
         self.stop()
         raise err
Ejemplo n.º 6
0
 def _create_log_file(self):
     logging.debug(f"Creating logging file {self.metrics_log_file}...")
     if not os.path.exists(self.metrics_log_file):
         try:
             logging.info(f"Creating log file {self.metrics_log_file}")
             os.makedirs(os.path.dirname(self.metrics_log_file),
                         exist_ok=True)
             with open(self.metrics_log_file, "w") as f:
                 json.dump({}, f)
         except Exception as e:
             raise Exception(
                 f"Could not create log file {self.metrics_log_file}.\n {e}"
             )
Ejemplo n.º 7
0
 def _load_config_file(self):
     """Read ``self.config_file`` and parse it as YAML, falling back to JSON.

     Returns:
         The parsed configuration object.

     Raises:
         Exception: If the content is rejected by the YAML parser and
             cannot be parsed as JSON either. The JSON error is attached
             as the cause.
     """
     logging.debug(f"Loading configuration file {self.config_file}...")
     # Read the text once so both parsers see the full content. Handing
     # the open stream to the YAML parser first would leave the JSON
     # fallback reading from an already-consumed file.
     with open(self.config_file, "r") as config_file:
         raw_config = config_file.read()

     try:
         return yaml.safe_load(raw_config)
     except yaml.YAMLError:
         try:
             return json.loads(raw_config)
         except Exception as e:
             raise Exception(
                 f"Could not load configuration file {self.config_file}\n{e}"
             ) from e
Ejemplo n.º 8
0
    def __init__(self, config_file=None):
        """Load the configuration file and apply it to this instance.

        Args:
            config_file: Path of the configuration file. When ``None``,
                ``config.yml`` in this module's directory is used.
        """
        default_path = os.path.join(os.path.dirname(__file__), "config.yml")
        resolved = default_path if config_file is None else config_file
        logging.debug(f"Configuring module using {resolved}...")

        self.config_file = resolved
        self.config = self._load_config_file()

        # Start every config-derived attribute at None before
        # _parse_config() runs.
        self.config_title = self.files_location = None
        self.metrics_log_file = self.log_interval_sec = None

        self._parse_config()
        self._create_log_file()
Ejemplo n.º 9
0
 def hook(self):
     """Record Colab RAM and, when a GPU is present, GPU memory usage.

     The collected values are passed to ``self.monitor`` under the name
     ``"pull_metrics"``. GPU figures are taken from the first GPU that
     ``GPUtil.getGPUs()`` reports; when it reports none, only the RAM
     figures are recorded instead of failing on an empty list.
     """
     logging.debug("Registering resources utilization")
     GPUs = GPUtil.getGPUs()

     utilization_metrics = {}
     if GPUs:
         gpu = GPUs[0]
         utilization_metrics.update(
             {
                 "colab_GPU_mem_free": gpu.memoryFree,
                 "colab_GPU_mem_used": gpu.memoryUsed,
                 "colab_GPU_mem_util_percentage": gpu.memoryUtil * 100,
                 "colab_GPU_mem_total": gpu.memoryTotal,
             }
         )

     # Take one snapshot so both RAM figures describe the same moment.
     ram = psutil.virtual_memory()
     utilization_metrics.update(
         {
             "colab_RAM_used_percentage": ram.percent,
             "colab_RAM_total_MB": ram.total / (1024 * 1024),
         }
     )
     self.monitor("pull_metrics", utilization_metrics)
Ejemplo n.º 10
0
 def log(self):
     """Run the pre-log hooks and write the collected metrics as JSON.

     Each callable in ``self.pre_log_hooks`` is invoked first. The
     metrics, plus a ``"title"`` entry (``self.config.title`` or
     ``"ml_monitor"``), are then written to ``self.metrics_log_file`` and
     ``self.clean()`` is called.

     If serializing or writing fails, the error is logged, the thread (if
     one exists) is cancelled and ``thread_running`` is set to ``False``.
     The exception is not re-raised.
     """
     logging.debug("Serializing metrics...")
     for hook in self.pre_log_hooks:
         logging.debug("Applying hook")
         hook()
     self.monitor_values["title"] = self.config.title or "ml_monitor"
     try:
         # Serialize before opening the file: mode "w" truncates it, so a
         # value json cannot encode would otherwise leave an empty or
         # half-written log behind.
         serialized = json.dumps(self.monitor_values)
         with open(self.metrics_log_file, "w") as f:
             f.write(serialized)
         self.clean()
     except Exception as e:
         logging.error(f"Error while serializing metrics: {e}")
         logging.error("Stopping serialization thread")
         # log() can run before any thread was created, in which case
         # there is nothing to cancel.
         if self.thread is not None:
             self.thread.cancel()
         self.thread_running = False
Ejemplo n.º 11
0
    def _resolve_log_file(self):
        logging.debug("Resolving log file location...")
        if self.config.gdrive_log_file is not None:
            return self.config.gdrive_log_file

        remote_metrics_log_file = self.config.remote_metrics_log_file
        file_location_parts = list(filter(None, remote_metrics_log_file.split("/")))
        gdrive_loc = ""
        for part in file_location_parts[::-1]:
            gdrive_loc = f"/{part}{gdrive_loc}"
            if self.gdrive.get(gdrive_loc) is not None:
                logging.info(
                    f"Google Drive log file location resolved as {gdrive_loc}."
                )
                return gdrive_loc

        raise Exception(
            f"Could not resolve log file location from config: {remote_metrics_log_file}"
        )
Ejemplo n.º 12
0
    def hook(self):
        """Record RAM, CPU and, when a GPU is present, GPU memory usage.

        The collected values are passed to ``self.monitor`` under the name
        ``"pull_metrics"``. GPU figures come from the first GPU reported by
        ``GPUtil.getGPUs()`` and are left out when it reports none.
        """
        logging.debug("Registering resources utilization")
        detected_gpus = GPUtil.getGPUs()

        metrics = dict(
            RAM_used_percentage=psutil.virtual_memory().percent,
            CPU_usage=psutil.cpu_percent(),
        )
        if len(detected_gpus) > 0:
            first_gpu = detected_gpus[0]
            metrics["GPU_mem_free"] = first_gpu.memoryFree
            metrics["GPU_mem_used"] = first_gpu.memoryUsed
            # NOTE(review): "memb" looks like a typo for "mem", but this is
            # the emitted metric name - confirm no consumer depends on it
            # before renaming.
            metrics["GPU_memb_util_percentage"] = first_gpu.memoryUtil * 100
            metrics["GPU_mem_total"] = first_gpu.memoryTotal
        self.monitor("pull_metrics", metrics)
Ejemplo n.º 13
0
 def start(self):
     """Start the fetch timer unless one is already marked as running.

     A ``threading.Timer`` is created that calls ``self._run_thread`` after
     ``self.fetch_interval_sec`` seconds. It is stored in ``self.thread``
     and started, and ``thread_running`` is set to ``True``.
     """
     logging.debug("Starting Google Drive featching thread.")
     if self.thread_running:
         return

     timer = threading.Timer(self.fetch_interval_sec, self._run_thread)
     self.thread = timer
     timer.start()
     self.thread_running = True
Ejemplo n.º 14
0
 def clean(self):
     """Discard all collected metrics by installing a fresh, empty store."""
     logging.debug("Removing monitored metrics")
     empty_store = defaultdict(list)
     self.monitor_values = empty_store
Ejemplo n.º 15
0
 def monitor(self, name: str, value):
     """Append ``value`` to the list of values recorded under ``name``.

     Args:
         name: Metric name used as the key in ``self.monitor_values``.
         value: The value to record.
     """
     logging.debug(f"Receive metric: {name} with value: {value}")
     recorded = self.monitor_values[name]
     recorded.append(value)