def cluster_health_check(self, timeout=None):
    """
    Check overall cluster health.
    Relying on health reported by CephCluster.get()

    Args:
        timeout (int): in seconds. By default timeout value will be scaled
            based on number of ceph pods in the cluster. This is just a
            crude number. It has been observed that as the number of pods
            increases it takes more time for cluster's HEALTH_OK.

    Returns:
        bool: True if "HEALTH_OK". An unhealthy cluster is reported by
            raising, never by returning False.

    Raises:
        CephHealthException: if cluster is not healthy (health status did
            not become OK within the timeout, or the mon/mds count check
            failed)
        NoobaaHealthException: if ``self.mcg_obj.status`` is falsy

    """
    # Scale timeout only if user hasn't passed any value.
    # NOTE: because of ``or``, an explicit timeout of 0 is scaled as well.
    timeout = timeout or (10 * len(self.pods))
    sample = TimeoutSampler(timeout=timeout, sleep=3, func=self.is_health_ok)

    if not sample.wait_for_func_status(result=True):
        raise exceptions.CephHealthException("Cluster health is NOT OK")

    # This way of checking health of different cluster entities and
    # raising only CephHealthException is not elegant.
    # TODO: add an attribute in CephHealthException, called "reason"
    # which should tell because of which exact cluster entity health
    # is not ok ?

    # Remember the counts as they were before the rescan below, so the
    # health checks compare against the previously known values.
    expected_mon_count = self.mon_count
    expected_mds_count = self.mds_count
    self.scan_cluster()

    try:
        self.mon_health_check(expected_mon_count)
    except exceptions.MonCountException as e:
        logger.error(e)
        # Chain the original error so the root cause stays in the traceback
        raise exceptions.CephHealthException("Cluster health is NOT OK") from e

    # The MDS check is skipped entirely when no MDS count is expected
    if expected_mds_count:
        try:
            self.mds_health_check(expected_mds_count)
        except exceptions.MDSCountException as e:
            logger.error(e)
            raise exceptions.CephHealthException(
                "Cluster health is NOT OK"
            ) from e

    # check noobaa health
    if not self.mcg_obj.status:
        raise exceptions.NoobaaHealthException("Cluster health is NOT OK")

    # TODO: OSD and MGR health check
    logger.info("Cluster HEALTH_OK")
    # This scan is for reconciliation on *.count
    # because during first scan in this function some of the
    # pods may not be up and would have set count to lesser number
    self.scan_cluster()
    return True
def cluster_health_check(self, timeout=300):
    """
    Run the cluster health checks of this class.

    Intended as a comprehensive cluster health check which includes
    checking pods and external ceph cluster health. It polls
    ``self.is_health_ok`` through a TimeoutSampler, then calls
    ``self.wait_for_noobaa_health_ok()`` and ``self.validate_pvc()``.

    Args:
        timeout (int): value handed to TimeoutSampler as ``timeout``
            (presumably seconds - confirm against TimeoutSampler).
            Defaults to 300.

    Raises:
        CephHealthException: if the sampler does not report a healthy
            cluster

    """
    health_sampler = TimeoutSampler(
        func=self.is_health_ok, sleep=3, timeout=timeout
    )
    became_healthy = health_sampler.wait_for_func_status(result=True)
    if not became_healthy:
        raise exceptions.CephHealthException("Cluster health is NOT OK")

    # Follow-up checks run only once the health status is OK
    self.wait_for_noobaa_health_ok()
    self.validate_pvc()
def __exit__(self, exception_type, value, traceback):
    """
    Exit method for context manager

    Disables health monitoring, logs the recorded error status (if any)
    and decides how the ``with`` block finishes.

    Args:
        exception_type (type): class of the exception raised inside the
            context manager body, or None if the body finished cleanly
        value (BaseException): the exception instance, or None
        traceback (traceback): traceback of the exception, or None

    Returns:
        bool: False if an exception was raised inside the context manager
            body, so that the interpreter propagates that original
            exception unchanged. True if the body finished cleanly and no
            health error was recorded.

    Raises:
        CephHealthException: If no other exception occurred during
            execution of context manager and HEALTH_ERROR is detected
            during the monitoring.

    """
    self.health_monitor_enabled = False
    if self.health_error_status:
        self.log_error_status()

    if exception_type:
        # __exit__ must not re-raise the exception it was handed; a false
        # return value tells the interpreter to propagate it as-is. The
        # exception from the body takes precedence over a health error.
        return False

    if self.health_error_status:
        raise exceptions.CephHealthException(
            "During monitoring of Ceph health status hit HEALTH_ERROR: "
            f"{self.health_error_status}"
        )
    return True