def noobaa_health_check(self):
    """
    Verify that Noobaa reports a healthy status.

    Raises:
        NoobaaHealthException: if ``self.mcg_obj.status`` is falsy

    """
    noobaa_is_healthy = self.mcg_obj.status
    if noobaa_is_healthy:
        # Healthy: nothing to report, fall out with an implicit None.
        return
    raise exceptions.NoobaaHealthException("Cluster health is NOT OK")
def cluster_health_check(self, timeout=None):
    """
    Check overall cluster health.

    Relying on health reported by CephCluster.get(). The method waits
    for ``self.is_health_ok`` to report success, rescans the cluster,
    verifies the mon and mds counts against the values recorded before
    the rescan, and finally checks Noobaa.

    Args:
        timeout (int): in seconds. When not given (or falsy) the timeout
            is scaled as 10 seconds per entry in ``self.pods``. This is
            just a crude number: it has been observed that as the number
            of pods increases it takes more time for the cluster to
            reach HEALTH_OK.

    Returns:
        bool: True when every check passed. This method never returns
            False; an unhealthy cluster is reported by raising.

    Raises:
        CephHealthException: if the cluster does not become healthy
            within the timeout, or the mon/mds count check fails
        NoobaaHealthException: if Noobaa is not healthy

    """
    # Scale timeout only if user hasn't passed any value
    timeout = timeout or (10 * len(self.pods))
    sample = TimeoutSampler(timeout=timeout, sleep=3, func=self.is_health_ok)

    if not sample.wait_for_func_status(result=True):
        raise exceptions.CephHealthException("Cluster health is NOT OK")

    # This way of checking health of different cluster entities and
    # raising only CephHealthException is not elegant.
    # TODO: add an attribute in CephHealthException, called "reason"
    # which should tell because of which exact cluster entity health
    # is not ok ?

    # Capture the expected counts before rescanning, since scan_cluster()
    # may overwrite them with whatever is currently running.
    expected_mon_count = self.mon_count
    expected_mds_count = self.mds_count
    self.scan_cluster()

    try:
        self.mon_health_check(expected_mon_count)
    except exceptions.MonCountException as e:
        logger.error(e)
        # Chain the original error so the root cause stays in the traceback.
        raise exceptions.CephHealthException("Cluster health is NOT OK") from e

    # The mds check is skipped when no mds count is expected.
    if expected_mds_count:
        try:
            self.mds_health_check(expected_mds_count)
        except exceptions.MDSCountException as e:
            logger.error(e)
            raise exceptions.CephHealthException(
                "Cluster health is NOT OK"
            ) from e

    # Reuse the dedicated Noobaa check instead of duplicating its logic.
    self.noobaa_health_check()

    # TODO: OSD and MGR health check
    logger.info("Cluster HEALTH_OK")

    # This scan is for reconciliation on *.count
    # because during first scan in this function some of the
    # pods may not be up and would have set count to lesser number
    self.scan_cluster()
    return True