def test_mon_down(self):
    """
    Check Calamari's reaction to loss of contact with
    individual mon servers in a Ceph cluster.

    - The cluster state should continue to be updated as long as
      there is a mon quorum and one mon is available to calamari.
    """
    cluster_id = self._wait_for_cluster()
    mon_fqdns = self.ceph_ctl.get_service_fqdns(cluster_id, 'mon')

    # Taking one mon dark is only meaningful if the remaining mons can
    # still form a quorum, which requires at least 3 to begin with.
    if len(mon_fqdns) < 3:
        raise SkipTest("Not enough monitors to test one down")

    def update_time():
        return self.api.get("cluster/%s" % cluster_id).json()['update_time']

    # I don't know which if any of the mons the calamari server
    # might be preferentially accepting data from, but I want
    # to ensure that it can survive any of them going away.
    for mon_fqdn in mon_fqdns:
        self.ceph_ctl.go_dark(cluster_id, minion_id=mon_fqdn)
        last_update_time = update_time()

        # This will give a timeout exception if calamari did not
        # re-establish monitoring after the mon server went offline.
        try:
            scalable_wait_until_true(
                lambda: last_update_time != update_time(),
                timeout=NEW_FAVORITE_TIMEOUT)
        except WaitTimeout:
            self.fail("Failed to recover from killing %s in %s seconds" %
                      (mon_fqdn, NEW_FAVORITE_TIMEOUT))

        self.ceph_ctl.go_dark(cluster_id, dark=False, minion_id=mon_fqdn)
def test_mon_down(self):
    """
    Verify that Calamari keeps monitoring a Ceph cluster when it
    loses contact with individual mon servers.

    - As long as a mon quorum exists and at least one mon remains
      reachable from calamari, cluster state must keep updating.
    """
    cluster_id = self._wait_for_cluster()
    mon_fqdns = self.ceph_ctl.get_service_fqdns(cluster_id, 'mon')
    if len(mon_fqdns) < 3:
        raise SkipTest("Not enough monitors to test one down")

    def _cluster_update_time():
        return self.api.get("cluster/%s" % cluster_id).json()['update_time']

    # We don't know which mon (if any) calamari preferentially accepts
    # data from, so exercise losing each of them in turn.
    for fqdn in mon_fqdns:
        self.ceph_ctl.go_dark(cluster_id, minion_id=fqdn)
        before = _cluster_update_time()
        try:
            # Times out if calamari never resumes monitoring after this
            # mon server went offline.
            scalable_wait_until_true(
                lambda: _cluster_update_time() != before,
                timeout=NEW_FAVORITE_TIMEOUT)
        except WaitTimeout:
            self.fail("Failed to recover from killing %s in %s seconds" % (
                fqdn, NEW_FAVORITE_TIMEOUT))
        self.ceph_ctl.go_dark(cluster_id, dark=False, minion_id=fqdn)
def _wait_for_completion(self, fsid, response, timeout=None):
    """
    Block until the user request referenced by *response* (the result
    of a PATCH/POST/DELETE) has completed successfully.
    """
    effective_timeout = REQUEST_TIMEOUT if timeout is None else timeout
    # An asynchronous request must have been accepted (202) before we
    # can meaningfully poll for its completion.
    self.assertEqual(response.status_code, 202)
    req_id = response.json()['request_id']
    scalable_wait_until_true(
        lambda: self._request_complete(fsid, req_id),
        timeout=effective_timeout)
def _wait_for_completion(self, fsid, response, timeout=None):
    """
    Wait until the user request described by *response* — the reply to
    a PATCH/POST/DELETE — has finished successfully.
    """
    if timeout is None:
        timeout = REQUEST_TIMEOUT
    # The API signals "accepted, running asynchronously" with a 202.
    self.assertEqual(response.status_code, 202)

    def _finished():
        return self._request_complete(fsid, response.json()['request_id'])

    scalable_wait_until_true(_finished, timeout=timeout)
def _wait_for_servers(self, timeout=30):
    """
    Wait for all the expected servers to appear in the REST API.

    :param timeout: Seconds to wait before giving up (default 30,
        matching the previous hard-coded value).
    """
    expected_servers = set(self.ceph_ctl.get_server_fqdns())

    def servers_available():
        # All expected FQDNs must be present AND managed before we
        # consider the servers ready.
        servers = self.api.get("server").json()
        managed_servers = [s for s in servers if s['managed']]
        ready = {s['fqdn'] for s in managed_servers} == expected_servers
        if not ready:
            log.debug("_wait_for_servers: {0} ({1} managed) servers visible vs. {2} expected".format(
                len(servers), len(managed_servers), len(expected_servers)
            ))
        return ready

    scalable_wait_until_true(servers_available, timeout=timeout)
def _wait_for_cluster(self, cluster_count=1):
    """
    Wait until the expected number of clusters is detected and their
    maps are populated.

    Returns a single cluster ID when cluster_count is 1, otherwise a
    list of cluster IDs.
    """
    self.calamari_ctl.authorize_keys(self.ceph_ctl.get_server_fqdns())
    log.debug("Authorized keys")

    # After the keys are authorized, the first mon to retry its salt
    # authentication causes the cluster to be noticed.
    salt_auth_retry_interval = 10
    scalable_wait_until_true(
        lambda: self._cluster_detected(cluster_count),
        timeout=2 * salt_auth_retry_interval)
    log.debug("Detected cluster")

    if cluster_count == 1:
        only_id = self.api.get("cluster").json()[0]['id']
        scalable_wait_until_true(lambda: self._maps_populated(only_id))
        return only_id

    ids = []
    for record in self.api.get("cluster").json():
        cid = record['id']
        scalable_wait_until_true(lambda: self._maps_populated(cid))
        ids.append(cid)
    return ids
def _wait_for_cluster(self, cluster_count=1):
    """
    Block until cluster_count clusters are detected with maps
    populated; return an ID if cluster_count is 1, else a list of IDs.
    """
    self.calamari_ctl.authorize_keys(self.ceph_ctl.get_server_fqdns())
    log.debug("Authorized keys")

    # Once keys are authorized, detection happens as soon as the first
    # mon retries its salt authentication.
    salt_auth_retry_interval = 10
    scalable_wait_until_true(lambda: self._cluster_detected(cluster_count),
                             timeout=salt_auth_retry_interval * 2)
    log.debug("Detected cluster")

    def _await_maps(cid):
        scalable_wait_until_true(lambda: self._maps_populated(cid))
        return cid

    if cluster_count == 1:
        return _await_maps(self.api.get("cluster").json()[0]['id'])
    else:
        return [_await_maps(c['id'])
                for c in self.api.get("cluster").json()]
def _wait_for_request(self, request_id, timeout=None, check=True):
    """
    Poll until the request identified by *request_id* has completed.

    :param check: If true, we raise an exception on requests that fail
    """
    def _done():
        return self._request_complete(request_id, check=check)

    scalable_wait_until_true(_done, timeout=timeout)