Example #1
0
    def test_mon_down(self):
        """
        Check Calamari's reaction to loss of contact with
        individual mon servers in a Ceph cluster.

        - The cluster state should continue to be updated
          as long as there is a mon quorum and
          one mon is available to calamari.

        The test is skipped when the cluster has fewer than three mons.
        Each mon is taken dark in turn and is always brought back, even
        when the check for that mon fails, so that one failure does not
        leave the cluster degraded for whatever runs next.
        """
        cluster_id = self._wait_for_cluster()
        mon_fqdns = self.ceph_ctl.get_service_fqdns(cluster_id, 'mon')
        if len(mon_fqdns) < 3:
            raise SkipTest("Not enough monitors to test one down")

        def update_time():
            # Current 'update_time' field of the cluster as reported by the REST API.
            return self.api.get("cluster/%s" % cluster_id).json()['update_time']

        # I don't know which if any of the mons the calamari server
        # might be preferentially accepting data from, but I want
        # to ensure that it can survive any of them going away.
        for mon_fqdn in mon_fqdns:
            self.ceph_ctl.go_dark(cluster_id, minion_id=mon_fqdn)
            try:
                last_update_time = update_time()

                # This will give a timeout exception if calamari did not
                # re establish monitoring after the mon server went offline.
                try:
                    scalable_wait_until_true(lambda: last_update_time != update_time(),
                                             timeout=NEW_FAVORITE_TIMEOUT)
                except WaitTimeout:
                    self.fail("Failed to recover from killing %s in %s seconds" % (
                        mon_fqdn, NEW_FAVORITE_TIMEOUT))
            finally:
                # Restore the mon regardless of the outcome: self.fail() raises,
                # and without this the mon would stay dark after a failure.
                self.ceph_ctl.go_dark(cluster_id, dark=False, minion_id=mon_fqdn)
Example #2
0
 def _wait_for_completion(self, fsid, response, timeout=None):
     """
     Block until the user request started by a PATCH/POST/DELETE has completed.

     :param fsid: forwarded to ``_request_complete`` together with the request ID
     :param response: the HTTP response to the PATCH/POST/DELETE; its status code
                      is asserted to be 202 and its JSON body must contain 'request_id'
     :param timeout: forwarded to ``scalable_wait_until_true``; REQUEST_TIMEOUT
                     is used when this is None
     """
     effective_timeout = REQUEST_TIMEOUT if timeout is None else timeout

     # Anything other than "202 Accepted" means there is no request to wait for.
     self.assertEqual(response.status_code, 202)
     request_id = response.json()['request_id']

     def request_finished():
         return self._request_complete(fsid, request_id)

     scalable_wait_until_true(request_finished, timeout=effective_timeout)
Example #3
0
    def _wait_for_servers(self):
        """
        Block until the set of managed servers reported by the REST API
        matches exactly the set of FQDNs reported by ``ceph_ctl``.
        """
        expected_fqdns = set(self.ceph_ctl.get_server_fqdns())

        def servers_available():
            # One poll: compare the managed servers currently visible with the expected set.
            visible = self.api.get("server").json()
            managed = [entry for entry in visible if entry['managed']]
            managed_fqdns = set(entry['fqdn'] for entry in managed)

            if managed_fqdns == expected_fqdns:
                return True

            log.debug("_wait_for_servers: {0} ({1} managed) servers visible vs. {2} expected".format(
                len(visible), len(managed), len(expected_fqdns)
            ))
            return False

        scalable_wait_until_true(servers_available, timeout=30)
Example #4
0
    def _wait_for_cluster(self, cluster_count=1):
        """
        Authorize the servers' keys, wait for the cluster(s) to be detected,
        then wait for the maps of each cluster to be populated.

        :param cluster_count: forwarded to ``_cluster_detected``
        :return: a single cluster ID if cluster_count is 1, else a list of IDs
        """
        self.calamari_ctl.authorize_keys(self.ceph_ctl.get_server_fqdns())
        log.debug("Authorized keys")

        # Once the keys are authorized, the first mon to retry its salt
        # authentication will cause the cluster to get noticed, so allow
        # two retry intervals for detection.
        salt_auth_retry_interval = 10
        detection_timeout = salt_auth_retry_interval * 2

        scalable_wait_until_true(lambda: self._cluster_detected(cluster_count), timeout=detection_timeout)
        log.debug("Detected cluster")

        clusters = self.api.get("cluster").json()

        if cluster_count != 1:
            cluster_ids = []
            for cluster in clusters:
                cluster_id = cluster['id']
                scalable_wait_until_true(lambda: self._maps_populated(cluster_id))
                cluster_ids.append(cluster_id)
            return cluster_ids

        cluster_id = clusters[0]['id']
        scalable_wait_until_true(lambda: self._maps_populated(cluster_id))
        return cluster_id
Example #5
0
 def _wait_for_request(self, request_id, timeout=None, check=True):
     """
     Block until ``_request_complete`` reports the given request as complete.

     :param request_id: forwarded to ``_request_complete``
     :param timeout: forwarded to ``scalable_wait_until_true``
     :param check: If true, we raise an exception on requests that fail
     """
     def request_done():
         return self._request_complete(request_id, check=check)

     scalable_wait_until_true(request_done, timeout=timeout)