import glob
import os
import shutil
import signal
import subprocess
import time
import xmlrpclib

from ConfigParser import ConfigParser

import psutil

# wait_until_true, WaitTimeout, log, TREE_ROOT and the timeout/period
# constants (LOSE_CONTACT_TIMEOUT, NEXT_HEARTBEAT_TIMEOUT,
# OSD_RECOVERY_PERIOD, KEY_WAIT_PERIOD) are defined elsewhere in the
# test harness.


def restart(self):
    processes = [ps['group'] for ps in self._rpc.supervisor.getAllProcessInfo()]
    for ps in processes:
        self._rpc.supervisor.stopProcessGroup(ps)
    for ps in processes:
        self._rpc.supervisor.startProcessGroup(ps)
    wait_until_true(self._services_up)
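# restart() and the other methods below lean heavily on the harness's
# wait_until_true helper. A minimal sketch of what it is assumed to do,
# given how it is called in this file; the default timeout and poll
# interval here are guesses, not the harness's real values.


class WaitTimeout(Exception):
    pass


def wait_until_true(condition, timeout=30, period=1):
    """Poll ``condition`` until it returns a truthy value, or raise
    WaitTimeout once ``timeout`` seconds have elapsed."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if condition():
            return
        time.sleep(period)
    raise WaitTimeout("condition not met within %s seconds" % timeout)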
def test_cluster_down(self):
    """
    Check Calamari's reaction to total loss of contact with a Ceph
    cluster being monitored.

    - The cluster update time should stop getting incremented
    - The system should recover promptly when the cluster comes back online.
    """
    cluster_id = self._wait_for_cluster()

    def update_time():
        return self.api.get("cluster/%s" % cluster_id).json()['update_time']

    # Lose contact with the cluster
    self.ceph_ctl.go_dark(cluster_id)
    initial_update_time = update_time()

    log.debug("Sleeping for %s seconds, don't panic!" % LOSE_CONTACT_TIMEOUT)
    time.sleep(LOSE_CONTACT_TIMEOUT)

    # The update time should not have been incremented
    self.assertEqual(initial_update_time, update_time())

    # Regain contact with the cluster
    self.ceph_ctl.go_dark(cluster_id, dark=False)

    # The update time should start incrementing again
    wait_until_true(lambda: update_time() != initial_update_time,
                    timeout=NEXT_HEARTBEAT_TIMEOUT)
    self.assertNotEqual(initial_update_time, update_time())
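# test_cluster_down() and test_osd_out() drive the cluster through
# self.ceph_ctl. Inferred from the call sites in this file, the driver
# interface looks roughly like the sketch below; the concrete class
# (a simulator or external-cluster driver) is supplied by the test
# harness, and the method bodies here are placeholders.


class CephControl(object):
    """Abstract driver for manipulating the Ceph cluster under test."""

    def go_dark(self, fsid, dark=True):
        """Sever (dark=True) or restore (dark=False) the cluster's
        communication with Calamari."""
        raise NotImplementedError()

    def mark_osd_in(self, fsid, osd_id, osd_in=True):
        """Mark an OSD in (osd_in=True) or out (osd_in=False)."""
        raise NotImplementedError()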
def test_osd_out(self):
    """
    Check Calamari's reaction to an OSD going down:

    - The OSD map should be updated
    - The health information should be updated and indicate a warning
    """
    cluster_id = self._wait_for_cluster()

    # Pick an OSD and check its initial status
    osd_id = 0
    osd_url = "cluster/{0}/osd/{1}".format(cluster_id, osd_id)

    # Check it's initially up and in
    initial_osd_status = self.api.get(osd_url).json()
    self.assertEqual(initial_osd_status['up'], True)
    self.assertEqual(initial_osd_status['in'], True)

    # Cause it to 'spontaneously' (as far as calamari is concerned)
    # be marked out
    self.ceph_ctl.mark_osd_in(cluster_id, osd_id, False)

    # Wait for the status to filter up to the REST API
    wait_until_true(lambda: self.api.get(osd_url).json()['in'] is False,
                    timeout=NEXT_HEARTBEAT_TIMEOUT)

    # Wait for the health status to reflect the degradation
    # NB this is actually a bit racy, because we assume the PGs remain degraded long enough
    # to affect the health state: in theory they could all get remapped instantaneously, in
    # which case the cluster would never appear unhealthy and this would be an invalid check.
    health_url = "cluster/{0}/sync_object/health".format(cluster_id)

    def check():
        # Fetch once, so the status code and body refer to the same response
        response = self.api.get(health_url)
        status = response.json()['overall_status']
        log.debug("health status: %s" % status)
        return response.status_code == 200 and status == "HEALTH_WARN"

    wait_until_true(check, timeout=NEXT_HEARTBEAT_TIMEOUT)

    # Bring the OSD back into the cluster
    self.ceph_ctl.mark_osd_in(cluster_id, osd_id, True)

    # Wait for the status
    wait_until_true(lambda: self.api.get(osd_url).json()['in'] is True,
                    timeout=NEXT_HEARTBEAT_TIMEOUT)

    # Wait for the health
    # This can take a long time, because it has to wait for PGs to fully recover
    wait_until_true(lambda: self.api.get(health_url).json()['overall_status'] == "HEALTH_OK",
                    timeout=OSD_RECOVERY_PERIOD * 2)
def authorize_keys(self, minion_ids):
    def _fqdns_present():
        found_ids = [m['id'] for m in self.api.get("key").json()]
        all_present = len(set(minion_ids) & set(found_ids)) == len(minion_ids)
        log.debug("checking keys, looking for %s found %s (%s)" % (
            minion_ids, found_ids, all_present))
        return all_present

    wait_until_true(_fqdns_present, timeout=KEY_WAIT_PERIOD * len(minion_ids))

    for minion_id in minion_ids:
        if self.api.get("key/%s" % minion_id).json()['status'] == 'accepted':
            # Skip already-accepted minions (happens when running against
            # an external calamari instance)
            log.debug("Key for %s is already authorised" % minion_id)
            continue

        log.debug("Authorising key for %s" % minion_id)
        r = self.api.patch("key/%s" % minion_id, {'status': 'accepted'})
        r.raise_for_status()
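# The self.api object used throughout behaves like a thin wrapper over
# requests, returning response objects with .json(), .status_code and
# .raise_for_status(). A hypothetical minimal equivalent; the base URL
# and the absence of authentication are assumptions, not the harness's
# real configuration.
import json

import requests


class ApiClient(object):
    def __init__(self, base_url="http://localhost:8000/api/v2/"):
        self._base = base_url
        self._session = requests.Session()

    def get(self, path):
        return self._session.get(self._base + path)

    def patch(self, path, data):
        return self._session.patch(
            self._base + path,
            data=json.dumps(data),
            headers={'Content-Type': 'application/json'})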
def start(self):
    def _is_stale(ps):
        names = [
            "bin/salt-master",
            "bin/supervisord",
            "bin/cthulhu-manager",
            "calamari/manage.py",
            "bin/carbon-cache.py"
        ]

        try:
            cmdline = ps.cmdline()
        except psutil.AccessDenied:
            return False

        if not cmdline:
            return False

        if "bin/python" not in cmdline[0]:
            return False

        for c in cmdline:
            for name in names:
                if c.endswith(name):
                    log.error("Stale {0} process: {1}".format(name, ps.pid))
                    return True

        return False

    if self._first:
        log.info("EmbeddedCalamariControl.start: clearing down salt")
        self._first = False

        # Clean out the salt master's caches to mitigate any confusion from
        # continually removing and adding servers with the same FQDNs.
        erase_paths = [
            "dev/var/cache/salt/master/*",
            "dev/var/run/salt/master/*",
            "dev/etc/salt/pki/*"
        ]
        for path in erase_paths:
            for f in glob.glob(os.path.join(TREE_ROOT, path)):
                if os.path.isdir(f):
                    shutil.rmtree(f)
                else:
                    os.unlink(f)

        lingering_salt = [p for p in psutil.process_iter() if _is_stale(p)]
        for p in lingering_salt:
            log.warn("Killing stale process: %s" % p.pid)
            p.kill()

    config_path = os.path.join(TREE_ROOT, "dev/supervisord.conf")
    assert os.path.exists(config_path)
    self._ps = subprocess.Popen(
        ["supervisord", "-n", "-c", config_path],
        cwd=os.path.abspath(TREE_ROOT),
        stdout=open("supervisord.out.log", 'w'),
        stderr=open("supervisord.err.log", 'w')
    )
    if not self._ps:
        raise RuntimeError("Failed to launch supervisor")

    config = ConfigParser()
    config.read(config_path)
    xmlrpc_addr = config.get('inet_http_server', 'port')
    self._rpc = xmlrpclib.ServerProxy("http://%s/RPC2" % xmlrpc_addr)

    try:
        # Wait for supervisor to start responding to RPC
        wait_until_true(self._available)

        # Wait for all supervisor's children to start
        wait_until_true(self._services_up)
    except:
        # Ensure that failures during startup do not leave a
        # zombie supervisor process
        log.error("Exception during setup, killing supervisor")
        self._ps.send_signal(signal.SIGINT)
        try:
            wait_until_true(lambda: self._ps.poll() is not None)
        except WaitTimeout:
            log.error("Supervisor isn't dying, sending it KILL")
            self._ps.send_signal(signal.SIGKILL)
        self._ps.wait()
        raise

    # The calamari REST API goes through a brief period between process
    # startup and servicing connections
    wait_until_true(self._api_connectable)

    # Calamari REST API will return 503s until the backend is fully up
    # and responding to ZeroRPC requests.
    wait_until_true(lambda: self.api.get("cluster").status_code != 503,
                    timeout=30)

    # Because we are embedded, we should act like a fresh instance
    # and not let any old keys linger
    self.clear_keys()
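# start() waits on self._available and self._services_up. Hedged sketches
# of those predicates, assuming supervisor's standard XML-RPC interface
# (getState / getAllProcessInfo); the real methods live on
# EmbeddedCalamariControl and may differ in detail.
import socket


def _available(self):
    # True once supervisord's XML-RPC server answers at all
    try:
        self._rpc.supervisor.getState()
        return True
    except socket.error:
        return False


def _services_up(self):
    # True once every supervisor-managed child reports RUNNING
    states = [ps['statename'] for ps in self._rpc.supervisor.getAllProcessInfo()]
    return all(s == 'RUNNING' for s in states)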