def domain_n_preserve_remove_node_test(self):
    """Destroy one node of a three-node domain and wait for the count to recover.

    A domain of three nodes is launched on the site "site1". Once three
    nodes are RUNNING, a randomly chosen valid node is destroyed and the test
    waits until the number of valid nodes equals three again, then removes
    the domain and waits for every node to be terminated.
    """
    site_name = "site1"
    fake_site, lc = self.make_fake_libcloud_site(site_name)
    dt_name = self._load_dtrs(site_name, fake_site)

    node_count = 3
    domain_conf = _make_domain_def(node_count, dt_name, site_name)

    definition_id = str(uuid.uuid4())
    self.epum_client.add_domain_definition(definition_id, example_definition)

    domain_id = str(uuid.uuid4())
    log.debug("Launching domain %s", domain_id)
    self.epum_client.add_domain(domain_id, definition_id, domain_conf,
                                caller=self.user)

    self._wait_states(node_count, lc, states=[NodeState.RUNNING])

    # pause so the query thread has a chance to detect all VMs
    time.sleep(10)

    victim = random.choice(get_valid_nodes(lc))
    lc.destroy_node(victim)

    def count_restored():
        return len(get_valid_nodes(lc)) == node_count

    wait(count_restored, timeout=60)

    self._wait_remove_domain(domain_id)
    self._wait_for_all_terminated(lc)
def test_initialized_system_boot_with_procs(self):
    """Doctor restart mid-boot, after initialization, with pending processes.

    Covers the case where only the doctor dies during system boot, after a
    doctor has already declared the system initialized. The store then holds
    processes in UNSCHEDULED_PENDING that should be rescheduled once system
    boot ends.
    """
    self.store.set_system_boot(True)
    self.store.set_initialized()
    self.store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

    restartable_procs = ["proc0", "proc1", "proc2"]
    for upid in restartable_procs:
        record = ProcessRecord.new(None, upid, {},
                                   ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(record)

    self._run_in_thread()

    # end system boot and wait for the dispatcher state to become OK
    self.store.set_system_boot(False)

    def dispatcher_ok():
        return self.store.get_pd_state() == ProcessDispatcherState.OK

    wait(dispatcher_ok)

    # every pending process must have been queued and moved to REQUESTED
    queued = self.store.get_queued_processes()
    self.assertEqual(len(queued), len(restartable_procs))
    for upid in restartable_procs:
        process = self.store.get_process(None, upid)
        self.assertEqual(process.state, ProcessState.REQUESTED)
def domain_n_preserve_alter_state_test(self):
    """Mark one node TERMINATED and wait for three RUNNING/PENDING nodes again.

    After the domain is removed, every node still listed by the fake site
    must be in the TERMINATED state.
    """
    site_name = uuid.uuid4().hex
    fake_site, lc = self.make_fake_libcloud_site(site_name)
    dt_name = self._load_dtrs(site_name, fake_site)

    node_count = 3
    domain_conf = _make_domain_def(node_count, dt_name, site_name)

    definition_id = str(uuid.uuid4())
    self.epum_client.add_domain_definition(definition_id, example_definition)

    domain_id = str(uuid.uuid4())
    log.debug("Launching domain %s", domain_id)
    self.epum_client.add_domain(domain_id, definition_id, domain_conf,
                                caller=self.user)

    def all_nodes_valid():
        return len(get_valid_nodes(lc)) == node_count

    wait(all_nodes_valid, timeout=60)

    first_node = get_valid_nodes(lc)[0]
    lc.set_node_state(first_node, NodeState.TERMINATED)

    live_states = [NodeState.RUNNING, NodeState.PENDING]
    self._wait_states(node_count, lc, states=live_states)

    print("terminating")
    self._wait_remove_domain(domain_id)

    # any node that is still listed must be terminated
    for remaining in lc.list_nodes(immediate=True):
        self.assertEqual(remaining.state, NodeState.TERMINATED)
def test_uninitialized_system_boot_without_state(self):
    """Boot with the system-boot flag set and nothing else in the store.

    After self._run_in_thread() the store must report initialization within
    the wait_initialized timeout and the dispatcher state must be
    SYSTEM_BOOTING. Clearing the system-boot flag must then lead to the OK
    state.
    """
    self.store.set_system_boot(True)
    self._run_in_thread()

    # assertTrue instead of a bare assert: with ``python -O`` an assert
    # statement is removed, and the blocking wait inside it would be skipped
    self.assertTrue(self.store.wait_initialized(timeout=10))
    self.assertEqual(self.store.get_pd_state(),
                     ProcessDispatcherState.SYSTEM_BOOTING)

    # now end system boot
    self.store.set_system_boot(False)
    wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)
def _wait_for_domain(self, domain_id):
    """Wait (timeout=30) until describe_domain returns a non-None result.

    A NotFoundError raised by the client counts as "not there yet".
    """
    def domain_exists():
        try:
            described = self.epum_client.describe_domain(domain_id)
        except NotFoundError:
            return False
        return described is not None

    wait(domain_exists, timeout=30)
def _wait_for_all_terminated(self, lc):
    """Wait (timeout=60) until every node listed by ``lc`` is TERMINATED.

    On wait.TimeOutWaitingFor the test is failed with the ids of the nodes
    that are not terminated.
    """
    def everything_terminated():
        for node in lc.list_nodes(immediate=True):
            if node.state == NodeState.TERMINATED:
                continue
            return False
        return True

    try:
        wait(everything_terminated, timeout=60)
    except wait.TimeOutWaitingFor:
        leftover_ids = [
            node.id
            for node in lc.list_nodes(immediate=True)
            if node.state != NodeState.TERMINATED
        ]
        self.fail("Timed out waiting for all nodes to be terminated. Remaining: %s" % leftover_ids)
def _wait_remove_many_domains(self, domains, delay=0.1):
    """Remove every domain in ``domains`` and wait until none is listed.

    ``delay`` is forwarded to wait() as its ``wait`` argument. On
    wait.TimeOutWaitingFor the test is failed with the domains still listed.
    """
    domains = set(domains)
    for domain_id in domains:
        self.epum_client.remove_domain(domain_id)

    def still_listed():
        # the requested domains that the client still reports
        return domains & set(self.epum_client.list_domains())

    def all_gone():
        return len(still_listed()) == 0

    try:
        wait(all_gone, timeout=60, wait=delay)
    except wait.TimeOutWaitingFor:
        remaining = still_listed()
        self.fail("Timed out waiting for domains to exit. domains: %s" % list(remaining))
def _get_contender(self, path, ndx=0):
    """Return the (name, hostname, pid) tuple of contender ``ndx`` at ``path``.

    Waits (timeout=20) until the election reports exactly
    self.epum_replica_count contenders, then splits the chosen entry on ':'
    and converts the pid field to int.
    """
    assert ndx < self.epum_replica_count

    election = self.kazoo.Election(path)
    found = []

    def all_contenders_present():
        # refresh in place so the enclosing scope sees the latest listing
        found[:] = election.contenders()
        return len(found) == self.epum_replica_count

    # contenders may take a while to emerge, so poll for them
    wait(all_contenders_present, timeout=20)

    entry = found[ndx]
    name, hostname, pid = entry.split(':')
    return name, hostname, int(pid)
def test_monitor_thread(self):
    """The doctor's monitor thread runs while the doctor does and stops with it.

    After initialization the dispatcher state must be OK, self.doctor.monitor
    and self.doctor.monitor_thread must be set, and the thread must be alive.
    Cancelling the doctor must lead to the thread no longer being alive.
    """
    self._run_in_thread()

    # assertTrue instead of a bare assert: with ``python -O`` an assert
    # statement is removed, and the blocking wait inside it would be skipped
    self.assertTrue(self.store.wait_initialized(timeout=10))
    self.assertEqual(self.store.get_pd_state(), ProcessDispatcherState.OK)

    self.assertIsNotNone(self.doctor.monitor)
    monitor_thread = self.doctor.monitor_thread
    self.assertIsNotNone(monitor_thread)
    self.assertTrue(monitor_thread.is_alive())

    # now cancel doctor. monitor should stop too
    self.doctor.cancel()
    wait(lambda: not monitor_thread.is_alive())
def test_initialized_system_boot_without_procs(self):
    """Doctor restart mid-boot, after initialization, with nothing to schedule.

    Covers the case where only the doctor dies during system boot, after a
    doctor has already declared the system initialized, and there are no
    processes to schedule when system boot completes.
    """
    store = self.store
    store.set_system_boot(True)
    store.set_initialized()
    store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

    self._run_in_thread()

    # end system boot and wait for the dispatcher state to become OK
    store.set_system_boot(False)

    def dispatcher_ok():
        return self.store.get_pd_state() == ProcessDispatcherState.OK

    wait(dispatcher_ok)
def domain_add_check_n_remove_test(self):
    """Launch a three-node domain, wait for three valid nodes, then remove it.

    Finishes by waiting for every node on the fake site to be terminated.
    """
    site_name = uuid.uuid4().hex
    fake_site, lc = self.make_fake_libcloud_site(site_name)
    dt_name = self._load_dtrs(site_name, fake_site)

    node_count = 3
    domain_conf = _make_domain_def(node_count, dt_name, site_name)

    definition_id = str(uuid.uuid4())
    self.epum_client.add_domain_definition(definition_id, example_definition)

    domain_id = str(uuid.uuid4())
    log.debug("Launching domain %s", domain_id)
    self.epum_client.add_domain(domain_id, definition_id, domain_conf,
                                caller=self.user)

    def all_nodes_valid():
        return len(get_valid_nodes(lc)) == node_count

    wait(all_nodes_valid, timeout=60)

    self._wait_remove_domain(domain_id)
    self._wait_for_all_terminated(lc)
def _wait_states(self, n, lc, states=None, timeout=20, delay=0.1):
    """Wait until exactly ``n`` nodes listed by ``lc`` are in one of ``states``.

    ``states`` defaults to [NodeState.RUNNING, NodeState.PENDING]. Each poll
    prints a per-state tally of the nodes found. ``timeout`` is forwarded to
    wait() unchanged and ``delay`` is passed as its ``wait`` argument.
    """
    if states is None:
        states = [NodeState.RUNNING, NodeState.PENDING]

    print("Waiting for %d nodes in states: %s" % (n, states))

    def wait_running_count():
        listed = lc.list_nodes(immediate=True)

        # tally nodes per state; used for the progress line and the match count
        tally = defaultdict(int)
        for node in listed:
            tally[node.state] += 1

        matching = 0
        for state, count in tally.items():
            if state in states:
                matching += count

        summary = " ".join(["%s:%s" % (state, count)
                            for state, count in tally.items()])
        print("Found %d nodes in states: %s" % (len(listed), summary))
        return matching == n

    wait(wait_running_count, timeout=timeout, wait=delay)
# Start ``n`` domains, then drive provisioner_client.terminate_all() until
# everything is gone, calling self._kill_cb at several points along the way.
#
# Steps visible in the body:
#   * registers example_definition under a random uuid name and builds one
#     domain config with _example_domain(nodes_per_domain);
#   * adds domains named "dom0" .. "dom<n-1>" and checks that the sorted
#     list_domains() result equals the sorted list of started names;
#   * waits for n * nodes_per_domain libcloud nodes and for all domains;
#   * calls terminate_all() once and asserts the result is false, sleeps 2
#     seconds, then waits (timeout=20) on terminate_all itself;
#   * asserts get_valid_libcloud_nodes() is empty, removes every started
#     domain and waits for the domain set to be empty.
#
# ``test_pc`` starts at 1 and is replaced by the return value of each
# self._kill_cb(test_pc, places_to_kill, kill_func) call.
#
# NOTE(review): this definition is collapsed onto a single line, so it cannot
# be told from here whether the first _kill_cb call sits inside the ``for``
# loop or after it -- confirm against version control before reformatting.
def _add_many_domains_terminate_all(self, kill_func=None, places_to_kill=None, n=1, nodes_per_domain=3): test_pc = 1 def_name = str(uuid.uuid4()) self.epum_client.add_domain_definition(def_name, example_definition) domain = _example_domain(nodes_per_domain) domains_started = [] for i in range(n): name = "dom%d" % (i) self.epum_client.add_domain(name, def_name, domain) domains_started.append(name) test_pc = self._kill_cb(test_pc, places_to_kill, kill_func) domains = self.epum_client.list_domains() domains_started.sort() domains.sort() self.assertEqual(domains, domains_started) self.wait_for_libcloud_nodes(n * nodes_per_domain) test_pc = self._kill_cb(test_pc, places_to_kill, kill_func) self.wait_for_all_domains() state = self.provisioner_client.terminate_all() self.assertFalse(state) # cannot all be terminated this quickly test_pc = self._kill_cb(test_pc, places_to_kill, kill_func) # wait a little while until hopefully termination is underway time.sleep(2) test_pc = self._kill_cb(test_pc, places_to_kill, kill_func) # this will return true when everything is terminated wait(self.provisioner_client.terminate_all, timeout=20) self.assertFalse(self.get_valid_libcloud_nodes()) for name in domains_started: self.epum_client.remove_domain(name) self.wait_for_domain_set([])
def wait_for_domain_set(self, expected, timeout=30):
    """Wait until the set of listed domains equals ``set(expected)``.

    ``timeout`` is forwarded to wait() unchanged.
    """
    expected = set(expected)

    def domains_match():
        listed = set(self.epum_client.list_domains())
        return listed == expected

    wait(domains_match, timeout=timeout)
def test_uninitialized_system_boot_with_state(self):
    """Boot an uninitialized system whose store already holds processes.

    One node and one resource ("eeagent_1") are registered, then eight
    process records in assorted states and restart configurations are
    stored. After initialization the test expects no queued processes, node
    ids or resource ids; proc1/2/5/6 in UNSCHEDULED_PENDING; proc0/4/7 and
    proc3 TERMINATED; and the dispatcher in SYSTEM_BOOTING. Once system boot
    is cleared, the dispatcher must reach OK and the restartable processes
    must be queued in the REQUESTED state.
    """
    self.store.set_system_boot(True)

    self.core.node_state("node1", domain_id_from_engine("engine1"),
                         InstanceState.RUNNING)
    resource_id = "eeagent_1"
    self.core.ee_heartbeat(resource_id, make_beat("node1"))

    p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.RUNNING,
                           configuration=nosystemrestart_process_config(),
                           assigned=resource_id,
                           restart_mode=RestartMode.ALWAYS)
    self.store.add_process(p0)
    p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING,
                           assigned=resource_id)
    self.store.add_process(p1)
    p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.PENDING,
                           assigned=resource_id)
    self.store.add_process(p2)
    p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATING,
                           assigned=resource_id)
    self.store.add_process(p3)

    # this one shouldn't restart
    p4 = ProcessRecord.new(None, "proc4", {}, ProcessState.RUNNING,
                           configuration=nosystemrestart_process_config(),
                           assigned=resource_id,
                           restart_mode=RestartMode.ABNORMAL)
    self.store.add_process(p4)

    # non-running processes should also potentially be restarted on boot
    p5 = ProcessRecord.new(None, "proc5", {}, ProcessState.WAITING)
    self.store.add_process(p5)
    self.store.enqueue_process(*p5.key)
    p6 = ProcessRecord.new(None, "proc6", {}, ProcessState.REQUESTED)
    self.store.add_process(p6)

    # not this one, due to RestartMode
    p7 = ProcessRecord.new(None, "proc7", {}, ProcessState.REQUESTED,
                           configuration=nosystemrestart_process_config(),
                           restart_mode=RestartMode.ALWAYS)
    self.store.add_process(p7)
    self.store.enqueue_process(*p7.key)

    resource = self.store.get_resource(resource_id)
    resource.assigned = [p0.key, p1.key, p2.key, p3.key, p4.key]
    self.store.update_resource(resource)

    restartable_procs = ["proc1", "proc2", "proc5", "proc6"]
    dead_procs = ["proc0", "proc4", "proc7"]

    self._run_in_thread()

    # assertTrue instead of a bare assert: with ``python -O`` an assert
    # statement is removed, and the blocking wait inside it would be skipped
    self.assertTrue(self.store.wait_initialized(timeout=10))

    self.assertEqual(len(self.store.get_queued_processes()), 0)
    self.assertEqual(len(self.store.get_node_ids()), 0)
    self.assertEqual(len(self.store.get_resource_ids()), 0)

    for proc in restartable_procs:
        self.assertEqual(self.store.get_process(None, proc).state,
                         ProcessState.UNSCHEDULED_PENDING)
    for proc in dead_procs:
        self.assertEqual(self.store.get_process(None, proc).state,
                         ProcessState.TERMINATED)

    self.assertEqual(self.store.get_process(None, "proc3").state,
                     ProcessState.TERMINATED)

    self.assertEqual(self.store.get_pd_state(),
                     ProcessDispatcherState.SYSTEM_BOOTING)

    # now end system boot
    self.store.set_system_boot(False)

    wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

    # check that pending processes were correctly rescheduled
    self.assertEqual(len(self.store.get_queued_processes()),
                     len(restartable_procs))
    for proc in restartable_procs:
        self.assertEqual(self.store.get_process(None, proc).state,
                         ProcessState.REQUESTED)
def domain_sensor_engine_test(self):
    """Drive a sensor-configured domain between its minimum and maximum size.

    The domain starts with the scale-down sensor data, is reconfigured with
    the scale-up data until three nodes are valid, and back down to one.
    Finally the scale-up data is applied with a cooldown_period of 100; after
    a 10 second sleep the node count must still be the minimum, and lowering
    the cooldown_period to 10 must let the domain reach the maximum again.
    """
    site = uuid.uuid4().hex
    fake_site, lc = self.make_fake_libcloud_site(site)
    dt_name = self._load_dtrs(site, fake_site)

    minimum_n, maximum_n = 1, 3
    scale_up_threshold, scale_up_n_vms = 2.0, 1
    scale_down_threshold, scale_down_n_vms = 0.5, 1
    scale_down_sensor_data = [0, 0, 0]
    scale_up_sensor_data = [3, 3, 5]
    metric = 'load'
    sample_function = 'Average'

    dt = _make_sensor_domain_def(
        metric, sample_function, minimum_n, maximum_n,
        scale_up_threshold, scale_up_n_vms,
        scale_down_threshold, scale_down_n_vms,
        scale_down_sensor_data, dt_name, site)

    def_id = str(uuid.uuid4())
    self.epum_client.add_domain_definition(def_id, sensor_definition)

    domain_id = str(uuid.uuid4())
    log.debug("Launching domain %s", domain_id)
    self.epum_client.add_domain(domain_id, def_id, dt, caller=self.user)

    def valid_count():
        return len(get_valid_nodes(lc))

    def reconfigure(engine_conf):
        self.epum_client.reconfigure_domain(
            domain_id, {'engine_conf': engine_conf}, caller=self.user)

    # the domain must reach at least its minimum size
    wait(lambda: valid_count() >= minimum_n, timeout=60)

    # scale up to the maximum
    print("reconfiguring with sensor data: %s" % scale_up_sensor_data)
    reconfigure({'sensor_data': scale_up_sensor_data})
    wait(lambda: valid_count() == maximum_n, timeout=60)

    # scale back down to the minimum
    print("reconfiguring with sensor data: %s" % scale_down_sensor_data)
    reconfigure({'sensor_data': scale_down_sensor_data})
    wait(lambda: valid_count() == minimum_n, timeout=60)

    # cooldown: scale-up data is supplied together with a long cooldown period
    reconfigure({
        'sensor_data': scale_up_sensor_data,
        'cooldown_period': 100,
    })

    # allow 10s for a few decide cycles to happen
    time.sleep(10)

    # the domain must still be at its minimum size
    nodes = get_valid_nodes(lc)
    self.assertEqual(len(nodes), minimum_n)

    # lower the cooldown to 10s, which have already passed
    reconfigure({'cooldown_period': 10})

    # and watch it scale up
    wait(lambda: valid_count() == maximum_n, timeout=60)

    self._wait_remove_domain(domain_id)
    self._wait_for_all_terminated(lc)
def wait_for_libcloud_nodes(self, count, timeout=60):
    """Wait until self.get_valid_nodes() reports exactly ``count`` nodes.

    ``timeout`` is forwarded to wait() unchanged.

    Returns the node listing that satisfied the condition. The listing is
    not queried again after the wait succeeds, so the returned value is the
    one whose length was checked against ``count``.
    """
    # NOTE(review): _add_many_domains_terminate_all calls
    # self.get_valid_libcloud_nodes(); confirm that get_valid_nodes is the
    # intended helper here.

    # one-slot holder so the nested function can publish its last listing
    latest = [None]

    def have_expected_count():
        latest[0] = self.get_valid_nodes()
        return len(latest[0]) == count

    wait(have_expected_count, timeout=timeout)
    return latest[0]
def wait_for_all_domains(self, timeout=30):
    """Hand self.verify_all_domain_instances to wait() with ``timeout``."""
    check_instances = self.verify_all_domain_instances
    wait(check_instances, timeout=timeout)