Example #1
0
    def domain_n_preserve_remove_node_test(self):
        # Launch a domain built with _make_domain_def(n, ...), wait for n
        # RUNNING nodes, destroy one randomly chosen node directly through the
        # libcloud driver and wait for the valid node count to get back to n.
        # Finally remove the domain and wait for every node to be terminated.

        # Unique site name per run, like the sibling domain tests, instead of
        # a fixed "site1" that every run of this test would share.
        site = uuid.uuid4().hex
        fake_site, lc = self.make_fake_libcloud_site(site)
        dt_name = self._load_dtrs(site, fake_site)

        n = 3
        dt = _make_domain_def(n, dt_name, site)
        def_id = str(uuid.uuid4())
        self.epum_client.add_domain_definition(def_id, example_definition)
        domain_id = str(uuid.uuid4())
        log.debug("Launching domain %s", domain_id)

        self.epum_client.add_domain(domain_id, def_id, dt, caller=self.user)
        self._wait_states(n, lc, states=[NodeState.RUNNING])

        # wait a little while to make sure query thread detects all VMs
        time.sleep(10)

        # kill one node behind the domain's back
        nodes = get_valid_nodes(lc)
        lc.destroy_node(random.choice(nodes))

        # the valid node count is expected to return to n
        wait(lambda: len(get_valid_nodes(lc)) == n, timeout=60)

        self._wait_remove_domain(domain_id)
        self._wait_for_all_terminated(lc)
Example #2
0
    def test_initialized_system_boot_with_procs(self):
        # Scenario: only the doctor dies partway through system boot, after a
        # doctor has already declared the system initialized. Processes are
        # then sitting in UNSCHEDULED_PENDING and should be rescheduled as
        # soon as system boot ends.

        self.store.set_system_boot(True)
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

        restartable_procs = ["proc0", "proc1", "proc2"]
        for proc_name in restartable_procs:
            record = ProcessRecord.new(None, proc_name, {},
                                       ProcessState.UNSCHEDULED_PENDING)
            self.store.add_process(record)

        self._run_in_thread()

        # system boot is over now
        self.store.set_system_boot(False)

        def pd_state_is_ok():
            return self.store.get_pd_state() == ProcessDispatcherState.OK

        wait(pd_state_is_ok)

        # every pending process must have been queued and moved to REQUESTED
        queued = self.store.get_queued_processes()
        self.assertEqual(len(queued), len(restartable_procs))
        for proc_name in restartable_procs:
            process = self.store.get_process(None, proc_name)
            self.assertEqual(process.state, ProcessState.REQUESTED)
Example #3
0
    def domain_n_preserve_alter_state_test(self):
        # Launch a domain built with _make_domain_def(n, ...), wait for n valid
        # nodes, flip one node to TERMINATED directly in the fake libcloud
        # site and wait until n nodes are RUNNING or PENDING again. After the
        # domain is removed, every node still listed must be TERMINATED.

        site = uuid.uuid4().hex
        fake_site, lc = self.make_fake_libcloud_site(site)
        dt_name = self._load_dtrs(site, fake_site)

        n = 3
        dt = _make_domain_def(n, dt_name, site)
        def_id = str(uuid.uuid4())
        self.epum_client.add_domain_definition(def_id, example_definition)
        domain_id = str(uuid.uuid4())
        log.debug("Launching domain %s", domain_id)

        self.epum_client.add_domain(domain_id, def_id, dt, caller=self.user)
        wait(lambda: len(get_valid_nodes(lc)) == n, timeout=60)

        # change the state of one node behind the domain's back
        nodes = get_valid_nodes(lc)
        lc.set_node_state(nodes[0], NodeState.TERMINATED)

        self._wait_states(n, lc, states=[NodeState.RUNNING, NodeState.PENDING])

        # Parenthesized single-argument print: identical output with the
        # Python 2 print statement, and also valid syntax on Python 3.
        print("terminating")
        self._wait_remove_domain(domain_id)

        # check the node list
        nodes = lc.list_nodes(immediate=True)
        for nd in nodes:
            # verify that any node that is still around is terminated
            self.assertEqual(nd.state, NodeState.TERMINATED)
Example #4
0
    def test_uninitialized_system_boot_without_state(self):
        # With the system boot flag set and nothing else added to the store,
        # the store is expected to become initialized with the PD state at
        # SYSTEM_BOOTING; clearing the boot flag should then lead to OK.
        self.store.set_system_boot(True)
        self._run_in_thread()

        # assertTrue instead of a bare assert: the check is not stripped by
        # "python -O" and reports through the same unittest machinery as the
        # assertEqual below.
        self.assertTrue(self.store.wait_initialized(timeout=10))
        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.SYSTEM_BOOTING)

        # now end system boot
        self.store.set_system_boot(False)
        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)
Example #5
0
    def _wait_for_domain(self, domain_id):
        """Wait (``timeout=30``) until ``describe_domain`` yields the domain.

        A ``NotFoundError`` from the client, or a ``None`` description, counts
        as "not there yet".
        """
        def domain_is_described():
            try:
                description = self.epum_client.describe_domain(domain_id)
            except NotFoundError:
                return False
            return description is not None

        wait(domain_is_described, timeout=30)
Example #6
0
    def _wait_for_all_terminated(self, lc):
        """Wait until every node listed by ``lc`` is TERMINATED.

        ``lc.list_nodes(immediate=True)`` is checked through ``wait`` with
        ``timeout=60``. On ``wait.TimeOutWaitingFor`` the test is failed with
        the ids of the nodes that are not terminated.
        """
        def every_node_terminated():
            for node in lc.list_nodes(immediate=True):
                if node.state == NodeState.TERMINATED:
                    continue
                return False
            return True

        try:
            wait(every_node_terminated, timeout=60)
        except wait.TimeOutWaitingFor:
            leftover = []
            for node in lc.list_nodes(immediate=True):
                if node.state != NodeState.TERMINATED:
                    leftover.append(node.id)

            self.fail("Timed out waiting for all nodes to be terminated. Remaining: %s" % leftover)
 def _wait_remove_many_domains(self, domains, delay=0.1):
     """Remove each of ``domains`` and wait until none of them is listed.

     ``delay`` is handed to ``wait`` as its ``wait`` argument. If ``wait``
     raises ``wait.TimeOutWaitingFor`` the test is failed with the domains
     that are still listed.
     """
     pending = set(domains)
     for domain_id in pending:
         self.epum_client.remove_domain(domain_id)

     def still_listed():
         # the requested domains that the client still reports
         return pending & set(self.epum_client.list_domains())

     def all_gone():
         return len(still_listed()) == 0

     try:
         wait(all_gone, timeout=60, wait=delay)
     except wait.TimeOutWaitingFor:
         remaining = still_listed()
         self.fail("Timed out waiting for domains to exit. domains: %s" % list(remaining))
Example #8
0
    def _get_contender(self, path, ndx=0):
        """Return a (name, hostname, pid) tuple for one election contender.

        Waits (``timeout=20``) until the election at ``path`` reports exactly
        ``self.epum_replica_count`` contenders, then splits entry ``ndx`` on
        ``':'``. ``pid`` is returned as an int.
        """
        assert ndx < self.epum_replica_count
        election = self.kazoo.Election(path)
        found = []

        def all_contenders_present():
            # contenders may take a while to emerge, so re-read them each time
            found[:] = election.contenders()
            return len(found) == self.epum_replica_count

        wait(all_contenders_present, timeout=20)

        fields = found[ndx].split(':')
        name, hostname, pid = fields
        return name, hostname, int(pid)
Example #9
0
    def test_monitor_thread(self):
        # Once the store is initialized and the PD state is OK, the doctor is
        # expected to have a monitor and a live monitor thread; cancelling the
        # doctor should make that thread stop.
        self._run_in_thread()

        # assertTrue instead of a bare assert: the check is not stripped by
        # "python -O" and matches the unittest assertions used below.
        self.assertTrue(self.store.wait_initialized(timeout=10))
        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.OK)

        self.assertIsNotNone(self.doctor.monitor)
        monitor_thread = self.doctor.monitor_thread
        self.assertIsNotNone(monitor_thread)
        self.assertTrue(monitor_thread.is_alive())

        # now cancel doctor. monitor should stop too
        self.doctor.cancel()
        wait(lambda: not monitor_thread.is_alive())
Example #10
0
    def test_initialized_system_boot_without_procs(self):
        # Scenario: only the doctor dies partway through system boot, after a
        # doctor has already declared the system initialized. Here there is
        # nothing to schedule when system boot completes.

        self.store.set_system_boot(True)
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

        self._run_in_thread()

        # system boot is over now
        self.store.set_system_boot(False)

        def pd_state_is_ok():
            return self.store.get_pd_state() == ProcessDispatcherState.OK

        wait(pd_state_is_ok)
Example #11
0
    def domain_add_check_n_remove_test(self):
        # Add a domain built with _make_domain_def(3, ...) on a freshly named
        # fake libcloud site, wait for that many valid nodes, then remove the
        # domain and wait for every node to be terminated.
        site_name = uuid.uuid4().hex
        fake_site, driver = self.make_fake_libcloud_site(site_name)
        dt_name = self._load_dtrs(site_name, fake_site)

        node_count = 3
        domain_conf = _make_domain_def(node_count, dt_name, site_name)
        definition_id = str(uuid.uuid4())
        self.epum_client.add_domain_definition(definition_id, example_definition)
        domain_id = str(uuid.uuid4())

        log.debug("Launching domain %s", domain_id)

        self.epum_client.add_domain(domain_id, definition_id, domain_conf,
                                    caller=self.user)

        def has_expected_nodes():
            return len(get_valid_nodes(driver)) == node_count

        wait(has_expected_nodes, timeout=60)

        self._wait_remove_domain(domain_id)
        self._wait_for_all_terminated(driver)
Example #12
0
    def _wait_states(self, n, lc, states=None, timeout=20, delay=0.1):
        """Wait until exactly ``n`` nodes of ``lc`` are in one of ``states``.

        :param n: number of nodes that must be in an accepted state
        :param lc: object whose ``list_nodes(immediate=True)`` is polled
        :param states: accepted node states; ``None`` means RUNNING and PENDING
        :param timeout: handed to ``wait`` as ``timeout``
        :param delay: handed to ``wait`` as ``wait``

        Progress is printed on every poll. The print calls use the
        parenthesized single-argument form and ``dict.items()`` so the output
        is unchanged on Python 2 while the code is also valid on Python 3.
        """
        if states is None:
            states = [NodeState.RUNNING, NodeState.PENDING]

        print("Waiting for %d nodes in states: %s" % (n, states))

        def wait_running_count():
            nodes = lc.list_nodes(immediate=True)
            running_count = 0
            # histogram of every state seen, only used for the progress line
            found_states = defaultdict(int)
            for nd in nodes:
                found_states[nd.state] += 1
                if nd.state in states:
                    running_count += 1
            summary = " ".join("%s:%s" % pair for pair in found_states.items())
            print("Found %d nodes in states: %s" % (len(nodes), summary))
            return running_count == n

        wait(wait_running_count, timeout=timeout, wait=delay)
    def _add_many_domains_terminate_all(self, kill_func=None, places_to_kill=None, n=1, nodes_per_domain=3):
        """Start ``n`` domains, terminate everything, then remove the domains.

        Domains are named ``dom0`` .. ``dom<n-1>`` and all use one freshly
        added definition. ``self._kill_cb`` is invoked at several points with
        a counter, ``places_to_kill`` and ``kill_func``; its return value
        becomes the new counter.
        """
        step = 1
        definition_id = str(uuid.uuid4())
        self.epum_client.add_domain_definition(definition_id, example_definition)

        domain_conf = _example_domain(nodes_per_domain)
        started = []
        for index in range(n):
            domain_name = "dom%d" % index
            self.epum_client.add_domain(domain_name, definition_id, domain_conf)
            started.append(domain_name)

        step = self._kill_cb(step, places_to_kill, kill_func)

        # the client must list exactly the domains that were just added
        listed = self.epum_client.list_domains()
        started.sort()
        listed.sort()
        self.assertEqual(listed, started)

        self.wait_for_libcloud_nodes(n * nodes_per_domain)
        step = self._kill_cb(step, places_to_kill, kill_func)

        self.wait_for_all_domains()

        # cannot all be terminated this quickly
        self.assertFalse(self.provisioner_client.terminate_all())

        step = self._kill_cb(step, places_to_kill, kill_func)

        # give termination a moment to hopefully get underway
        time.sleep(2)
        step = self._kill_cb(step, places_to_kill, kill_func)

        # terminate_all returns true once everything is terminated
        wait(self.provisioner_client.terminate_all, timeout=20)

        self.assertFalse(self.get_valid_libcloud_nodes())

        for domain_name in started:
            self.epum_client.remove_domain(domain_name)
        self.wait_for_domain_set([])
Example #14
0
    def wait_for_domain_set(self, expected, timeout=30):
        """Wait until the client's domain list equals ``expected`` as a set.

        ``timeout`` is handed straight to ``wait``.
        """
        wanted = set(expected)

        def listing_matches():
            return set(self.epum_client.list_domains()) == wanted

        wait(listing_matches, timeout=timeout)
Example #15
0
    def test_uninitialized_system_boot_with_state(self):
        # Seed the store with a node, a resource and processes in assorted
        # states while the system boot flag is set. After initialization the
        # queue, nodes and resources are expected to be empty, restartable
        # processes UNSCHEDULED_PENDING and the others TERMINATED. Once system
        # boot ends, the restartable processes should be queued as REQUESTED.
        self.store.set_system_boot(True)
        self.core.node_state("node1", domain_id_from_engine("engine1"),
                             InstanceState.RUNNING)
        resource_id = "eeagent_1"
        self.core.ee_heartbeat(resource_id, make_beat("node1"))

        p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.RUNNING,
                               configuration=nosystemrestart_process_config(),
                               assigned=resource_id,
                               restart_mode=RestartMode.ALWAYS)
        self.store.add_process(p0)
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING,
                               assigned=resource_id)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.PENDING,
                               assigned=resource_id)
        self.store.add_process(p2)
        p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATING,
                               assigned=resource_id)
        self.store.add_process(p3)

        # this one shouldn't restart
        p4 = ProcessRecord.new(None, "proc4", {}, ProcessState.RUNNING,
                               configuration=nosystemrestart_process_config(),
                               assigned=resource_id,
                               restart_mode=RestartMode.ABNORMAL)
        self.store.add_process(p4)

        # non-running processes should also potentially be restarted on boot
        p5 = ProcessRecord.new(None, "proc5", {}, ProcessState.WAITING)
        self.store.add_process(p5)
        self.store.enqueue_process(*p5.key)
        p6 = ProcessRecord.new(None, "proc6", {}, ProcessState.REQUESTED)
        self.store.add_process(p6)

        # not this one, due to RestartMode
        p7 = ProcessRecord.new(None, "proc7", {}, ProcessState.REQUESTED,
                               configuration=nosystemrestart_process_config(),
                               restart_mode=RestartMode.ALWAYS)
        self.store.add_process(p7)
        self.store.enqueue_process(*p7.key)

        # record on the resource which processes are assigned to it
        resource = self.store.get_resource(resource_id)
        resource.assigned = [p0.key, p1.key, p2.key, p3.key, p4.key]
        self.store.update_resource(resource)

        restartable_procs = ["proc1", "proc2", "proc5", "proc6"]
        dead_procs = ["proc0", "proc4", "proc7"]

        self._run_in_thread()

        # assertTrue instead of a bare assert: the check is not stripped by
        # "python -O" and matches the unittest assertions used below.
        self.assertTrue(self.store.wait_initialized(timeout=10))

        self.assertEqual(len(self.store.get_queued_processes()), 0)
        self.assertEqual(len(self.store.get_node_ids()), 0)
        self.assertEqual(len(self.store.get_resource_ids()), 0)

        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.UNSCHEDULED_PENDING)
        for proc in dead_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.TERMINATED)
        self.assertEqual(self.store.get_process(None, "proc3").state,
                         ProcessState.TERMINATED)

        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.SYSTEM_BOOTING)

        # now end system boot
        self.store.set_system_boot(False)

        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

        # check that pending processes were correctly rescheduled
        self.assertEqual(len(self.store.get_queued_processes()), len(restartable_procs))
        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.REQUESTED)
Example #16
0
    def domain_sensor_engine_test(self):
        # Drive a sensor-based domain through minimum size, scale-up to the
        # maximum, scale-down to the minimum and a cooldown check by
        # reconfiguring its 'sensor_data' and 'cooldown_period', then remove
        # the domain and wait for every node to be terminated.
        #
        # print calls use the parenthesized single-argument form: identical
        # output with the Python 2 print statement, valid syntax on Python 3.
        site = uuid.uuid4().hex
        fake_site, lc = self.make_fake_libcloud_site(site)
        dt_name = self._load_dtrs(site, fake_site)

        minimum_n = 1
        maximum_n = 3
        scale_up_threshold = 2.0
        scale_up_n_vms = 1
        scale_down_threshold = 0.5
        scale_down_n_vms = 1
        scale_down_sensor_data = [0, 0, 0]
        scale_up_sensor_data = [3, 3, 5]
        metric = 'load'
        sample_function = 'Average'
        # the domain starts out with the scale-down sensor data
        dt = _make_sensor_domain_def(metric, sample_function, minimum_n,
                                     maximum_n, scale_up_threshold,
                                     scale_up_n_vms, scale_down_threshold,
                                     scale_down_n_vms,
                                     scale_down_sensor_data, dt_name, site)
        def_id = str(uuid.uuid4())
        self.epum_client.add_domain_definition(def_id, sensor_definition)
        domain_id = str(uuid.uuid4())
        log.debug("Launching domain %s", domain_id)

        self.epum_client.add_domain(domain_id, def_id, dt, caller=self.user)

        # make sure we hit the minimum number of nodes
        wait(lambda: len(get_valid_nodes(lc)) >= minimum_n, timeout=60)

        # Now get it to scale up
        print("reconfiguring with sensor data: %s" % scale_up_sensor_data)
        new_config = {'engine_conf': {'sensor_data': scale_up_sensor_data}}
        self.epum_client.reconfigure_domain(domain_id, new_config, caller=self.user)

        # make sure we hit the maximum number of nodes
        wait(lambda: len(get_valid_nodes(lc)) == maximum_n, timeout=60)

        # Now get it to scale down
        print("reconfiguring with sensor data: %s" % scale_down_sensor_data)
        new_config = {'engine_conf': {'sensor_data': scale_down_sensor_data}}
        self.epum_client.reconfigure_domain(domain_id, new_config, caller=self.user)

        wait(lambda: len(get_valid_nodes(lc)) == minimum_n, timeout=60)

        # Now test the cooldown
        new_config = {
            'engine_conf': {
                'sensor_data': scale_up_sensor_data,
                'cooldown_period': 100,
            }
        }
        self.epum_client.reconfigure_domain(domain_id, new_config, caller=self.user)

        # Wait 10s for a few decides to happen:
        time.sleep(10)

        # And ensure we are still at the minimum size
        nodes = get_valid_nodes(lc)
        self.assertEqual(len(nodes), minimum_n)

        # Now set cooldown to 10s (which have already passed)
        new_config = {'engine_conf': {'cooldown_period': 10}}
        self.epum_client.reconfigure_domain(domain_id, new_config, caller=self.user)

        # And watch it scale up
        wait(lambda: len(get_valid_nodes(lc)) == maximum_n, timeout=60)

        self._wait_remove_domain(domain_id)

        self._wait_for_all_terminated(lc)
Example #17
0
 def wait_for_libcloud_nodes(self, count, timeout=60):
     """Wait until ``self.get_valid_nodes()`` has ``count`` entries.

     ``timeout`` is handed straight to ``wait``. Returns a fresh
     ``self.get_valid_nodes()`` result once the wait is over.
     """
     def reached_count():
         return len(self.get_valid_nodes()) == count

     wait(reached_count, timeout=timeout)
     return self.get_valid_nodes()
Example #18
0
 def wait_for_all_domains(self, timeout=30):
     """Run ``self.verify_all_domain_instances`` through ``wait``.

     ``timeout`` is handed straight to ``wait``.
     """
     verifier = self.verify_all_domain_instances
     wait(verifier, timeout=timeout)