class TestAgentScheduler(unittest.TestCase):
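    """Integration test that configures an agent as a leaf scheduler
    and then demotes it by pushing an empty role set."""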

    def setUp(self):
        self.runtime = RuntimeUtils(self.id())
        self.config = self.runtime.get_default_agent_config()
        res = self.runtime.start_agent(self.config)
        (self.proc, self.client, self.control_client) = res

    def tearDown(self):
        self.client.close()
        self.control_client.close()
        self.runtime.cleanup()

    def test_demote_agent_from_leaf_scheduler(self):

        request = GetSchedulersRequest()
        response = self.control_client.get_schedulers(request)

        # Agent starts without any schedulers
        assert_that(len(response.schedulers), is_(0))

        # Configure the agent with a leaf scheduler
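        # stable_uuid presumably maps its label to a deterministic UUID,
        # so reruns configure the same leaf and parent scheduler ids.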
        leafId1 = stable_uuid("leaf scheduler")
        config_req = Host.GetConfigRequest()
        host_config = self.client.get_host_config(config_req).hostConfig

        leaf_scheduler = SchedulerRole(leafId1)
        leaf_scheduler.parent_id = stable_uuid("parent scheduler")
        leaf_scheduler.hosts = [host_config.agent_id]
        leaf_scheduler.host_children = [ChildInfo(id=host_config.agent_id,
                                                  address="localhost",
                                                  port=8835)]

        config_request = ConfigureRequest(
            leafId1,
            Roles([leaf_scheduler]))

        self.client.configure(config_request)

        request = GetSchedulersRequest()
        response = self.control_client.get_schedulers(request)

        # Verify that the agent has been configured
        assert_that(len(response.schedulers), is_(1))
        assert_that(response.schedulers[0].role.id, is_(leafId1))

        # Demote agent from leaf scheduler
        config_request = ConfigureRequest(
            leafId1,
            Roles([]))
        self.client.configure(config_request)

        # Verify that the agent isn't a scheduler
        request = GetSchedulersRequest()
        response = self.control_client.get_schedulers(request)
        assert_that(len(response.schedulers), is_(0))
class TestAgent(unittest.TestCase, AgentCommonTests):
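    """Integration tests that start an agent process and exercise its
    bootstrap, command-line parsing, and ephemeral-disk VM creation
    paths; shared cases come from the AgentCommonTests mixin."""
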
    def shortDescription(self):
        return None

    def configure_host(self):
        config_req = Host.GetConfigRequest()
        host_config = self.host_client.get_host_config(config_req).hostConfig

        leaf_scheduler = SchedulerRole(stable_uuid("leaf scheduler"))
        leaf_scheduler.parent_id = stable_uuid("parent scheduler")
        leaf_scheduler.hosts = [host_config.agent_id]

        config_request = ConfigureRequest(
            stable_uuid("leaf scheduler"),
            Roles([leaf_scheduler]))

        self.host_client.configure(config_request)

    def test_bootstrap(self):
        # Verify the negative case first as it has no side effect.
        self._update_agent_invalid_config()
        self.runtime.stop_agent(self.proc)
        new_config = self.config.copy()
        # Remove the availability zone since
        # self._update_agent_config() sets it later.
        del new_config["--availability-zone"]
        res = self.runtime.start_agent(new_config)
        self.proc, self.host_client, self.control_client = res
        req = self._update_agent_config()

        # Restart the agent and verify that the config it reports is the
        # same one we requested.
        res = self.runtime.start_agent(new_config)
        self.proc, self.host_client, self.control_client = res
        self._validate_post_boostrap_config(req)

    @property
    def agent_in_uwsim(self):
        return False

    def setUp(self):
        self.runtime = RuntimeUtils(self.id())
        self.config = self.runtime.get_default_agent_config()
        self._datastores = ["datastore1", "datastore2"]
        self.config["--datastores"] = ",".join(self._datastores)
        res = self.runtime.start_agent(self.config)
        self.proc, self.host_client, self.control_client = res
        self.configure_host()
        self.set_host_mode(HostMode.NORMAL)
        self.clear_datastore_tags()

    def tearDown(self):
        self.runtime.cleanup()

    def test_agent_with_invalid_vsi(self):
        self.runtime.stop_agent(self.proc)
        # Since the config file takes precedence over the command line
        # options, we need to remove the config directory created by
        # start_agent in setUp.
        shutil.rmtree(self.config["--config-path"])
        new_config = self.config.copy()
        new_config["--hypervisor"] = "esx"
        res = self.runtime.start_agent(new_config)
        self.proc, self.host_client, self.control_client = res
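        # Setting --hypervisor to esx makes the agent require VSI, which
        # this test environment presumably lacks, so the process is
        # expected to exit on its own; poll for up to ~5 seconds.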
        time_waited = 0
        while self._agent_running() and time_waited < 5:
            time.sleep(0.2)
            time_waited += 0.2
        self.assertFalse(self._agent_running())

    def test_datastore_parse(self):
        """Test that the agent parses datastore args"""
        self.runtime.stop_agent(self.proc)
        new_config = self.config.copy()
        new_config["--datastores"] = " ds1,ds2, ds3, ds4 "
        res = self.runtime.start_agent(new_config)
        self.proc, self.host_client, self.control_client = res

        request = Host.GetConfigRequest()
        response = self.host_client.get_host_config(request)

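        # The test expects each datastore id to be
        # uuid5(NAMESPACE_DNS, <trimmed name>), so the agent must strip
        # the whitespace around the names in --datastores.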
        datastore_ids = [ds.id for ds in response.hostConfig.datastores]
        expected_ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, name))
                        for name in ["ds1", "ds2", "ds3", "ds4"]]
        assert_that(datastore_ids, equal_to(expected_ids))

    def test_management_only_parse(self):
        # management_only defaults to False
        request = Host.GetConfigRequest()
        response = self.host_client.get_host_config(request)

        assert_that(response.result, equal_to(Host.GetConfigResultCode.OK))
        assert_that(response.hostConfig.management_only, equal_to(False))

        # Restart the agent with the --management-only option and verify
        # that the flag is now True
        self.runtime.stop_agent(self.proc)
        new_config = self.config.copy()
        new_config["--management-only"] = None
        res = self.runtime.start_agent(new_config)
        self.proc, self.host_client, self.control_client = res

        request = Host.GetConfigRequest()
        response = self.host_client.get_host_config(request)

        assert_that(response.result, equal_to(Host.GetConfigResultCode.OK))
        assert_that(response.hostConfig.management_only, equal_to(True))

    def test_create_vm_with_ephemeral_disks_ttylinux(self):
        self._test_create_vm_with_ephemeral_disks("ttylinux")

    def test_create_vm_with_ephemeral_disks(self):
        image_dir = os.path.join(
            "/tmp/images",
            FakeHypervisor.datastore_id(self.get_image_datastore()))

        try:
            mkdir_p(image_dir)
            with tempfile.NamedTemporaryFile(dir=image_dir,
                                             suffix=".vmdk") as f:
                # The temp file created is
                # "/tmp/images/<ds>/<uniquepart>.vmdk".
                # This simulates an image being present on the agent.
                # The file is deleted when the context manager exits.
                image_id = f.name[f.name.rfind("/") + 1:-5]
                self._test_create_vm_with_ephemeral_disks(image_id)
        finally:
            rm_rf(image_dir)

    def _agent_running(self):
        if not self.proc:
            return False

        try:
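            # Signal 0 delivers no signal; it only checks whether the pid
            # is still valid: ESRCH means the process is gone, EPERM means
            # it exists but is owned by another user.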
            os.kill(self.proc.pid, 0)
            return True
        except OSError as e:
            if e.errno == errno.ESRCH:
                return False
            elif e.errno == errno.EPERM:
                return True
            else:
                raise e


class TestTreeIntrospection(BaseKazooTestCase):
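    """End-to-end tests for the hierarchy introspection helpers.

    setUp starts a zookeeper client, a root scheduler, and a chairman;
    the tests then register agents and inspect the resulting scheduler
    tree via zookeeper, the chairman, and the schedulers themselves.
    """
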
    def setUp(self):
        self.set_up_kazoo_base()
        self.zk_client = self._get_nonchroot_client()
        self.zk_client.start()

        self.runtime = RuntimeUtils(self.id())

        # Create zk paths
        self.zk_client.create(MISSING_PREFIX)
        self.zk_client.create(HOSTS_PREFIX)
        self.zk_client.create(ROLES_PREFIX)

        self.root_conf = {}
        self.root_conf['healthcheck'] = {}
        self.root_conf['zookeeper'] = {}
        self.root_conf['zookeeper']['quorum'] = ("localhost:%i" %
                                                 (DEFAULT_ZK_PORT, ))
        self.root_conf['healthcheck']['timeout_ms'] = ROOT_SCHEDULER_TIME_OUT
        self.root_conf['healthcheck']['period_ms'] = ROOT_SCHEDULER_PERIOD

        # start root scheduler
        self.root_host = "localhost"
        self.root_port = 15000
        self.root_conf['bind'] = self.root_host
        self.root_conf['port'] = self.root_port
        self.runtime.start_root_scheduler(self.root_conf)

        (self.root_transport,
         self.root_sch_client) = create_root_client(self.root_port,
                                                    self.root_host)

        # start chairman
        self.chairman_host = 'localhost'
        self.chairman_port = 13000
        self.leaf_fanout = 2
        self.runtime.start_chairman(self.chairman_host, self.chairman_port,
                                    self.leaf_fanout)
        (self.chairman_transport, self.chairman_client) = \
            create_chairman_client(self.chairman_host, self.chairman_port)
        # Wait for chairman and root scheduler to finish their elections
        _wait_on_code(self.root_sch_client.get_schedulers,
                      GetSchedulersResultCode.OK)
        _wait_on_code(self.chairman_client.get_schedulers,
                      GetSchedulersResultCode.OK, GetSchedulersRequest)

    def tearDown(self):
        self.runtime.cleanup()
        self.zk_client.stop()
        self.zk_client.close()
        self.tear_down_kazoo_base()

    def test_get_service_leader(self):
        """Test get service leader"""
        # Check the chairman leader
        (address, port) = get_service_leader(self.zk_client, CHAIRMAN_SERVICE)
        assert_that(address, is_(self.chairman_host))
        assert_that(port, is_(self.chairman_port))

        deleted = threading.Event()

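        # kazoo's ChildrenWatch invokes the callback with the current
        # children and again on every change; once the chairman's
        # ephemeral leader node goes away the list is empty.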
        def _deleted(children):
            if not children:
                deleted.set()

        self.zk_client.ChildrenWatch(CHAIRMAN_SERVICE, _deleted)
        # Stop chairman
        stop_service(self.runtime.chairman_procs[0])
        # Wait for the leader to leave
        deleted.wait(30)
        res = get_service_leader(self.zk_client, CHAIRMAN_SERVICE)
        assert_that(res, is_(None))

    def test_get_root_scheduler(self):
        """Test root scheduler introspection"""
        (root_host, root_port) = get_service_leader(self.zk_client,
                                                    ROOT_SCHEDULER_SERVICE)
        # Verify that an empty root scheduler is constructed
        # correctly
        root_sch = get_root_scheduler(root_host, root_port)

        assert_that(root_sch.id, is_(ROOT_SCHEDULER_ID))
        assert_that(root_sch.type, is_(ROOT_SCHEDULER_TYPE))
        assert_that(len(root_sch.children), is_(0))
        assert_that(root_sch.owner, not_none())
        root_owner = root_sch.owner
        assert_that(root_owner.id, is_(ROOT_SCHEDULER_ID))
        assert_that(root_owner.address, is_(root_host))
        assert_that(root_owner.port, is_(root_port))
        assert_that(root_owner.parent, is_(None))

        # Start an agent
        agent_host = 'localhost'
        agent_port = 20000
        config = self.runtime.get_agent_config(agent_host, agent_port,
                                               self.chairman_host,
                                               self.chairman_port)
        res = self.runtime.start_agent(config)
        agent_client = res[1]

        # Wait for the root scheduler to be configured
        _wait_for_configuration(self.root_sch_client, 1)

        new_root_sch = get_root_scheduler(root_host, root_port)
        assert_that(len(new_root_sch.children), is_(1))

        req = THost.GetConfigRequest()
        agent_id = agent_client.get_host_config(req).hostConfig.agent_id

        leaf = new_root_sch.children.values()[0]
        assert_that(leaf.type, is_(LEAF_SCHEDULER_TYPE))
        assert_that(leaf.parent, is_(new_root_sch))
        assert_that(len(leaf.children), is_(0))
        assert_that(leaf.owner.id, is_(agent_id))
        assert_that(leaf.owner.address, is_(agent_host))
        assert_that(leaf.owner.port, is_(agent_port))
        assert_that(leaf.owner.parent, is_(leaf))

        deleted = threading.Event()

        def _deleted(children):
            if not children:
                deleted.set()

        self.zk_client.ChildrenWatch(ROOT_SCHEDULER_SERVICE, _deleted)
        stop_service(self.runtime.root_procs[0])
        # Wait for the leader to leave
        deleted.wait(30)

        empty_root = get_root_scheduler(root_host, root_port)
        assert_that(empty_root, is_(None))

    def test_get_leaf_scheduler(self):
        """Test agent introspection"""

        agent_host = 'localhost'
        agent_port = 20000

        # Agent not online
        leaf = get_leaf_scheduler(agent_host, agent_port)
        assert_that(leaf, is_(None))

        # Start the agent with an invalid chairman address so that it
        # doesn't get configured automatically; we configure it manually
        config = self.runtime.get_agent_config(agent_host, agent_port,
                                               "localhost", 24234)
        res = self.runtime.start_agent(config)
        agent_client = res[1]

        # Agent is online but not a leaf scheduler
        leaf = get_leaf_scheduler(agent_host, agent_port)
        assert_that(leaf, is_(None))

        leafId1 = stable_uuid("leaf scheduler")
        config_req = THost.GetConfigRequest()
        host_config = agent_client.get_host_config(config_req).hostConfig

        leaf_scheduler = SchedulerRole(leafId1)
        leaf_scheduler.parent_id = stable_uuid("parent scheduler")
        leaf_scheduler.hosts = [host_config.agent_id]
        leaf_scheduler.host_children = [
            ChildInfo(id=host_config.agent_id,
                      address=agent_host,
                      port=agent_port)
        ]
        config_request = ConfigureRequest(leafId1, Roles([leaf_scheduler]))

        resp = agent_client.configure(config_request)
        assert_that(resp.result, is_(ConfigureResultCode.OK))

        leaf = get_leaf_scheduler(agent_host, agent_port)

        assert_that(leaf.id, not_none())
        assert_that(leaf.type, is_(LEAF_SCHEDULER_TYPE))
        assert_that(len(leaf.children), is_(1))
        # Verify the owner host
        owner_host = leaf.owner
        assert_that(owner_host, not_none())
        assert_that(owner_host.id, is_(host_config.agent_id))
        assert_that(owner_host.address, is_(agent_host))
        assert_that(owner_host.port, is_(agent_port))
        assert_that(owner_host.parent, is_(leaf))

    def _check_tree(self, root_sch, root_address, root_port, _fanout,
                    agents_list):
        """
        This method checks if a hierarchy is correctly constructed, assuming
        the agents were sequently added to the hierarchy. The check will fail
        on a condition by failing an assertion.
        root_address: a string, root scheduler's address
        root_port: an int, root scheduler's port
        fanout: an integer that specifies the max fanout
        agent_list: a list of tubles (id, address, port), where every tuple
                    represents an agent
        """

        # This method will split a list into multiple lists, where the
        # inner lists represent leaf schedulers i.e.
        # [[leaf1_owner, leaf1_child2 ... ],[leaf2_owner, leaf2_child2 ... ]]
        # leafX_owner is a tuple of (id, address, port)
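        # e.g. with fanout=2, [a1, a2, a3] -> [[a1, a2], [a3]]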
        def split_list_by_fanout(_list, fanout):
            for i in xrange(0, len(_list), fanout):
                yield _list[i:i + fanout]

        leaves = list(split_list_by_fanout(agents_list, _fanout))

        # check root
        assert_that(root_sch.id, is_(ROOT_SCHEDULER_ID))
        assert_that(root_sch.type, is_(ROOT_SCHEDULER_TYPE))
        assert_that(root_sch.owner, not_none())
        assert_that(root_sch.owner.address, is_(root_address))
        assert_that(root_sch.owner.port, is_(root_port))
        assert_that(len(root_sch.children), is_(len(leaves)))

        # Map scheduler hosts; the map will look like this:
        # {leaf_owner_host_id: [(leaf_owner_host_id, address, port), ...]}
        sch_hosts = {}
        for leaf in leaves:
            sch_hosts[leaf[0][0]] = leaf

        for child in root_sch.children.values():
            leaf_owner_id = child.owner.id
            assert_that(leaf_owner_id, is_(sch_hosts[leaf_owner_id][0][0]))
            assert_that(child.parent.owner.id, is_(ROOT_SCHEDULER_ID))
            assert_that(child.owner.address,
                        is_(sch_hosts[leaf_owner_id][0][1]))
            assert_that(child.owner.port, is_(sch_hosts[leaf_owner_id][0][2]))
            assert_that(child.owner.parent, is_(child))
            assert_that(child.type, is_(LEAF_SCHEDULER_TYPE))

            # Verify the leaf's child hosts
            children = sch_hosts[leaf_owner_id]

            # map child hosts
            children_map = {}
            for c in children:
                children_map[c[0]] = c

            for child_host in child.children.values():
                assert_that(children_map.get(child_host.id, None), not_none())
                assert_that(children_map[child_host.id][0], is_(child_host.id))
                assert_that(children_map[child_host.id][1],
                            is_(child_host.address))
                assert_that(children_map[child_host.id][2],
                            is_(child_host.port))
                assert_that(child_host.parent, is_(child))

    def wait_for_registration(self, agent_id, timeout=10):
        """Waits for _id to be created in /hosts"""
        completed = threading.Event()

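        # kazoo's DataWatch fires immediately with the node's current
        # state (stat is None while it does not exist) and again on every
        # change, so an already-registered agent is handled as well.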
        def wait_created(data, stat, event):
            """Set the event once the node exists."""
            if stat:
                completed.set()

        self.zk_client.DataWatch(ROLES_PREFIX + "/" + agent_id, wait_created)
        completed.wait(timeout)
        assert_that(completed.isSet(), is_(True))

    def _start_agents(self, agent_host, agent_ports):
        """Start agents on different ports.

        When agents register sequentially, the order of onHostAdded
        events is not guaranteed. For example, if agents A, B, then C
        register, they won't necessarily be inserted into the hierarchy
        in that order; a possible order is B, A, then C. Thus, we wait
        on hosts that we expect to own a leaf scheduler before
        registering more agents that will go under the same leaf.
        """
        agent_ids = []
        for ind in xrange(len(agent_ports)):
            config = self.runtime.get_agent_config(agent_host,
                                                   agent_ports[ind],
                                                   self.chairman_host,
                                                   self.chairman_port)
            res = self.runtime.start_agent(config)
            agent_client = res[1]
            config_req = THost.GetConfigRequest()
            config = agent_client.get_host_config(config_req).hostConfig
            agent_id = config.agent_id
            if ind % self.leaf_fanout == 0:
                self.wait_for_registration(agent_id)
            agent_ids.append(agent_id)
        return agent_ids

    def test_get_hierarchy_from_zk(self):
        agent_host = 'localhost'
        agent_port1 = 20000
        agent_port2 = 20001
        agent_port3 = 20002

        agent_ids = self._start_agents(agent_host,
                                       [agent_port1, agent_port2, agent_port3])

        # The chairman persists the schedulers and then pushes the
        # configurations, so once we detect that the root scheduler has
        # been configured we know that the leaf schedulers have already
        # been persisted to zk
        _wait_for_configuration(self.root_sch_client, 2)

        root = get_hierarchy_from_zk(self.zk_client)

        agent_list = [(agent_ids[0], agent_host, agent_port1),
                      (agent_ids[1], agent_host, agent_port2),
                      (agent_ids[2], agent_host, agent_port3)]
        # verify the hierarchy structure
        self._check_tree(root, self.root_host, self.root_port,
                         self.leaf_fanout, agent_list)

    def test_get_hierarchy_from_chairman(self):
        agent_host = 'localhost'
        agent_port1 = 20000
        agent_port2 = 20001
        agent_port3 = 20002

        agent_ids = self._start_agents(agent_host,
                                       [agent_port1, agent_port2, agent_port3])
        _wait_for_configuration(self.root_sch_client, 2)

        root = get_hierarchy_from_chairman(self.chairman_host,
                                           self.chairman_port, self.root_host,
                                           self.root_port)
        agent_list = [(agent_ids[0], agent_host, agent_port1),
                      (agent_ids[1], agent_host, agent_port2),
                      (agent_ids[2], agent_host, agent_port3)]
        # verify the hierarchy structure
        self._check_tree(root, self.root_host, self.root_port,
                         self.leaf_fanout, agent_list)

    def test_update_status(self):
        agent_host = 'localhost'
        agent_port1 = 20000
        config = self.runtime.get_agent_config(agent_host, agent_port1,
                                               self.chairman_host,
                                               self.chairman_port)
        res = self.runtime.start_agent(config)
        _wait_for_configuration(self.root_sch_client, 1)

        root = get_hierarchy_from_zk(self.zk_client)
        # Update the hierarchy status
        root.update_status()

        # verify that the root scheduler and leaf are online
        assert_that(root.owner.status, is_(STATUS_ONLINE))
        assert_that(len(root.children), is_(1))
        assert_that(root.children.values()[0].owner.status, is_(STATUS_ONLINE))
        # Kill both root scheduler and leaf host
        stop_service(self.runtime.root_procs[0])
        self.runtime.stop_agent(res[0])
        # Update the hierarchy status
        root.update_status()
        assert_that(root.owner.status, is_(STATUS_OFFLINE))
        assert_that(root.children.values()[0].owner.status,
                    is_(STATUS_OFFLINE))

        # Start the root scheduler and leaf scheduler
        self.runtime.start_root_scheduler(self.root_conf)
        config = self.runtime.get_agent_config(agent_host, agent_port1,
                                               self.chairman_host,
                                               self.chairman_port)
        res = self.runtime.start_agent(config)
        (self.root_transport,
         self.root_sch_client) = create_root_client(self.root_port,
                                                    self.root_host)
        # Wait for the root scheduler's leader election
        _wait_on_code(self.root_sch_client.get_schedulers,
                      GetSchedulersResultCode.OK)

        # Check the status again
        root.update_status()
        # verify that the root scheduler and leaf are online
        assert_that(root.owner.status, is_(STATUS_ONLINE))
        assert_that(root.children.values()[0].owner.status, is_(STATUS_ONLINE))

    def test_get_hosts_from_zk(self):
        hosts = get_hosts_from_zk(self.zk_client)
        assert_that(len(hosts), is_(0))

        networks = [Network("nw1", [NetworkType.VM])]
        dsid = str(uuid.uuid4())
        datastores = [Datastore(dsid, "ds1", DatastoreType.SHARED_VMFS)]

        # Build registration requests for two hosts
        agent_host = "localhost"
        agent1_port = 12345
        req1 = get_register_host_request(agent_host,
                                         agent1_port,
                                         agent_id="host1",
                                         networks=networks,
                                         datastores=datastores,
                                         image_datastore=dsid,
                                         availability_zone="az1")
        agent2_port = 12346
        req2 = get_register_host_request(agent_host,
                                         agent2_port,
                                         agent_id="host2",
                                         networks=networks,
                                         datastores=datastores,
                                         image_datastore=dsid,
                                         availability_zone="az1")
        # Register two hosts
        resp = self.chairman_client.register_host(req1)
        assert_that(resp.result, is_(RegisterHostResultCode.OK))
        resp = self.chairman_client.register_host(req2)
        assert_that(resp.result, is_(RegisterHostResultCode.OK))

        hosts = get_hosts_from_zk(self.zk_client)
        # map list to dict indexed by host id
        hosts = dict((h.id, h) for h in hosts)
        assert_that(len(hosts), is_(2))
        _h1 = hosts[req1.config.agent_id]
        _h2 = hosts[req2.config.agent_id]
        # Verify that the requests match the hosts that were
        # constructed by get_hosts_from_zk
        assert_that(req1.config.agent_id, is_(_h1.id))
        assert_that(req2.config.agent_id, is_(_h2.id))
        assert_that(req1.config.address.host, is_(_h1.address))
        assert_that(req2.config.address.port, is_(_h2.port))

    def test_get_missing_hosts_from_zk(self):
        missing = get_missing_hosts_from_zk(self.zk_client)
        assert_that(len(missing), is_(0))
        missing_hosts = ["h2", "h3"]
        req = ReportMissingRequest("host1", None, missing_hosts)
        resp = self.chairman_client.report_missing(req)
        assert_that(resp.result, is_(ReportMissingResultCode.OK))

        missing = get_missing_hosts_from_zk(self.zk_client)
        assert_that(missing[0] in missing_hosts, is_(True))
        assert_that(missing[1] in missing_hosts, is_(True))