def init(self):
    """Build the simulated topology: root scheduler, hosts, agent server.

    Populates self.hosts with one REGISTERED root-scheduler entry plus
    self.host_num randomly configured HostStateMachine instances, then
    starts a single ThriftServer acting as the agent endpoint.
    """
    # The root-scheduler entry only needs an id and a REGISTERED start state;
    # every other HostStateMachine argument is unused for it.
    root = HostStateMachine(self.ROOT_HOST_ID, None, None, None, None,
                            None, None, None, None)
    root.set_start_state(State(HostStateMachine.REGISTERED_STATE))
    self.hosts[self.ROOT_HOST_ID] = root

    # Create the simulated hosts, each bound to a random chairman client
    # and a random number of datastores/networks.
    for _ in xrange(self.host_num):
        new_id = str(uuid.uuid4())
        chairman_client = random.choice(self.clients)
        datastore_num = self._random_int(self.datastores)
        networks_num = self._random_int(self.networks)
        machine = HostStateMachine(new_id, self.availability_zones,
                                   self.datastores, self.networks,
                                   datastore_num, networks_num,
                                   chairman_client, self.agent_host,
                                   self.agent_port)
        machine.init_host_properties()
        machine.build()
        self.hosts[new_id] = machine

    # One shared thrift server plays both the host and root-scheduler roles.
    self.server = ThriftServer(self.agent_host, self.agent_port,
                               self.handler, None, self.handler)
    self.servers.append(self.server)
    self.server.start_server()
def _test_thrift_server(self, expect_ping=True):
    """
    Test that ThriftServer functionalities.
    ThriftServer by default registers root scheduler and host's processors.
    ThriftServer uses handlers that implement configure interface.
    Passing in Observer to collect data received by ThriftServer
    """
    host_handler = FakeHandler()
    root_handler = FakeHandler()
    server = ThriftServer(
        self._address, self._port, host_handler, None, root_handler,
        expect_ping)
    server.start_server()

    host_client = self._create_client("Host")
    root_client = self._create_client("RootScheduler")
    agent_client = self._create_client("AgentControl", 2000)

    # send configure to host thrift server and expect result code: OK
    host_resp = self._configure_host(host_client, "leaf scheduler", "host-1")
    assert_that(host_resp.result, is_(0))

    # send configure to root scheduler thrift server and
    # expect result code: OK
    root_resp = self._configure_host(root_client, "root scheduler", "ROOT")
    assert_that(root_resp.result, is_(0))

    # test ping to ThriftServer; when pings are disabled the transport
    # should fail instead.
    def ping():
        agent_client.ping(PingRequest())

    if expect_ping:
        ping()
    else:
        self.assertRaises(TTransportException, ping)

    server.stop_server()

    # Each handler should have observed exactly the id it was configured with.
    assert_that("host-1" == host_handler.get_result().pop(), is_(True))
    assert_that("ROOT" == root_handler.get_result().pop(), is_(True))
class Simulator(object):
    """Drives a population of simulated agent hosts against chairman.

    Each simulated host is a HostStateMachine that talks to a randomly
    chosen chairman client; a single ThriftServer plays the agent endpoint
    for all of them.
    """

    ROOT_HOST_ID = "ROOT_SCHEDULER_HOST"

    def __init__(self, chairman_list, chairman_clients_num, host_num,
                 agent_host, agent_port, datastores, availability_zones,
                 networks, sleep_min=0, sleep_max=500):
        self.clients = []
        self.hosts = {}
        self.agent_host = agent_host
        self.agent_port = agent_port
        self.host_num = host_num
        self.handler = AgentHandler(self.hosts)
        self.datastores = datastores
        self.availability_zones = availability_zones
        self.networks = networks
        self.sleep_min = sleep_min
        self.sleep_max = sleep_max
        self.servers = []
        self.log = []
        # Create the requested number of chairman clients, each pointed at
        # a randomly picked "host:port" entry from chairman_list.
        for _ in xrange(chairman_clients_num):
            endpoint = random.choice(chairman_list).split(":")
            pair = create_chairman_client(endpoint[0], int(endpoint[1]))
            self.clients.append(pair[1])

    def _random_int(self, lst):
        # Random value in [1, len(lst)].
        return random.randrange(0, len(lst)) + 1

    def init(self):
        """Create the root scheduler, the simulated hosts and the server."""
        # Root scheduler entry: only the id and REGISTERED start state matter.
        root = HostStateMachine(self.ROOT_HOST_ID, None, None, None, None,
                                None, None, None, None)
        root.set_start_state(State(HostStateMachine.REGISTERED_STATE))
        self.hosts[self.ROOT_HOST_ID] = root
        # Create hosts with random datastore/network counts.
        for _ in xrange(self.host_num):
            new_id = str(uuid.uuid4())
            chairman_client = random.choice(self.clients)
            datastore_num = self._random_int(self.datastores)
            networks_num = self._random_int(self.networks)
            machine = HostStateMachine(new_id, self.availability_zones,
                                       self.datastores, self.networks,
                                       datastore_num, networks_num,
                                       chairman_client, self.agent_host,
                                       self.agent_port)
            machine.init_host_properties()
            machine.build()
            self.hosts[new_id] = machine
        self.server = ThriftServer(self.agent_host, self.agent_port,
                                   self.handler, None, self.handler)
        self.servers.append(self.server)
        self.server.start_server()

    def sync_machines(self):
        """
        Since the state machine will transition a random number of times,
        the last transition might not process a host config, even if it
        exists. This method, will sync all state machines that are in a
        registered or configured state and have a new config, that hasn't
        been processed.
        """
        for machine in self.hosts.values():
            if machine.id == self.ROOT_HOST_ID:
                continue
            machine.sync()

    def run_batch(self, size):
        """Perform `size` random transitions with a random inter-step sleep.

        The root-scheduler pseudo host never transitions; every chosen
        machine is recorded in self.log before transitioning.
        """
        for _ in xrange(size):
            candidate = random.choice(self.hosts.values())
            while candidate.id == self.ROOT_HOST_ID:
                candidate = random.choice(self.hosts.values())
            self.log.append(candidate)
            candidate.transition()
            # sleep_min/sleep_max are in milliseconds.
            sleep_ms = (random.randint(self.sleep_min, self.sleep_max) /
                        1000.0)
            time.sleep(sleep_ms)

    def clean_up(self):
        # Best-effort shutdown of every server we started.
        for server in self.servers:
            try:
                server.stop_server()
            except Exception:
                pass
class TestChairman(BaseKazooTestCase):
    """
    Test starts a zookeeper server and a chairman process.
    """

    def get_register_host_request(self, port=8080):
        """
        Generates a random register host request which has same datastore
        and same availability zone
        """
        host_id = str(uuid.uuid4())
        # One image datastore id is created lazily and shared by every
        # request so all hosts land in the same leaf scheduler.
        if not hasattr(self, "image_datastore"):
            self.image_datastore = str(uuid.uuid4())
        datastores = [Datastore(self.image_datastore)]
        networks = [Network("nw1", [NetworkType.VM])]
        host_config = HostConfig(agent_id=host_id, datastores=datastores,
                                 address=ServerAddress("127.0.0.1",
                                                       port=port),
                                 networks=networks)
        host_config.availability_zone = "foo"
        # Fix: set(self.image_datastore) iterated the uuid string and built
        # a set of single characters; the set must contain the whole id.
        host_config.image_datastore_ids = set([self.image_datastore])
        return RegisterHostRequest(host_id, host_config)

    def report_missing(self):
        """
        Generates a random missing request with two hosts and two
        schedulers.
        """
        scheduler_id = str(uuid.uuid4())
        schedulers = [str(uuid.uuid4()), str(uuid.uuid4())]
        hosts = [str(uuid.uuid4()), str(uuid.uuid4())]
        return ReportMissingRequest(scheduler_id, schedulers, hosts)

    def report_missing_hosts(self):
        # Missing request that names hosts only (no schedulers).
        scheduler_id = str(uuid.uuid4())
        hosts = [str(uuid.uuid4()), str(uuid.uuid4())]
        return ReportMissingRequest(scheduler_id, None, hosts)

    def disconnect(self):
        """
        Disconnect from the chairman instance and close the transport
        """
        for transport in self.transports:
            transport.close()

    def setUp(self):
        self.set_up_kazoo_base()
        self.procs = []
        self.thrift_server = None
        self.transports = []
        self.runtime = RuntimeUtils(self.id())
        self.runtime.start_cloud_store()
        host, port = '127.0.0.1', 13000
        self.runtime.start_chairman(host, port)
        (transport, self.chairman_client) = create_chairman_client(host,
                                                                   port)
        self.transports.append(transport)
        # Wait for chairman to finish their elections
        _wait_on_code(self.chairman_client.get_schedulers,
                      GetSchedulersResultCode.OK,
                      GetSchedulersRequest)

    def tearDown(self):
        if self.thrift_server:
            self.thrift_server.stop_server()
        self.disconnect()
        for proc in self.procs:
            stop_service(proc)
        self.runtime.cleanup()
        self.tear_down_kazoo_base()

    def test_get_status(self):
        """Leader and followers all report READY; leader node is removable."""
        chairman_client_leader = self.chairman_client
        # starts 2 more chairman instances
        host = '127.0.0.1'
        port_1 = 13001
        port_2 = 13002
        self.runtime.start_chairman(host, port_1)
        self.runtime.start_chairman(host, port_2)
        (transport_1, chairman_client_non_leader_1) = \
            create_chairman_client(host, port_1)
        (transport_2, chairman_client_non_leader_2) = \
            create_chairman_client(host, port_2)
        self.transports.append(transport_1)
        self.transports.append(transport_2)

        h_id_config = {}
        # Register two hosts with the chairman leader.
        reg_host_req_1 = self.get_register_host_request()
        server_address = reg_host_req_1.config.address.host
        server_port = reg_host_req_1.config.address.port
        h_id_config[reg_host_req_1.id] = reg_host_req_1.config
        host_handler = AgentHandler(2)
        self.thrift_server = ThriftServer(server_address, server_port,
                                          host_handler)
        self.thrift_server.start_server()
        rc = chairman_client_leader.register_host(reg_host_req_1)
        self.assertEqual(rc.result, RegisterHostResultCode.OK)
        reg_host_req_2 = self.get_register_host_request()
        h_id_config[reg_host_req_2.id] = reg_host_req_2.config
        rc = self.chairman_client.register_host(reg_host_req_2)
        self.assertEqual(rc.result, RegisterHostResultCode.OK)

        # verify that all the agents received configurations
        host_handler.received_all.wait(20)
        assert_that(len(host_handler.configs), is_(len(h_id_config)))

        # verify chairman leader is in READY status
        get_status_request = GetStatusRequest()
        rc = chairman_client_leader.get_status(get_status_request)
        self.assertEqual(rc.type, StatusType.READY)
        # verify the second chairman which is not leader is in READY status
        # while it notices that there is a leader exist
        rc = chairman_client_non_leader_1.get_status(get_status_request)
        self.assertEqual(rc.type, StatusType.READY)
        # verify the third chairman which is not leader is in READY status
        # while it notices that there is a leader exist
        rc = chairman_client_non_leader_2.get_status(get_status_request)
        self.assertEqual(rc.type, StatusType.READY)

        client = self._get_nonchroot_client()
        client.start()
        read_chairman_leader = client.get_children(CHAIRMAN_SERVICE)[0]
        # normalize from unicode to str
        read_chairman_leader = \
            normalize('NFKD', read_chairman_leader).encode('ascii', 'ignore')
        # expecting chairman leader being deleted
        leader_deleted_event = async_wait_for(EventType.DELETED,
                                              CHAIRMAN_SERVICE,
                                              read_chairman_leader, client)
        # deleting chairman leader from service
        client.delete(CHAIRMAN_SERVICE + "/" + read_chairman_leader)
        leader_deleted_event.wait(20)
        self.assertTrue(leader_deleted_event.isSet())

        def wait_for_status(chairman_client, status):
            # Poll get_status up to 10 times, one second apart.
            # NOTE(review): helper is currently unused and breaks when the
            # reported type DIFFERS from `status` — confirm intended
            # semantics before relying on it.
            rc = None  # fix: avoid UnboundLocalError if every call raises
            retries = 0
            while (retries < 10):
                try:
                    rc = chairman_client.get_status(get_status_request)
                    if rc.type != status:
                        break
                except Exception:  # fix: was bare except
                    logger.exception("get_status() failed")
                retries += 1
                time.sleep(1)
            return rc

    def test_register_host(self):
        """
        Register against the chairman and verify it is persisted in zk
        """
        host_prefix = "/hosts"  # Const in java impl.

        # Register two hosts with the chairman.
        host = []
        h = self.get_register_host_request()
        host.append(h.config)
        retries = 0
        while (retries < 10):
            rc = self.chairman_client.register_host(h)
            if (rc.result == RegisterHostResultCode.NOT_IN_MAJORITY):
                # Possible because the chairman is yet to connect to zk
                retries += 1
                time.sleep(1)
                continue
            break
        self.assertEqual(rc.result, RegisterHostResultCode.OK)
        h = self.get_register_host_request()
        host.append(h.config)
        rc = self.chairman_client.register_host(h)
        self.assertEqual(rc.result, RegisterHostResultCode.OK)

        # Validate the state persisted in zk.
        client = self._get_nonchroot_client()
        client.start()
        self.assertTrue(client.exists(host_prefix))
        read_hosts = client.get_children(host_prefix)
        for h in read_hosts:
            path = host_prefix + "/" + h
            (value, stat) = client.get(path)
            host_config = HostConfig()
            deserialize(host_config, value)
            self.assertTrue(host_config in host)
        client.stop()

    def test_report_missing(self):
        """
        Report a set of hosts are missing to the chairman and verify that
        it is persisted in zk.
        """
        missing_prefix = "/missing"

        # Report two hosts are missing.
        missing_req = self.report_missing()
        retries = 0
        while (retries < 10):
            rc = self.chairman_client.report_missing(missing_req)
            if (rc.result == ReportMissingResultCode.NOT_IN_MAJORITY):
                # Possible because the chairman is yet to connect to zk
                retries += 1
                time.sleep(1)
                continue
            break
        self.assertEqual(rc.result, ReportMissingResultCode.OK)
        nodes = missing_req.hosts
        nodes += missing_req.schedulers

        missing_req = self.report_missing_hosts()
        rc = self.chairman_client.report_missing(missing_req)
        self.assertEqual(rc.result, ReportMissingResultCode.OK)
        nodes += missing_req.hosts

        # Validate the state persisted in zk.
        client = self._get_nonchroot_client()
        client.start()
        self.assertTrue(client.exists(missing_prefix))
        missing_hosts = client.get_children(missing_prefix)
        self.assertEqual(len(missing_hosts), len(nodes))
        for host in missing_hosts:
            self.assertTrue(host in nodes)
        client.stop()

    def test_unregister_host(self):
        """unregister host after host being registered or missing
        """
        h_id_config = {}
        # Register two hosts with the chairman.
        reg_host_req_1 = self.get_register_host_request()
        server_address = reg_host_req_1.config.address.host
        server_port = reg_host_req_1.config.address.port
        h_id_config[reg_host_req_1.id] = reg_host_req_1.config
        host_handler = AgentHandler(2)
        self.thrift_server = ThriftServer(server_address, server_port,
                                          host_handler)
        self.thrift_server.start_server()
        rc = self.chairman_client.register_host(reg_host_req_1)
        self.assertEqual(rc.result, RegisterHostResultCode.OK)
        reg_host_req_2 = self.get_register_host_request()
        h_id_config[reg_host_req_2.id] = reg_host_req_2.config
        rc = self.chairman_client.register_host(reg_host_req_2)
        self.assertEqual(rc.result, RegisterHostResultCode.OK)

        # verify that all the agents received configurations
        host_handler.received_all.wait(30)
        assert_that(len(host_handler.configs), is_(len(h_id_config)))

        client = self._get_nonchroot_client()
        client.start()

        # Verify that the host has registered with chairman
        host_registered = wait_for(EventType.CREATED, HOSTS_PREFIX,
                                   h_id_config.keys()[0], client)
        self.assertTrue(host_registered)
        host_registered = wait_for(EventType.CREATED, HOSTS_PREFIX,
                                   h_id_config.keys()[1], client)
        self.assertTrue(host_registered)

        # validate /hosts
        read_hosts = client.get_children(HOSTS_PREFIX)
        self.assertEqual(len(read_hosts), len(h_id_config))
        for h in read_hosts:
            path = HOSTS_PREFIX + "/" + h
            host_config = extract_node_data(client, path, HostConfig)
            self.assertTrue(host_config in h_id_config.values())

        # validate only one leaf scheduler for same
        # availability_zone/datastore agents
        roles_registered = wait_for(EventType.CREATED, ROLES_PREFIX, "",
                                    client, "get_children")
        roles_hosts = client.get_children(ROLES_PREFIX)
        self.assertTrue(roles_registered)
        self.assertEqual(len(roles_hosts), 1)

        # preserve the host id that owns the leaf scheduler
        leaf_scheduler_host_id = None

        # validate leaf scheduler has 2 hosts
        r_id = roles_hosts[0]
        leaf_scheduler_host_id = r_id
        path = ROLES_PREFIX + "/" + r_id
        roles = extract_node_data(client, path, Roles)
        scheduler_role = roles.schedulers[0]
        self.assertEqual(len(scheduler_role.hosts), 2)
        self.assertTrue(h_id_config.keys()[0] in scheduler_role.hosts)
        self.assertTrue(h_id_config.keys()[1] in scheduler_role.hosts)

        # normalize from unicode to str
        leaf_scheduler_host_id = \
            normalize('NFKD', leaf_scheduler_host_id).encode('ascii',
                                                             'ignore')
        # validate report missing
        del h_id_config[leaf_scheduler_host_id]
        missing_host_id = h_id_config.keys()[0]
        missing_host_list = h_id_config.keys()
        scheduler_id = host_handler.configs[missing_host_id].scheduler
        missing_req = ReportMissingRequest(scheduler_id, None,
                                           missing_host_list)
        m_hosts = missing_req.hosts
        rc = self.chairman_client.report_missing(missing_req)
        self.assertEqual(rc.result, ReportMissingResultCode.OK)

        # validate /missing
        host_missing = wait_for(EventType.CREATED, MISSING_PREFIX,
                                missing_host_id, client)
        self.assertTrue(host_missing)
        missing_hosts = client.get_children(MISSING_PREFIX)
        self.assertEqual(len(missing_hosts), len(m_hosts))
        for host in missing_hosts:
            self.assertTrue(host in m_hosts)

        # expecting role changed
        role_changed_event = async_wait_for(EventType.CHANGED, ROLES_PREFIX,
                                            leaf_scheduler_host_id, client)
        # unregister missing host
        unreg_req = UnregisterHostRequest(missing_host_id)
        rc = self.chairman_client.unregister_host(unreg_req)
        self.assertEqual(rc.result, UnregisterHostResultCode.OK)
        role_changed_event.wait(20)
        self.assertTrue(role_changed_event.isSet())

        # Validate /missing after unregister host
        missing_hosts = client.get_children(MISSING_PREFIX)
        self.assertEqual(len(missing_hosts), 0)

        # validate leaf scheduler's host number after unregistered a host
        roles_hosts = client.get_children(ROLES_PREFIX)
        r_id = roles_hosts[0]
        path = ROLES_PREFIX + "/" + r_id
        roles = extract_node_data(client, path, Roles)
        scheduler_role = roles.schedulers[0]
        self.assertEqual(len(scheduler_role.hosts), 1)
        self.assertTrue(r_id in scheduler_role.hosts)
        self.assertTrue(missing_host_id not in scheduler_role.hosts)

        # expecting role being deleted
        role_changed_event = async_wait_for(EventType.DELETED, ROLES_PREFIX,
                                            leaf_scheduler_host_id, client)
        # unregister host that owns leaf scheduler
        unreg_req = UnregisterHostRequest(leaf_scheduler_host_id)
        rc = self.chairman_client.unregister_host(unreg_req)
        self.assertEqual(rc.result, UnregisterHostResultCode.OK)
        role_changed_event.wait(20)
        self.assertTrue(role_changed_event.isSet())

        # Validate the state persisted in zk.
        read_hosts = client.get_children(HOSTS_PREFIX)
        self.assertEqual(len(read_hosts), 0)
        client.stop()
def test_unregister_host(self):
    """Unregister hosts after they are registered (and reported missing).

    Registers two hosts, reports the non-leaf-owner missing, unregisters
    both, and checks /hosts, /missing and /roles in zookeeper after each
    step.
    """
    configs_by_id = {}
    # Register two hosts with the chairman.
    req_a = self.get_register_host_request()
    agent_address = req_a.config.address.host
    agent_port = req_a.config.address.port
    configs_by_id[req_a.id] = req_a.config
    agent_handler = AgentHandler(2)
    self.thrift_server = ThriftServer(agent_address, agent_port,
                                      agent_handler)
    self.thrift_server.start_server()
    resp = self.chairman_client.register_host(req_a)
    self.assertEqual(resp.result, RegisterHostResultCode.OK)
    req_b = self.get_register_host_request()
    configs_by_id[req_b.id] = req_b.config
    resp = self.chairman_client.register_host(req_b)
    self.assertEqual(resp.result, RegisterHostResultCode.OK)

    # Both agents must have received their configurations.
    agent_handler.received_all.wait(30)
    assert_that(len(agent_handler.configs), is_(len(configs_by_id)))

    zk = self._get_nonchroot_client()
    zk.start()

    # Each registered host id must show up under /hosts.
    for host_id in configs_by_id.keys():
        self.assertTrue(wait_for(EventType.CREATED, HOSTS_PREFIX,
                                 host_id, zk))

    # Validate /hosts contents against the configs we sent.
    registered = zk.get_children(HOSTS_PREFIX)
    self.assertEqual(len(registered), len(configs_by_id))
    for child in registered:
        node = extract_node_data(zk, HOSTS_PREFIX + "/" + child, HostConfig)
        self.assertTrue(node in configs_by_id.values())

    # Hosts sharing availability zone/datastore get one leaf scheduler.
    roles_created = wait_for(EventType.CREATED, ROLES_PREFIX, "", zk,
                             "get_children")
    role_owners = zk.get_children(ROLES_PREFIX)
    self.assertTrue(roles_created)
    self.assertEqual(len(role_owners), 1)

    # The single role node is owned by the leaf-scheduler host; it must
    # contain both registered hosts.
    leaf_scheduler_host_id = role_owners[0]
    roles = extract_node_data(zk, ROLES_PREFIX + "/" + leaf_scheduler_host_id,
                              Roles)
    leaf_role = roles.schedulers[0]
    self.assertEqual(len(leaf_role.hosts), 2)
    for host_id in configs_by_id.keys():
        self.assertTrue(host_id in leaf_role.hosts)

    # normalize from unicode to str
    leaf_scheduler_host_id = \
        normalize('NFKD', leaf_scheduler_host_id).encode('ascii', 'ignore')

    # Report the remaining (non-owner) host as missing.
    del configs_by_id[leaf_scheduler_host_id]
    missing_host_id = configs_by_id.keys()[0]
    missing_req = ReportMissingRequest(
        agent_handler.configs[missing_host_id].scheduler, None,
        configs_by_id.keys())
    expected_missing = missing_req.hosts
    resp = self.chairman_client.report_missing(missing_req)
    self.assertEqual(resp.result, ReportMissingResultCode.OK)

    # Validate /missing.
    self.assertTrue(wait_for(EventType.CREATED, MISSING_PREFIX,
                             missing_host_id, zk))
    missing_children = zk.get_children(MISSING_PREFIX)
    self.assertEqual(len(missing_children), len(expected_missing))
    for child in missing_children:
        self.assertTrue(child in expected_missing)

    # Unregistering the missing host should shrink the leaf scheduler role.
    role_changed = async_wait_for(EventType.CHANGED, ROLES_PREFIX,
                                  leaf_scheduler_host_id, zk)
    resp = self.chairman_client.unregister_host(
        UnregisterHostRequest(missing_host_id))
    self.assertEqual(resp.result, UnregisterHostResultCode.OK)
    role_changed.wait(20)
    self.assertTrue(role_changed.isSet())

    # /missing must be empty after the unregister.
    self.assertEqual(len(zk.get_children(MISSING_PREFIX)), 0)

    # Leaf scheduler should now hold only its owner.
    role_owners = zk.get_children(ROLES_PREFIX)
    owner = role_owners[0]
    roles = extract_node_data(zk, ROLES_PREFIX + "/" + owner, Roles)
    leaf_role = roles.schedulers[0]
    self.assertEqual(len(leaf_role.hosts), 1)
    self.assertTrue(owner in leaf_role.hosts)
    self.assertTrue(missing_host_id not in leaf_role.hosts)

    # Unregistering the owner deletes the role node entirely.
    role_deleted = async_wait_for(EventType.DELETED, ROLES_PREFIX,
                                  leaf_scheduler_host_id, zk)
    resp = self.chairman_client.unregister_host(
        UnregisterHostRequest(leaf_scheduler_host_id))
    self.assertEqual(resp.result, UnregisterHostResultCode.OK)
    role_deleted.wait(20)
    self.assertTrue(role_deleted.isSet())

    # No hosts remain registered in zookeeper.
    self.assertEqual(len(zk.get_children(HOSTS_PREFIX)), 0)
    zk.stop()
def test_get_status(self):
    """Every chairman instance reports READY once a leader exists."""
    leader = self.chairman_client
    # Bring up two additional chairman instances.
    host = '127.0.0.1'
    port_1 = 13001
    port_2 = 13002
    self.runtime.start_chairman(host, port_1)
    self.runtime.start_chairman(host, port_2)
    (transport_1, follower_1) = create_chairman_client(host, port_1)
    (transport_2, follower_2) = create_chairman_client(host, port_2)
    self.transports.append(transport_1)
    self.transports.append(transport_2)

    configs_by_id = {}
    # Register two hosts with the chairman leader.
    req_a = self.get_register_host_request()
    agent_address = req_a.config.address.host
    agent_port = req_a.config.address.port
    configs_by_id[req_a.id] = req_a.config
    agent_handler = AgentHandler(2)
    self.thrift_server = ThriftServer(agent_address, agent_port,
                                      agent_handler)
    self.thrift_server.start_server()
    resp = leader.register_host(req_a)
    self.assertEqual(resp.result, RegisterHostResultCode.OK)
    req_b = self.get_register_host_request()
    configs_by_id[req_b.id] = req_b.config
    resp = self.chairman_client.register_host(req_b)
    self.assertEqual(resp.result, RegisterHostResultCode.OK)

    # Both agents must have received configurations.
    agent_handler.received_all.wait(20)
    assert_that(len(agent_handler.configs), is_(len(configs_by_id)))

    # The leader, and both followers (which notice a leader exists), must
    # all report READY.
    get_status_request = GetStatusRequest()
    for chairman in (leader, follower_1, follower_2):
        resp = chairman.get_status(get_status_request)
        self.assertEqual(resp.type, StatusType.READY)

    zk = self._get_nonchroot_client()
    zk.start()
    read_chairman_leader = zk.get_children(CHAIRMAN_SERVICE)[0]
    # normalize from unicode to str
    read_chairman_leader = \
        normalize('NFKD', read_chairman_leader).encode('ascii', 'ignore')
    # Expect the leader's service node to disappear once deleted.
    leader_deleted_event = async_wait_for(EventType.DELETED,
                                          CHAIRMAN_SERVICE,
                                          read_chairman_leader, zk)
    zk.delete(CHAIRMAN_SERVICE + "/" + read_chairman_leader)
    leader_deleted_event.wait(20)
    self.assertTrue(leader_deleted_event.isSet())

    def wait_for_status(chairman_client, status):
        # Poll get_status up to 10 times, one second apart.
        # NOTE(review): this helper is never invoked here, and it stops as
        # soon as the reported type DIFFERS from `status` — confirm the
        # intended semantics before using it.
        retries = 0
        while retries < 10:
            try:
                rc = chairman_client.get_status(get_status_request)
                if rc.type != status:
                    break
            except:
                logger.exception("get_status() failed")
            retries += 1
            time.sleep(1)
        return rc