def _common_test_body_failed_swap_rebalance(self):
    """Swap-rebalance the cluster, kill memcached mid-way, then recover.

    Phases, in order:
      1. Create buckets and rebalance in the first
         ``self.num_initial_servers`` entries of ``self.servers``.
      2. Run the load phase and wait for it to finish.
      3. Pick ``self.num_swap`` nodes to eject (forcing the orchestrator
         into that set when ``self.swap_orchestrator`` is set) and add the
         same number of new servers.
      4. Start the access phase and the swap rebalance, wait until
         ``self.percentage_progress`` is reached, then ``kill -9`` the
         memcached process through ``diag_eval``.
      5. Expect ``RebalanceFailedException`` from ``monitorRebalance``;
         on failure start the rebalance again with the nodes that are still
         in the cluster and assert that it succeeds.
      6. Run the verification phase.

    Reads ``self.servers``, ``self.num_initial_servers``, ``self.input``,
    ``self.num_swap``, ``self.swap_orchestrator``, ``self.cluster_run`` and
    ``self.percentage_progress``; stores the loader handles in
    ``self.loaders``.

    Failures are reported through ``self.assertTrue`` / ``self.assertFalse``
    / ``self.fail`` (presumably unittest-style; confirm against the
    enclosing test class).
    """
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    self.create_buckets()

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, _ = RebalanceHelper.rebalance_in(initial_servers,
                                             len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance was failed")

    self.log.info("DATA LOAD PHASE")
    self.loaders = self.start_load_phase()

    # Wait till load phase is over
    self.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                  howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(
                            status, content))
        # When swapping all the nodes.
        # Compare by value: `is` on ints tests object identity and is only
        # accidentally true for small integers cached by CPython.
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info(
            "removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[
        num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        # The old orchestrator is being ejected, so talk to a new node.
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    self.log.info("DATA ACCESS PHASE")
    self.loaders = self.start_access_phase()

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.sleep(10, "Rebalance should start")
    self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(
        self.percentage_progress))
    reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
    if reached and RestHelper(rest).is_cluster_rebalanced():
        # handle situation when rebalance failed at the beginning
        self.log.error('seems rebalance failed!')
        rest.print_UI_logs()
        self.fail("rebalance failed even before killing memcached")

    bucket = self.bucket_util.buckets[0]
    pid = None
    if self.swap_orchestrator and not self.cluster_run:
        # get PID via remote connection if master is a new node
        shell = RemoteMachineShellConnection(master)
        try:
            pid = shell.get_memcache_pid()
        finally:
            # Always release the shell, even if the PID lookup raises.
            shell.disconnect()
    else:
        times = 20 if self.cluster_run else 2
        for _ in xrange(times):
            try:
                # Connect to `master`: the kill below is issued through
                # `rest`, which points at `master`, so the PID must come
                # from that same node (not from the leftover `server`
                # variable of the add-node loop above).
                shell = RemoteMachineShellConnection(master)
                try:
                    pid = shell.get_memcache_pid()
                finally:
                    shell.disconnect()
                break
            except EOFError as e:
                self.log.error("{0}.Retry in 2 sec".format(e))
                self.sleep(2)
    if pid is None:
        self.fail("impossible to get a PID")

    command = "os:cmd(\"kill -9 {0} \")".format(pid)
    self.log.info(command)
    killed = rest.diag_eval(command)
    self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port,
                                                 killed))
    self.log.info("sleep for 10 sec after kill memcached")
    self.sleep(10)

    # we can't get stats for new node when rebalance falls
    if not self.swap_orchestrator:
        self.bucket_util._wait_warmup_completed([master], bucket,
                                                wait_time=600)

    # we expect that rebalance will be failed
    try:
        rest.monitorRebalance()
    except RebalanceFailedException:
        # retry rebalance if it failed
        self.log.warn("Rebalance failed but it's expected")
        self.sleep(30)
        self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                         msg="cluster need rebalance")
        knownNodes = rest.node_statuses()
        self.log.info("nodes are still in cluster: {0}".format(
            [(node.ip, node.port) for node in knownNodes]))
        # Only eject nodes that are still part of the cluster.
        ejectedNodes = list(
            set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes],
                       ejectedNodes=ejectedNodes)
        self.assertTrue(
            rest.monitorRebalance(),
            msg="Rebalance failed after adding node {0}".format(
                toBeEjectedNodes))
    else:
        self.log.info("rebalance completed successfully")

    self.verification_phase()
def _common_test_body_failed_swap_rebalance(self):
    """Swap-rebalance the cluster, kill memcached mid-way, then recover.

    Phases, in order:
      1. Create buckets and rebalance in the first
         ``self.num_initial_servers`` entries of ``self.servers``.
      2. Run the load phase against the master and wait for it to finish.
      3. Pick ``self.num_swap`` nodes to eject (forcing the orchestrator
         into that set when ``self.swap_orchestrator`` is set) and add the
         same number of new servers.
      4. Start the access phase and the swap rebalance, wait until
         ``self.percentage_progress`` is reached, then ``kill -9`` the
         memcached process through ``diag_eval``.
      5. Expect ``RebalanceFailedException`` from ``monitorRebalance``;
         on failure start the rebalance again with the nodes that are still
         in the cluster and assert that it succeeds.
      6. Run the verification phase against the master.

    Reads ``self.servers``, ``self.num_initial_servers``, ``self.input``,
    ``self.num_swap``, ``self.swap_orchestrator``, ``self.cluster_run`` and
    ``self.percentage_progress``; stores the loader handles in
    ``self.loaders``.

    Failures are reported through ``self.assertTrue`` / ``self.assertFalse``
    / ``self.fail`` (presumably unittest-style; confirm against the
    enclosing test class).
    """
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, _ = RebalanceHelper.rebalance_in(initial_servers,
                                             len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance was failed")

    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)

    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                  howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(
            status,
            msg="Unable to find orchestrator: {0}:{1}".format(status,
                                                              content))
        # When swapping all the nodes.
        # Compare by value: `is` on ints tests object identity and is only
        # accidentally true for small integers cached by CPython.
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info(
            "removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[
        num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        # The old orchestrator is being ejected, so talk to a new node.
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
    self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(
        self.percentage_progress))
    reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
    if reached and RestHelper(rest).is_cluster_rebalanced():
        # handle situation when rebalance failed at the beginning
        self.log.error("seems rebalance failed!")
        rest.print_UI_logs()
        self.fail("rebalance failed even before killing memcached")

    bucket = rest.get_buckets()[0].name
    pid = None
    if self.swap_orchestrator and not self.cluster_run:
        # get PID via remote connection if master is a new node
        shell = RemoteMachineShellConnection(master)
        pid = shell.get_memcache_pid()
        shell.disconnect()
    else:
        times = 20 if self.cluster_run else 2
        for _ in xrange(times):
            try:
                # NOTE(review): the direct client is never closed here;
                # confirm whether it needs an explicit close.
                _mc = MemcachedClientHelper.direct_client(master, bucket)
                pid = _mc.stats()["pid"]
                break
            except EOFError as e:
                self.log.error("{0}.Retry in 2 sec".format(e))
                SwapRebalanceBase.sleep(self, 2)
    if pid is None:
        self.fail("impossible to get a PID")

    command = 'os:cmd("kill -9 {0} ")'.format(pid)
    self.log.info(command)
    killed = rest.diag_eval(command)
    self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port,
                                                 killed))
    self.log.info("sleep for 10 sec after kill memcached")
    SwapRebalanceBase.sleep(self, 10)

    # we can't get stats for new node when rebalance falls
    if not self.swap_orchestrator:
        ClusterOperationHelper._wait_warmup_completed(self, [master],
                                                      bucket,
                                                      wait_time=600)

    # we expect that rebalance will be failed
    try:
        rest.monitorRebalance()
    except RebalanceFailedException:
        # retry rebalance if it failed
        self.log.warn("Rebalance failed but it's expected")
        SwapRebalanceBase.sleep(self, 30)
        self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                         msg="cluster need rebalance")
        knownNodes = rest.node_statuses()
        self.log.info("nodes are still in cluster: {0}".format(
            [(node.ip, node.port) for node in knownNodes]))
        # Only eject nodes that are still part of the cluster.
        ejectedNodes = list(
            set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes],
                       ejectedNodes=ejectedNodes)
        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                toBeEjectedNodes))
    else:
        self.log.info("rebalance completed successfully")

    SwapRebalanceBase.verification_phase(self, master)