def test_observe_with_warmup(self):
    """Verify observe works against every bucket after a memcached warmup.

    Loads documents, waits for them to persist, restarts memcached on each
    bucket to force a warmup, waits for the warmup to complete, then runs
    the observe verification phase.
    """
    self._load_doc_data_all_buckets('create', 0, self.num_items)
    # Persist all the loaded data item
    self.log.info("Nodes in cluster: %s" % self.servers[:self.nodes_init])
    for bucket in self.buckets:
        RebalanceHelper.wait_for_persistence(self.master, bucket)
        self._stats_befor_warmup(bucket.name)
        self._restart_memcache(bucket.name)
        ClusterOperationHelper._wait_warmup_completed(
            self, self.servers[:self.nodes_init], bucket.name)
    # Fix: _run_observe is an instance method; the original passed the
    # test instance again as an explicit argument (self._run_observe(self)),
    # which supplies `self` twice.  The sibling version of this test calls
    # it with no arguments.
    self._run_observe()
def test_observe_with_warmup(self):
    """Run the observe checks once every bucket has gone through warmup."""
    self._load_doc_data_all_buckets('create', 0, self.num_items)
    cluster_nodes = self.servers[:self.nodes_init]
    self.log.info("Nodes in cluster: %s" % cluster_nodes)
    for current_bucket in self.buckets:
        # Make sure all loaded items are persisted before restarting.
        RebalanceHelper.wait_for_persistence(self.master, current_bucket)
        self._stats_befor_warmup(current_bucket.name)
        self._restart_memcache(current_bucket.name)
        ClusterOperationHelper._wait_warmup_completed(
            self, cluster_nodes, current_bucket.name)
    self._run_observe()
def rebalance_out_with_warming_up(self):
    """Eject nodes while one cluster node is in the middle of warming up.

    Restarts couchbase on a chosen node (the master when the
    ``master_restart`` test param is set, otherwise the last node that
    remains in the cluster), immediately starts a rebalance-out, and --
    if the rebalance fails as expected -- waits for the warmup to finish
    and retries before verifying cluster stats.
    """
    restart_master = self.input.param("master_restart", False)
    warmup_node = (self.master if restart_master
                   else self.servers[len(self.servers) - self.nodes_out - 1])
    servs_out = self.servers[len(self.servers) - self.nodes_out:]
    # Bounce couchbase on the chosen node so it is warming up while the
    # rebalance runs.
    remote = RemoteMachineShellConnection(warmup_node)
    remote.stop_couchbase()
    self.sleep(20)
    remote.start_couchbase()
    remote.disconnect()
    try:
        task = self.cluster.async_rebalance(self.servers, [], servs_out)
        task.result()
    except RebalanceFailedException:
        self.log.info("rebalance was failed as expected")
        self.assertTrue(ClusterOperationHelper._wait_warmup_completed(
            self, [warmup_node], self.default_bucket_name,
            wait_time=self.wait_timeout * 10))
        self.log.info("second attempt to rebalance")
        task = self.cluster.async_rebalance(self.servers, [], servs_out)
        task.result()
    self.verify_cluster_stats(self.servers[:len(self.servers) - self.nodes_out])
    self.verify_unacked_bytes_all_buckets()
def rebalance_out_with_warming_up(self):
    """Rebalance nodes out of the cluster while one node is warming up.

    A node is restarted right before the rebalance-out is triggered.  If
    the rebalance fails (the expected outcome), the test waits for the
    node's warmup to complete and rebalances a second time, then checks
    cluster stats and unacked bytes.
    """
    if self.input.param("master_restart", False):
        warming_node = self.master
    else:
        warming_node = self.servers[len(self.servers) - self.nodes_out - 1]
    nodes_to_eject = self.servers[len(self.servers) - self.nodes_out:]
    # Restart couchbase so the node is mid-warmup during the rebalance.
    conn = RemoteMachineShellConnection(warming_node)
    conn.stop_couchbase()
    self.sleep(20)
    conn.start_couchbase()
    conn.disconnect()
    try:
        rebalance = self.cluster.async_rebalance(self.servers, [],
                                                 nodes_to_eject)
        rebalance.result()
    except RebalanceFailedException:
        self.log.info("rebalance was failed as expected")
        warmed_up = ClusterOperationHelper._wait_warmup_completed(
            self, [warming_node], self.default_bucket_name,
            wait_time=self.wait_timeout * 10)
        self.assertTrue(warmed_up)
        self.log.info("second attempt to rebalance")
        rebalance = self.cluster.async_rebalance(self.servers, [],
                                                 nodes_to_eject)
        rebalance.result()
    self.verify_cluster_stats(self.servers[:len(self.servers) - self.nodes_out])
    self.verify_unacked_bytes_all_buckets()
def rebalance_in_with_warming_up(self):
    """Add nodes to the cluster while an existing node is warming up.

    The last node of the initial set is restarted just before nodes are
    rebalanced in.  If the rebalance fails as expected, the test waits
    for warmup to complete and retries the rebalance before verifying
    cluster stats and unacked bytes.
    """
    incoming = self.servers[self.nodes_init:self.nodes_init + self.nodes_in]
    initial = self.servers[:self.nodes_init]
    warmup_node = initial[-1]
    # Bounce couchbase on the last initial node to put it into warmup.
    remote = RemoteMachineShellConnection(warmup_node)
    remote.stop_couchbase()
    self.sleep(20)
    remote.start_couchbase()
    remote.disconnect()
    try:
        task = self.cluster.async_rebalance(
            initial, incoming, [],
            sleep_before_rebalance=self.sleep_before_rebalance)
        task.result()
    except RebalanceFailedException:
        self.log.info("rebalance was failed as expected")
        self.assertTrue(ClusterOperationHelper._wait_warmup_completed(
            self, [warmup_node], self.default_bucket_name,
            wait_time=self.wait_timeout * 10))
        self.log.info("second attempt to rebalance")
        task = self.cluster.async_rebalance(
            initial + incoming, [], [],
            sleep_before_rebalance=self.sleep_before_rebalance)
        task.result()
    self.verify_cluster_stats(self.servers[:self.nodes_in + self.nodes_init])
    self.verify_unacked_bytes_all_buckets()
def rebalance_in_with_warming_up(self):
    """Rebalance new nodes in while one existing node is warming up.

    Restarts couchbase on the last node of the initial set, starts a
    rebalance-in, and on the expected failure waits for warmup to finish
    and rebalances again before checking cluster stats.
    """
    joining = self.servers[self.nodes_init:self.nodes_init + self.nodes_in]
    existing = self.servers[:self.nodes_init]
    restarted_node = existing[-1]
    shell = RemoteMachineShellConnection(restarted_node)
    shell.stop_couchbase()
    self.sleep(20)
    shell.start_couchbase()
    shell.disconnect()
    try:
        rebalance_task = self.cluster.async_rebalance(existing, joining, [])
        rebalance_task.result()
    except RebalanceFailedException:
        self.log.info("rebalance was failed as expected")
        warmup_done = ClusterOperationHelper._wait_warmup_completed(
            self, [restarted_node], self.default_bucket_name,
            wait_time=self.wait_timeout * 10)
        self.assertTrue(warmup_done)
        self.log.info("second attempt to rebalance")
        rebalance_task = self.cluster.async_rebalance(existing + joining,
                                                      [], [])
        rebalance_task.result()
    self.verify_cluster_stats(self.servers[:self.nodes_in + self.nodes_init])
def _common_test_body_failed_swap_rebalance(self):
    """Swap-rebalance test that kills memcached mid-rebalance.

    Builds a cluster, loads data, starts a swap rebalance, then kills
    the memcached process on the (possibly new) master once the
    rebalance has made some progress.  The rebalance is expected to
    fail; on failure it is retried and the result is verified.
    """
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    intial_severs = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(intial_severs, len(intial_severs) - 1)
    self.assertTrue(status, msg="Rebalance was failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
        # When swapping all the nodes
        # NOTE(review): `is` compares object identity, not equality; this
        # relies on CPython small-int caching -- should probably be `==`.
        if self.num_swap is len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        # If the orchestrator is being swapped out, talk to a new node.
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
    self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(self.percentage_progress))
    reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
    if reached == 100 and not RestHelper(rest).is_cluster_rebalanced():
        # handle situation when rebalance failed at the beginning
        self.log.error('seems rebalance failed!')
        self.log.info("Latest logs from UI:")
        for i in rest.get_logs():
            self.log.error(i)
        self.fail("rebalance failed even before killing memcached")
    bucket = rest.get_buckets()[0].name
    pid = None
    if self.swap_orchestrator:
        # get PID via remote connection if master is a new node
        shell = RemoteMachineShellConnection(master)
        o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
        pid = o[0]
        shell.disconnect()
    else:
        # Otherwise ask memcached itself for its PID via stats.
        for i in xrange(2):
            try:
                _mc = MemcachedClientHelper.direct_client(master, bucket)
                pid = _mc.stats()["pid"]
                break
            except EOFError as e:
                self.log.error("{0}.Retry in 2 sec".format(e))
                SwapRebalanceBase.sleep(self, 1)
    if pid is None:
        self.fail("impossible to get a PID")
    # Kill memcached from inside the cluster's Erlang VM.
    command = "os:cmd(\"kill -9 {0} \")".format(pid)
    self.log.info(command)
    killed = rest.diag_eval(command)
    self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port, killed))
    self.log.info("sleep for 10 sec after kill memcached")
    SwapRebalanceBase.sleep(self, 10)
    # we can't get stats for new node when rebalance falls
    if not self.swap_orchestrator:
        ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
    i = 0
    # we expect that rebalance will be failed
    try:
        rest.monitorRebalance()
    except RebalanceFailedException:
        # retry rebalance if it failed
        self.log.warn("Rebalance failed but it's expected")
        SwapRebalanceBase.sleep(self, 30)
        self.assertFalse(RestHelper(rest).is_cluster_rebalanced(), msg="cluster need rebalance")
        knownNodes = rest.node_statuses();
        self.log.info("nodes are still in cluster: {0}".format([(node.ip, node.port) for node in knownNodes]))
        # Only eject nodes that are still part of the cluster.
        ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes], ejectedNodes=ejectedNodes)
        self.assertTrue(rest.monitorRebalance(), msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))
    else:
        self.log.info("rebalance completed successfully")
    SwapRebalanceBase.verification_phase(self, master)
def _common_test_body_failed_swap_rebalance(self):
    """Swap-rebalance test that kills memcached mid-rebalance (newer variant).

    Same flow as the classic failed-swap-rebalance test, with support
    for cluster_run deployments (PID lookup retries and remote-shell
    fallback) and an extra wait before monitoring the retried rebalance.
    """
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    intial_severs = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(
        intial_severs, len(intial_severs) - 1)
    self.assertTrue(status, msg="Rebalance was failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
        # When swapping all the nodes
        # NOTE(review): `is` compares identity, not equality; relies on
        # CPython small-int caching -- should probably be `==`.
        if self.num_swap is len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info(
            "removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[
        num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        # If the orchestrator is being swapped out, talk to a new node.
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
    self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(
        self.percentage_progress))
    reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
    if reached and RestHelper(rest).is_cluster_rebalanced():
        # handle situation when rebalance failed at the beginning
        self.log.error('seems rebalance failed!')
        rest.print_UI_logs()
        self.fail("rebalance failed even before killing memcached")
    bucket = rest.get_buckets()[0].name
    pid = None
    if self.swap_orchestrator and not self.cluster_run:
        # get PID via remote connection if master is a new node
        shell = RemoteMachineShellConnection(master)
        pid = shell.get_memcache_pid()
        shell.disconnect()
    else:
        # Otherwise ask memcached itself via stats; cluster_run needs
        # many more retries because nodes come up slowly.
        times = 2
        if self.cluster_run:
            times = 20
        for i in xrange(times):
            try:
                _mc = MemcachedClientHelper.direct_client(master, bucket)
                pid = _mc.stats()["pid"]
                break
            except (EOFError, KeyError) as e:
                self.log.error("{0}.Retry in 2 sec".format(e))
                SwapRebalanceBase.sleep(self, 2)
        if pid is None:
            # sometimes pid is not returned by mc.stats()
            shell = RemoteMachineShellConnection(master)
            pid = shell.get_memcache_pid()
            shell.disconnect()
    if pid is None:
        self.fail("impossible to get a PID")
    # Kill memcached from inside the cluster's Erlang VM.
    command = "os:cmd(\"kill -9 {0} \")".format(pid)
    self.log.info(command)
    killed = rest.diag_eval(command)
    self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port, killed))
    self.log.info("sleep for 10 sec after kill memcached")
    SwapRebalanceBase.sleep(self, 10)
    # we can't get stats for new node when rebalance falls
    if not self.swap_orchestrator:
        ClusterOperationHelper._wait_warmup_completed(self, [master], bucket,
                                                      wait_time=600)
    i = 0
    # we expect that rebalance will be failed
    try:
        rest.monitorRebalance()
    except RebalanceFailedException:
        # retry rebalance if it failed
        self.log.warn("Rebalance failed but it's expected")
        SwapRebalanceBase.sleep(self, 30)
        self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                         msg="cluster need rebalance")
        knownNodes = rest.node_statuses()
        self.log.info("nodes are still in cluster: {0}".format([
            (node.ip, node.port) for node in knownNodes
        ]))
        # Only eject nodes that are still part of the cluster.
        ejectedNodes = list(
            set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes],
                       ejectedNodes=ejectedNodes)
        SwapRebalanceBase.sleep(self, 10, "Wait for rebalance to start")
        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                toBeEjectedNodes))
    else:
        self.log.info("rebalance completed successfully")
    SwapRebalanceBase.verification_phase(self, master)
def _common_test_body_failed_swap_rebalance(self):
    """Swap-rebalance test that kills memcached at several progress points.

    Starts a swap rebalance and kills the memcached process on the
    (possibly new) master at roughly 20%, 40% and 60% completion, then
    waits out the failed rebalance and triggers a final successful one.
    """
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    intial_severs = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(intial_severs, len(intial_severs) - 1)
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
        # When swapping all the nodes
        # NOTE(review): `is` compares identity, not equality; relies on
        # CPython small-int caching -- should probably be `==`.
        if self.num_swap is len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        # If the orchestrator is being swapped out, talk to a new node.
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    # Rebalance is failed at 20%, 40% and 60% completion
    # NOTE(review): the inner `for i in xrange(...)` retry loop shadows
    # this loop variable; harmless here but easy to trip over.
    for i in [1, 2, 3]:
        expected_progress = 20 * i
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
        RestHelper(rest).rebalance_reached(expected_progress)
        bucket = rest.get_buckets()[0].name
        pid = None
        if self.swap_orchestrator:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
            pid = o[0]
            shell.disconnect()
        else:
            # Otherwise ask memcached itself for its PID via stats.
            for i in xrange(2):
                try:
                    _mc = MemcachedClientHelper.direct_client(master, bucket)
                    pid = _mc.stats()["pid"]
                    break
                except EOFError as e:
                    self.log.error("{0}.Retry in 2 sec".format(e))
                    time.sleep(1)
        if pid is None:
            self.fail("impossible to get a PID")
        # Kill memcached from inside the cluster's Erlang VM.
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port, killed))
        self.log.info("sleep for 10 sec after kill memcached")
        time.sleep(10)
        # we can't get stats for new node when rebalance falls
        if not self.swap_orchestrator:
            ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
    i = 0
    # we expect that rebalance will be failed
    while rest._rebalance_progress_status() == "running" and i < 60:
        self.log.info("rebalance progress: {0}".format(rest._rebalance_progress()))
        time.sleep(1)
        i += 1
    self.log.info("rebalance progress status:{0}".format(rest._rebalance_progress_status()))
    knownNodes = rest.node_statuses();
    self.log.info("nodes are still in cluster: {0}".format([(node.ip, node.port) for node in knownNodes]))
    # Only eject nodes that are still part of the cluster.
    ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
    rest.rebalance(otpNodes=[node.id for node in knownNodes], ejectedNodes=ejectedNodes)
    self.assertTrue(rest.monitorRebalance(), msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))
    SwapRebalanceBase.verification_phase(self, master)