def rebalancing(self, task_manager):
    rest = RestConnection(self.servers[0])
    progress = rest._rebalance_progress()
    # Use equality, not identity: progress may be a float or a non-cached int
    if progress != -1 and progress != 100:
        task_manager.schedule(self, 10)
    else:
        self.state = "finishing"
        self.set_result({"status": "success", "value": None})
def check(self, task_manager):
    rest = RestConnection(self.servers[0])
    try:
        progress = rest._rebalance_progress()
    except RebalanceFailedException as ex:
        self.state = FINISHED
        self.set_exception(ex)
        # Stop here: 'progress' is undefined after a failed progress call
        return
    if progress != -1 and progress != 100:
        task_manager.schedule(self, 10)
    else:
        self.log.info("rebalancing was completed with progress: {0}%"
                      .format(progress))
        self.state = FINISHED
        self.set_result(True)
def rebalancing(self, task_manager):
    rest = RestConnection(self.servers[0])
    try:
        progress = rest._rebalance_progress()
        if progress != -1 and progress != 100:
            if self.do_stop and progress >= self.progress:
                self.state = "stop_rebalance"
                task_manager.schedule(self, 1)
            else:
                task_manager.schedule(self, 1)
        else:
            self.state = "finishing"
            self.set_result({"status": "success", "value": None})
            self.log.info("tcmalloc fragmentation stats after rebalance")
            self.getStats(self.servers[0])
    except Exception as e:
        self.state = "finishing"
        self.set_result({"status": "error", "value": e})
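# The three task methods above share one polling pattern: read the rebalance
# progress, reschedule while the run is still in flight, and finish (or fail)
# once the progress code leaves the "running" range. Below is a minimal,
# framework-free sketch of that pattern, not part of the test framework;
# `poll_rebalance` and `get_progress` are illustrative names, and the -1/100
# sentinel values are assumed to mean "not running" and "complete", as in the
# code above.
import time

def poll_rebalance(get_progress, interval=10, timeout=1200):
    """Poll get_progress() until the rebalance completes or times out."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        progress = get_progress()
        if progress == -1 or progress == 100:
            # -1: no rebalance running (finished or failed), 100: complete
            return progress
        time.sleep(interval)
    raise RuntimeError("rebalance did not complete within {0}s".format(timeout))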
def _common_test_body_swap_rebalance(self, do_stop_start=False):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    intial_severs = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(
        intial_severs, len(intial_severs) - 1)
    self.assertTrue(status, msg="Rebalance failed")

    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)

    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[
        num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%"
                          .format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    SwapRebalanceBase.sleep(self, 20)
                    rest.rebalance(
                        otpNodes=[node.id for node in rest.node_statuses()],
                        ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    SwapRebalanceBase.sleep(self, 1)

    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}"
                    .format(optNodesIds))
    SwapRebalanceBase.verification_phase(self, master)
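# The stop/start loop above (stop the rebalance once it passes 20%, 40% and
# 60%, then restart it) is repeated almost verbatim in several tests below.
# A hedged sketch of how it could be factored into a standalone helper is
# shown here; `stop_and_restart_rebalance_at` is a hypothetical name, and the
# helper only assumes a RestConnection-like object exposing
# _rebalance_progress(), stop_rebalance(), node_statuses() and rebalance(),
# as used in the surrounding tests.
import time

def stop_and_restart_rebalance_at(rest, ejected_node_ids,
                                  checkpoints=(20, 40, 60),
                                  max_polls=100, settle_secs=20):
    """Stop the running rebalance at each progress checkpoint, then restart it."""
    for expected_progress in checkpoints:
        for _ in range(max_polls):
            progress = rest._rebalance_progress()
            if progress < 0 or progress == 100:
                # Rebalance failed, is not running, or has already finished
                break
            if progress >= expected_progress:
                if rest.stop_rebalance():
                    time.sleep(settle_secs)
                    rest.rebalance(
                        otpNodes=[node.id for node in rest.node_statuses()],
                        ejectedNodes=ejected_node_ids)
                break
            time.sleep(1)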
def test_rebalance_inout_with_durability_failure(self):
    """
    Perform irregular number of in_out nodes
    1. Swap-out 'self.nodes_out' nodes
    2. Add nodes using 'self.nodes_in' such that
       replica_number > nodes_in_cluster
    3. Perform swap-rebalance
    4. Make sure durability is not broken due to swap-rebalance
    5. Add back a node and do CRUD on the bucket
    6. Verify durability works after node addition

    Note: This is a negative case, i.e. durability will be broken
    """
    master = self.cluster.master
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    def_bucket = self.bucket_util.buckets[0]

    # TODO: Enable verification
    """
    vbucket_info_dict = dict()

    # Cb stat object for verification purpose
    master_shell_conn = RemoteMachineShellConnection(master)
    master_node_cb_stat = Cbstats(master_shell_conn)

    # Update each vbucket's seq_no for latest value for verification
    for vb_num in range(0, self.vbuckets):
        vbucket_info_dict[vb_num] = master_node_cb_stat.vbucket_seqno(
            def_bucket.name, vb_num, "abs_high_seqno")
    """

    # Rest connection to add/rebalance/monitor nodes
    rest = RestConnection(master)

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                  howmany=self.nodes_out)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(
                            status, content))
        if self.nodes_out == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info(
            "removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[
        num_initial_servers:num_initial_servers + self.nodes_in]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if self.do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info(
                "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%"
                .format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error(
                        "rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info(
                        "Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    self.sleep(20)
                    rest.rebalance(
                        otpNodes=[node.id for node in rest.node_statuses()],
                        ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    self.sleep(1)

    self.assertTrue(
        rest.monitorRebalance(),
        msg="rebalance operation failed after adding node {0}".format(
            optNodesIds))

    # TODO: There will be failure in doc_count verification due to
    # swap_rebalance. Need to update verification steps accordingly to
    # satisfy this
    self.verification_phase()

    # Add back the first ejected node into the cluster
    self.task.rebalance(self.cluster.nodes_in_cluster,
                        [toBeEjectedNodes[0]], [])

    # Load docs into all vbuckets to verify durability
    gen_create = doc_generator('test_', 0, self.num_items)
    task = self.task.async_load_gen_docs_atomicity(
        self.cluster, def_bucket, gen_create, self.op_type, exp=0,
        batch_size=10, process_concurrency=8,
        replicate_to=self.replicate_to, persist_to=self.persist_to,
        timeout_secs=self.sdk_timeout, retries=self.sdk_retries,
        transaction_timeout=self.transaction_timeout,
        commit=self.transaction_commit)
    self.task_manager.get_task_result(task)
def test_rebalance_inout_with_durability_check(self):
    """
    Perform irregular number of in_out nodes
    1. Swap-out 'self.nodes_out' nodes
    2. Add 'self.nodes_in' nodes into the cluster
    3. Perform swap-rebalance
    4. Make sure durability is not broken due to swap-rebalance

    Note: This is a positive case, i.e. durability should not be broken
    """
    master = self.cluster.master
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    def_bucket = self.bucket_util.buckets[0]

    # Update replica value before performing rebalance in/out
    if self.replica_to_update:
        bucket_helper = BucketHelper(self.cluster.master)

        # Recalculate replicate_to/persist_to as per new replica value
        if self.durability_level is None:
            self.replicate_to = floor(self.replica_to_update / 2) + 1
            self.persist_to = floor(self.replica_to_update / 2) + 2

        # Update bucket replica to new value as given in conf file
        self.log.info("Updating replica count of bucket to {0}".format(
            self.replica_to_update))
        bucket_helper.change_bucket_props(
            def_bucket.name, replicaNumber=self.replica_to_update)

    # Rest connection to add/rebalance/monitor nodes
    rest = RestConnection(master)

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                  howmany=self.nodes_out)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(
                            status, content))
        if self.nodes_out == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info(
            "removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[
        num_initial_servers:num_initial_servers + self.nodes_in]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if self.do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info(
                "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%"
                .format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error(
                        "rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info(
                        "Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    self.sleep(20)
                    rest.rebalance(
                        otpNodes=[node.id for node in rest.node_statuses()],
                        ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    self.sleep(1)

    self.assertTrue(
        rest.monitorRebalance(),
        msg="rebalance operation failed after adding node {0}".format(
            optNodesIds))
    self.verification_phase()
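# For reference, the replicate_to/persist_to recalculation used above is just
# floor(replica / 2) + 1 and floor(replica / 2) + 2. The standalone snippet
# below (not part of the test class; it only assumes `floor` is math.floor,
# as the formula suggests) prints the resulting values for a few replica
# counts.
from math import floor

for replicas in (1, 2, 3):
    replicate_to = floor(replicas / 2) + 1
    persist_to = floor(replicas / 2) + 2
    print("replicas={0} -> replicate_to={1}, persist_to={2}".format(
        replicas, replicate_to, persist_to))
# replicas=1 -> replicate_to=1, persist_to=2
# replicas=2 -> replicate_to=2, persist_to=3
# replicas=3 -> replicate_to=2, persist_to=3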
def _common_test_body_swap_rebalance(self, do_stop_start=False):
    master = self.cluster.master
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    intial_severs = self.servers[1:num_initial_servers]

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status = self.task.rebalance(self.cluster.servers[:self.nodes_init],
                                 intial_severs, [])
    self.assertTrue(status, msg="Rebalance failed")

    self.log.info("CREATE BUCKET PHASE")
    self.create_buckets()

    self.log.info("DATA LOAD PHASE")
    self.loaders = self.start_load_phase()

    # Wait till load phase is over
    self.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                  howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(
                            status, content))
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info(
            "removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[
        num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info(
                "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%"
                .format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error(
                        "rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info(
                        "Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    self.sleep(20)
                    rest.rebalance(
                        otpNodes=[node.id for node in rest.node_statuses()],
                        ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    self.sleep(1)

    self.assertTrue(
        rest.monitorRebalance(),
        msg="rebalance operation failed after adding node {0}".format(
            optNodesIds))
    self.verification_phase()
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    intial_severs = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(intial_severs, len(intial_severs) - 1)

    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)

    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[
        num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    # Rebalance is failed at 20%, 40% and 60% completion
    for i in [1, 2, 3]:
        expected_progress = 20 * i
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
        RestHelper(rest).rebalance_reached(expected_progress)
        bucket = rest.get_buckets()[0].name
        pid = None
        if self.swap_orchestrator:
            # Get the memcached PID via a remote connection
            # when the master is a new node
            shell = RemoteMachineShellConnection(master)
            o, _ = shell.execute_command(
                "ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
            pid = o[0]
            shell.disconnect()
        else:
            for _ in xrange(2):
                try:
                    _mc = MemcachedClientHelper.direct_client(master, bucket)
                    pid = _mc.stats()["pid"]
                    break
                except EOFError as e:
                    self.log.error("{0}. Retry in 2 sec".format(e))
                    time.sleep(2)
        if pid is None:
            self.fail("impossible to get a PID")
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}? {2}".format(master.ip, master.port, killed))
        self.log.info("sleep for 10 sec after kill memcached")
        time.sleep(10)

        # We can't get stats for the new node when the rebalance fails
        if not self.swap_orchestrator:
            ClusterOperationHelper._wait_warmup_completed(self, [master], bucket,
                                                          wait_time=600)
        # We expect the rebalance to fail
        i = 0
        while rest._rebalance_progress_status() == "running" and i < 60:
            self.log.info("rebalance progress: {0}".format(rest._rebalance_progress()))
            time.sleep(1)
            i += 1
        self.log.info("rebalance progress status: {0}".format(
            rest._rebalance_progress_status()))

        knownNodes = rest.node_statuses()
        self.log.info("nodes are still in cluster: {0}".format(
            [(node.ip, node.port) for node in knownNodes]))
        ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes],
                       ejectedNodes=ejectedNodes)

    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}"
                    .format(toBeEjectedNodes))
    SwapRebalanceBase.verification_phase(self, master)
def _common_test_body_swap_rebalance(self, do_stop_start=False):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    intial_severs = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(intial_severs, len(intial_severs) - 1)

    self.log.info("DATA LOAD PHASE")
    loaders = SwapRebalanceBase.start_load_phase(self, master)

    # Wait till load phase is over
    SwapRebalanceBase.stop_load(loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = ClusterHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info(
            "removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[
        num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        loaders = SwapRebalanceBase.start_access_phase(self, master)

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        self.log.info("STOP/START SWAP REBALANCE PHASE")
        retry = 0
        for expected_progress in (20, 40, 60):
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error(
                        "rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info(
                        "Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    time.sleep(20)
                    rest.rebalance(
                        otpNodes=[node.id for node in rest.node_statuses()],
                        ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    time.sleep(1)
            # self.assertTrue(reached,
            #                 "rebalance failed or did not reach {0}%".format(expected_progress))

    self.assertTrue(
        rest.monitorRebalance(),
        msg="rebalance operation failed after adding node {0}".format(
            optNodesIds))

    # Stop loaders
    SwapRebalanceBase.stop_load(loaders)
    self.log.info("DONE DATA ACCESS PHASE")
    # for bucket in rest.get_buckets():
    #     SwapRebalanceBase.verify_data(new_swap_servers[0],
    #                                   bucket_data[bucket.name].get('inserted_keys'),
    #                                   bucket.name, self)
    # RebalanceHelper.wait_for_persistence(master, bucket.name)

    self.log.info("VERIFICATION PHASE")
    SwapRebalanceBase.items_verification(master, self)