def test_MB_34947(self):
    """Regression test for MB-34947.

    Re-writes pre-created documents through the async doc loader, bumps
    the bucket's replica count, then verifies that the follow-up
    rebalance completes successfully.
    """
    # Re-write the already-created docs using async writes
    update_gen = doc_generator(self.key, 0, self.num_items,
                               key_size=self.key_size,
                               doc_size=self.doc_size,
                               doc_type=self.doc_type,
                               vbuckets=self.cluster.vbuckets)
    update_task = self.task.async_load_gen_docs(
        self.cluster, self.def_bucket, update_gen, "update", 0,
        persist_to=self.persist_to, replicate_to=self.replicate_to,
        timeout_secs=self.sdk_timeout, batch_size=10,
        process_concurrency=8)
    self.task.jython_task_manager.get_task_result(update_task)

    # Bump the bucket's replica count to the new target value
    helper = BucketHelper(self.cluster.master)
    helper.change_bucket_props(self.def_bucket,
                               replicaNumber=self.new_replica)
    self.bucket_util.print_bucket_stats(self.cluster)

    # Rebalance so the new replica count takes effect cluster-wide
    rebalance_task = self.task.async_rebalance(self.cluster.servers, [], [])
    self.sleep(10, "Wait for rebalance to start")
    self.task.jython_task_manager.get_task_result(rebalance_task)
    # The rebalance must succeed for the replica update to be considered valid
    self.assertTrue(rebalance_task.result,
                    "Rebalance failed after replica update")
def update_bucket_replica(self):
    """Set every bucket's replica count to ``self.replicas_for_failover``,
    rebalance so the change takes effect, then log pre-failover stats.
    """
    self.log.info("Updating all the bucket replicas to {0}".format(
        self.replicas_for_failover))
    # Hoisted out of the loop: one helper against the same master node
    # serves every bucket (the original rebuilt it per iteration).
    bucket_helper = BucketHelper(self.cluster.master)
    for bucket in self.bucket_util.buckets:
        bucket_helper.change_bucket_props(
            bucket, replicaNumber=self.replicas_for_failover)
    # Rebalance is required for the new replica count to be applied
    task = self.task.async_rebalance(
        self.cluster.servers[:self.nodes_init], [], [])
    self.task.jython_task_manager.get_task_result(task)
    self.log.info("Bucket stats before failover")
    self.bucket_util.print_bucket_stats()
def test_multiple_scenarios(self):
    """
    Test multiple rebalance scenarios in single test with CRUDs in parallel
    1. Rebalance_out orchestrator node
    2. Rebalance_in nodes as given in nodes_in param
    3. Update replica and do rebalance
    4. Rebalance_out nodes as given in nodes_out param
    5. Do Plain CRUDs at the end of all this to verify the cluster status
    """
    # Local function to wait for all crud task to complete
    def wait_for_crud_task_and_verify_for_no_errors(tasks_info):
        # NOTE(review): for atomicity (transaction) runs no per-op failure
        # info is available, so verification is skipped entirely.
        if not self.atomicity:
            self.bucket_util.verify_doc_op_task_exceptions(
                tasks_info, self.cluster)
            self.bucket_util.log_doc_ops_task_failures(tasks_info)
            for task, task_info in tasks_info.items():
                self.assertFalse(
                    task_info["ops_failed"],
                    "Doc ops failed for task: {}".format(task.thread_name))

    # replica_to_update is mandatory for this test
    self.assertTrue(self.replica_to_update is not None)
    def_bucket = self.bucket_util.buckets[0]
    # Nodes to add are taken from just past the initial cluster;
    # nodes to remove are taken from the tail of the initial cluster.
    servers_in = [
        self.cluster.servers[self.nodes_init + i]
        for i in range(self.nodes_in)
    ]
    servers_out = [
        self.cluster.servers[self.nodes_init - i - 1]
        for i in range(self.nodes_out)
    ]

    # Start CRUD operations
    crud_tasks = self.__load_docs_in_all_buckets()
    # Rebalance_out the orchestrator node
    rebalance_result = self.task.rebalance(
        self.cluster.servers[:self.nodes_init], [],
        [self.cluster.servers[0]])
    self.assertTrue(rebalance_result,
                    "Rebalance out orchestrator node failed")
    # Wait for all CRUD tasks to complete and verify no failures are seen
    # Orchestrator (servers[0]) is gone, so servers[1] becomes master.
    self.cluster.master = self.servers[1]
    wait_for_crud_task_and_verify_for_no_errors(crud_tasks)
    self.cluster.nodes_in_cluster = self.servers[1:self.nodes_init]

    # Start CRUD operations
    crud_tasks = self.__load_docs_in_all_buckets()
    # Rebalance_in multiple cluster nodes
    self.add_remove_servers_and_rebalance(servers_in, [])
    wait_for_crud_task_and_verify_for_no_errors(crud_tasks)

    # Start CRUD operations
    crud_tasks = self.__load_docs_in_all_buckets()
    # Update bucket replica value
    bucket_helper = BucketHelper(self.cluster.servers[1])
    bucket_helper.change_bucket_props(
        def_bucket, replicaNumber=self.replica_to_update)
    # Start and wait till rebalance is complete
    rebalance = self.task.async_rebalance(self.cluster.nodes_in_cluster,
                                          [], [])
    self.task.jython_task_manager.get_task_result(rebalance)
    wait_for_crud_task_and_verify_for_no_errors(crud_tasks)

    # Start CRUD operations
    crud_tasks = self.__load_docs_in_all_buckets()
    # Rebalance_out multiple cluster nodes
    self.add_remove_servers_and_rebalance([], servers_out)
    wait_for_crud_task_and_verify_for_no_errors(crud_tasks)

    # Start CRUD operations on the settled cluster to verify its status
    crud_tasks = self.__load_docs_in_all_buckets()
    wait_for_crud_task_and_verify_for_no_errors(crud_tasks)

    # Doc count verification
    if not self.atomicity:
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.verify_stats_all_buckets(self.num_items)
def test_rebalance_inout_with_durability_check(self):
    """
    Perform irregular number of in_out nodes
    1. Swap-out 'self.nodes_out' nodes
    2. Add 'self.nodes_in' nodes into the cluster
    3. Perform swap-rebalance
    4. Make sure durability is not broken due to swap-rebalance

    Note: This is a Positive case. i.e: Durability should not be broken
    """
    master = self.cluster.master
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    def_bucket = self.bucket_util.buckets[0]

    # Update replica value before performing rebalance in/out
    if self.replica_to_update:
        bucket_helper = BucketHelper(self.cluster.master)

        # Recalculate replicate_to/persist_to as per new replica value
        # BUGFIX: was `self.self.durability_level`, which raised
        # AttributeError whenever this branch was reached.
        if self.durability_level is None:
            self.replicate_to = floor(self.replica_to_update / 2) + 1
            self.persist_to = floor(self.replica_to_update / 2) + 2

        # Update bucket replica to new value as given in conf file
        self.log.info("Updating replica count of bucket to {0}".format(
            self.replica_to_update))
        bucket_helper.change_bucket_props(
            def_bucket.name, replicaNumber=self.replica_to_update)

    # Rest connection to add/rebalance/monitor nodes
    rest = RestConnection(master)

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(
        master, howmany=self.nodes_out)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(
                            status, content))
        # BUGFIX: was `is len(current_nodes)` - identity comparison on an
        # int is implementation-dependent; value equality is intended.
        if self.nodes_out == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info(
            "removing node {0} and rebalance afterwards".format(node))

    # Add the swap-in nodes before ejecting the chosen ones
    new_swap_servers = self.servers[
        num_initial_servers:num_initial_servers + self.nodes_in]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        # Orchestrator is being swapped out; talk to a newly added node
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if self.do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info(
                "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".
                format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    # Negative progress => rebalance errored out
                    self.log.error(
                        "rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info(
                        "Rebalance will be stopped with {0}%".format(
                            progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped,
                                    msg="unable to stop rebalance")
                    self.sleep(20)
                    # Restart the rebalance and move to next checkpoint
                    rest.rebalance(otpNodes=[
                        node.id for node in rest.node_statuses()
                    ], ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    self.sleep(1)
    self.assertTrue(
        rest.monitorRebalance(),
        msg="rebalance operation failed after adding node {0}".format(
            optNodesIds))
    self.verification_phase()
def test_volume_taf(self):
    """Volume test: repeated rebalance / process-kill / failover cycles
    with collection data load running in parallel, followed by data
    validation after each step.

    Steps 1-4 (cluster, bucket, collection setup) happen in setUp;
    this method drives steps 5-19 for ``self.iterations`` loops.
    """
    self.loop = 0
    # self.cluster_utils.set_metadata_purge_interval()
    if self.number_of_indexes > 0:
        # start running select queries thread
        self.query_thread = threading.Thread(target=self.run_select_query)
        self.query_thread_flag = True
        self.query_thread.start()
        # Start running ui stats queries thread
        self.ui_stats_thread = threading.Thread(
            target=self.run_ui_stats_queries)
        self.ui_stats_thread_flag = True
        self.ui_stats_thread.start()
    self.log.info("Finished steps 1-4 successfully in setup")
    while self.loop < self.iterations:
        if self.loop > 0 or self.flush_buckets_before_indexes_creation:
            self.log.info("Reloading items to buckets")
            self.reload_data_into_buckets()
        ###############################################################
        self.log.info("Step 5: Rebalance in with Loading of docs")
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ###############################################################
        self.log.info("Step 6: Rebalance Out with Loading of docs")
        rebalance_task = self.rebalance(nodes_in=0, nodes_out=1)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ###############################################################
        self.log.info("Step 7: Rebalance In_Out with Loading of docs")
        rebalance_task = self.rebalance(nodes_in=2, nodes_out=1)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ###############################################################
        self.log.info("Step 8: Swap with Loading of docs")
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=1)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        ###############################################################
        self.log.info("Step 9: Updating the bucket replica to 2")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.bucket_util.buckets)):
            bucket_helper.change_bucket_props(self.bucket_util.buckets[i],
                                              replicaNumber=2)
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ###############################################################
        self.log.info(
            "Enabling autoreprovison before inducing failure to prevent data loss "
            "for if there are ephemeral buckets")
        status = self.rest.update_autoreprovision_settings(True, maxNodes=1)
        if not status:
            self.fail("Failed to enable autoreprovison")
        # Steps 10 and 11: kill memcached / prometheus and recover
        step_count = 9
        for action in [
                CouchbaseError.STOP_MEMCACHED, CouchbaseError.STOP_PROMETHEUS
        ]:
            step_count = step_count + 1
            self.log.info("Step {0}: {1}".format(step_count, action))
            # MAJORITY durability guards against loss while a process is down
            self.log.info("Forcing durability level: MAJORITY")
            self.durability_level = "MAJORITY"
            task = self.data_load_collection()
            self.induce_and_revert_failure(action)
            # Rebalance is required after error is reverted
            rebalance_task = self.task.async_rebalance(
                self.cluster.servers, [], [], retry_get_process_num=200)
            self.wait_for_rebalance_to_complete(rebalance_task)
            self.wait_for_async_data_load_to_complete(task)
            self.data_validation_collection()
            self.bucket_util.print_bucket_stats()
        self.durability_level = ""
        ###############################################################
        # Steps 12-17: failover matrix {Graceful, Hard} x
        # {RebalanceOut, FullRecovery, DeltaRecovery}
        step_count = 11
        for failover in ["Graceful", "Hard"]:
            for action in ["RebalanceOut", "FullRecovery", "DeltaRecovery"]:
                step_count = step_count + 1
                self.log.info(
                    "Step {0}: {1} Failover a node and {2} that node with data load in parallel"
                    .format(step_count, failover, action))
                self.std_vbucket_dist = self.input.param(
                    "std_vbucket_dist", None)
                std = self.std_vbucket_dist or 1.0
                kv_nodes = self.cluster_util.get_kv_nodes()
                self.log.info(
                    "Collecting pre_failover_stats. KV nodes are {0}".
                    format(kv_nodes))
                prev_failover_stats = self.bucket_util.get_failovers_logs(
                    kv_nodes, self.bucket_util.buckets)
                prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
                    kv_nodes, self.bucket_util.buckets)
                self.sleep(10)
                disk_replica_dataset, disk_active_dataset = self.bucket_util.get_and_compare_active_replica_data_set_all(
                    kv_nodes, self.bucket_util.buckets, path=None)
                self.rest = RestConnection(self.cluster.master)
                self.nodes = self.cluster_util.get_nodes(
                    self.cluster.master)
                self.chosen = self.cluster_util.pick_nodes(
                    self.cluster.master, howmany=1,
                    exclude_nodes=self.exclude_nodes)
                reset_flag = False
                if (not self.durability_level) and failover == "Hard":
                    # Force a durability level to prevent data loss during hard failover
                    self.log.info("Forcing durability level: MAJORITY")
                    self.durability_level = "MAJORITY"
                    reset_flag = True
                task = self.data_load_collection()
                if reset_flag:
                    self.durability_level = ""
                # Mark Node for failover
                if failover == "Graceful":
                    self.success_failed_over = self.rest.fail_over(
                        self.chosen[0].id, graceful=True)
                else:
                    self.success_failed_over = self.rest.fail_over(
                        self.chosen[0].id, graceful=False)
                self.sleep(300)
                self.wait_for_failover_or_assert(1)
                # Perform the action
                if action == "RebalanceOut":
                    self.nodes = self.rest.node_statuses()
                    self.rest.rebalance(
                        otpNodes=[node.id for node in self.nodes],
                        ejectedNodes=[self.chosen[0].id])
                    # self.sleep(600)
                    self.assertTrue(
                        self.rest.monitorRebalance(stop_if_loop=False),
                        msg="Rebalance failed")
                    servs_out = [
                        node for node in self.cluster.servers
                        if node.ip == self.chosen[0].ip
                    ]
                    self.cluster.nodes_in_cluster = list(
                        set(self.cluster.nodes_in_cluster) -
                        set(servs_out))
                    self.available_servers += servs_out
                    self.sleep(10)
                else:
                    if action == "FullRecovery":
                        if self.success_failed_over:
                            self.rest.set_recovery_type(
                                otpNode=self.chosen[0].id,
                                recoveryType="full")
                    elif action == "DeltaRecovery":
                        if self.success_failed_over:
                            self.rest.set_recovery_type(
                                otpNode=self.chosen[0].id,
                                recoveryType="delta")
                    rebalance_task = self.task.async_rebalance(
                        self.cluster.servers[:self.nodes_init], [], [],
                        retry_get_process_num=200)
                    self.wait_for_rebalance_to_complete(rebalance_task)
                    self.sleep(10)
                self.wait_for_async_data_load_to_complete(task)
                self.data_validation_collection()
                kv_nodes = self.cluster_util.get_kv_nodes()
                self.log.info(
                    "Collecting post_failover_stats. KV nodes are {0}".
                    format(kv_nodes))
                self.bucket_util.compare_failovers_logs(
                    prev_failover_stats, kv_nodes,
                    self.bucket_util.buckets)
                self.sleep(10)
                self.bucket_util.data_analysis_active_replica_all(
                    disk_active_dataset, disk_replica_dataset, kv_nodes,
                    self.bucket_util.buckets, path=None)
                self.bucket_util.vb_distribution_analysis(
                    servers=kv_nodes, buckets=self.bucket_util.buckets,
                    num_replicas=2, std=std,
                    total_vbuckets=self.cluster_util.vbuckets)
                self.sleep(10)
                self.tasks = []
                # Bring back the rebalance out node back to cluster for further steps
                if action == "RebalanceOut":
                    self.sleep(120)
                    rebalance_task = self.rebalance(nodes_in=1,
                                                    nodes_out=0)
                    # self.sleep(600)
                    self.wait_for_rebalance_to_complete(rebalance_task)
                self.bucket_util.print_bucket_stats()
        ###############################################################
        self.log.info("Step 18: Updating the bucket replica to 1")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.bucket_util.buckets)):
            bucket_helper.change_bucket_props(self.bucket_util.buckets[i],
                                              replicaNumber=1)
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers, [], [], retry_get_process_num=200)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        ###############################################################
        self.log.info(
            "Step 19: Flush bucket(s) and start the entire process again")
        self.loop += 1
        if self.loop < self.iterations:
            # Flush buckets(s)
            self.bucket_util.flush_all_buckets(
                self.cluster.master, skip_resetting_num_items=True)
            self.sleep(10)
            if len(self.cluster.nodes_in_cluster) > self.nodes_init:
                # Shrink the cluster back to its initial size before
                # running the next iteration
                self.nodes_cluster = self.cluster.nodes_in_cluster[:]
                self.nodes_cluster.remove(self.cluster.master)
                servs_out = random.sample(
                    self.nodes_cluster,
                    int(
                        len(self.cluster.nodes_in_cluster) -
                        self.nodes_init))
                rebalance_task = self.task.async_rebalance(
                    self.cluster.servers[:self.nodes_init], [], servs_out,
                    retry_get_process_num=200)
                self.wait_for_rebalance_to_complete(rebalance_task)
                self.available_servers += servs_out
                self.cluster.nodes_in_cluster = list(
                    set(self.cluster.nodes_in_cluster) - set(servs_out))
        else:
            if self.number_of_indexes > 0:
                # Join query thread
                self.query_thread_flag = False
                self.query_thread.join()
                self.query_thread = None
                # Join ui_stats thread
                self.ui_stats_thread_flag = False
                self.ui_stats_thread.join()
                self.ui_stats_thread = None
            self.log.info("Volume Test Run Complete")
def Volume(self):
    """Volume test over scopes/collections: build the cluster, create a
    scope with ``self.num_collections`` collections, then for each
    iteration run rebalance in/out/swap, replica updates, process kill
    and failover/recovery steps with data load and validation.
    """
    #######################################################################
    self.log.info("Step1: Create a n node cluster")
    if self.nodes_init > 1:
        nodes_init = self.cluster.servers[1:self.nodes_init]
        self.task.rebalance([self.cluster.master], nodes_init, [])
        self.cluster.nodes_in_cluster.extend(
            [self.cluster.master] + nodes_init)
    #######################################################################
    self.log.info("Step 2 & 3: Create required buckets.")
    self.bucket = self.create_required_buckets()
    self.loop = 0
    scope_name = "VolumeScope"
    collection_prefix = "VolumeCollection"
    self.bucket_util.create_scope(self.cluster.master, self.bucket,
                                  {"name": scope_name})
    for i in range(self.num_collections):
        collection_name = collection_prefix + str(i)
        self.log.info("Creating scope::collection '%s::%s'"
                      % (scope_name, collection_name))
        self.bucket_util.create_collection(self.cluster.master,
                                           self.bucket,
                                           scope_name,
                                           {"name": collection_name})
        self.sleep(2)
    #######################################################################
    while self.loop < self.iterations:
        self.log.info("Step 4: Pre-Requisites for Loading of docs")
        self.bucket_util.add_rbac_user()
        self.generate_docs(doc_ops="create")
        tasks_info = self.data_load(
            scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        for task in tasks_info:
            self.task.jython_task_manager.get_task_result(task)
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(self.bucket)
        self.create_perc = self.input.param("create_perc", 100)
        ###################################################################
        self.log.info("Step 5: Rebalance in with Loading of docs")
        self.generate_docs(doc_ops="create")
        self.set_num_writer_and_reader_threads(
            num_writer_threads="disk_io_optimized",
            num_reader_threads="disk_io_optimized")
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        tasks_info = self.data_load(
            scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.set_num_writer_and_reader_threads(
            num_writer_threads=self.new_num_writer_threads,
            num_reader_threads=self.new_num_reader_threads)
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result, "Rebalance Failed")
        self.data_validation(
            tasks_info, scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(self.bucket)
        ###################################################################
        self.log.info("Step 6: Rebalance Out with Loading of docs")
        self.generate_docs()
        self.set_num_writer_and_reader_threads(
            num_writer_threads="disk_io_optimized",
            num_reader_threads="disk_io_optimized")
        rebalance_task = self.rebalance(nodes_in=0, nodes_out=1)
        tasks_info = self.data_load(
            scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.set_num_writer_and_reader_threads(
            num_writer_threads=self.new_num_writer_threads,
            num_reader_threads=self.new_num_reader_threads)
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result, "Rebalance Failed")
        self.data_validation(
            tasks_info, scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(self.bucket)
        ###################################################################
        self.log.info("Step 7: Rebalance In_Out with Loading of docs")
        self.generate_docs()
        self.set_num_writer_and_reader_threads(
            num_writer_threads="disk_io_optimized",
            num_reader_threads="disk_io_optimized")
        rebalance_task = self.rebalance(nodes_in=2, nodes_out=1)
        tasks_info = self.data_load(
            scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.set_num_writer_and_reader_threads(
            num_writer_threads=self.new_num_writer_threads,
            num_reader_threads=self.new_num_reader_threads)
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result, "Rebalance Failed")
        self.data_validation(
            tasks_info, scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(self.bucket)
        ###################################################################
        self.log.info("Step 8: Swap with Loading of docs")
        self.generate_docs()
        self.set_num_writer_and_reader_threads(
            num_writer_threads="disk_io_optimized",
            num_reader_threads="disk_io_optimized")
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=1)
        tasks_info = self.data_load(
            scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.set_num_writer_and_reader_threads(
            num_writer_threads=self.new_num_writer_threads,
            num_reader_threads=self.new_num_reader_threads)
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result, "Rebalance Failed")
        self.data_validation(
            tasks_info, scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(self.bucket)
        ###################################################################
        self.log.info("Step 9: Updating the bucket replica to 2")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.bucket_util.buckets)):
            bucket_helper.change_bucket_props(self.bucket_util.buckets[i],
                                              replicaNumber=2)
        self.generate_docs()
        self.set_num_writer_and_reader_threads(
            num_writer_threads="disk_io_optimized",
            num_reader_threads="disk_io_optimized")
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        tasks_info = self.data_load(
            scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.set_num_writer_and_reader_threads(
            num_writer_threads=self.new_num_writer_threads,
            num_reader_threads=self.new_num_reader_threads)
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result, "Rebalance Failed")
        self.data_validation(
            tasks_info, scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(self.bucket)
        ###################################################################
        self.log.info("Step 10: Stopping and restarting memcached process")
        self.generate_docs()
        self.set_num_writer_and_reader_threads(
            num_writer_threads=self.new_num_writer_threads,
            num_reader_threads=self.new_num_reader_threads)
        rebalance_task = self.task.async_rebalance(self.cluster.servers,
                                                   [], [])
        tasks_info = self.data_load(
            scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.set_num_writer_and_reader_threads(
            num_writer_threads="disk_io_optimized",
            num_reader_threads="disk_io_optimized")
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result, "Rebalance Failed")
        self.stop_process()
        self.data_validation(
            tasks_info, scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(self.bucket)
        ###################################################################
        self.log.info(
            "Step 11: Failover a node and RebalanceOut that node \
            with loading in parallel")
        self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
        std = self.std_vbucket_dist or 1.0
        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        disk_replica_dataset, disk_active_dataset = self.bucket_util.\
            get_and_compare_active_replica_data_set_all(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets,
                path=None)
        self.rest = RestConnection(self.cluster.master)
        self.nodes = self.cluster_util.get_nodes(self.cluster.master)
        self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                   howmany=1)
        # Mark Node for failover
        self.generate_docs()
        tasks_info = self.data_load(
            scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                       graceful=True)
        self.sleep(10)
        self.rest.monitorRebalance()
        self.nodes = self.rest.node_statuses()
        self.set_num_writer_and_reader_threads(
            num_writer_threads=self.new_num_writer_threads,
            num_reader_threads=self.new_num_reader_threads)
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[self.chosen[0].id])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                        msg="Rebalance failed")
        servs_out = [
            node for node in self.cluster.servers
            if node.ip == self.chosen[0].ip
        ]
        self.cluster.nodes_in_cluster = list(
            set(self.cluster.nodes_in_cluster) - set(servs_out))
        self.available_servers += servs_out
        self.data_validation(
            tasks_info, scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.bucket_util.compare_failovers_logs(
            prev_failover_stats, self.cluster.nodes_in_cluster,
            self.bucket_util.buckets)
        self.bucket_util.data_analysis_active_replica_all(
            disk_active_dataset, disk_replica_dataset,
            self.cluster.servers[:self.nodes_in + self.nodes_init],
            self.bucket_util.buckets, path=None)
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
        self.bucket_util.vb_distribution_analysis(
            servers=nodes, buckets=self.bucket_util.buckets,
            num_replicas=2, std=std,
            total_vbuckets=self.cluster_util.vbuckets)
        # Bring a node back in so later steps have capacity
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(self.bucket)
        ###################################################################
        self.log.info("Step 12: Failover a node and FullRecovery\
            that node")
        self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
        std = self.std_vbucket_dist or 1.0
        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        disk_replica_dataset, disk_active_dataset = self.bucket_util.\
            get_and_compare_active_replica_data_set_all(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets,
                path=None)
        self.rest = RestConnection(self.cluster.master)
        self.nodes = self.cluster_util.get_nodes(self.cluster.master)
        self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                   howmany=1)
        self.generate_docs()
        tasks_info = self.data_load(
            scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        # Mark Node for failover
        self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                       graceful=True)
        self.sleep(10)
        self.rest.monitorRebalance()
        # Mark Node for full recovery
        if self.success_failed_over:
            self.rest.set_recovery_type(otpNode=self.chosen[0].id,
                                        recoveryType="full")
        self.set_num_writer_and_reader_threads(
            num_writer_threads=self.new_num_writer_threads,
            num_reader_threads=self.new_num_reader_threads)
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init], [], [])
        self.set_num_writer_and_reader_threads(
            num_writer_threads="disk_io_optimized",
            num_reader_threads="disk_io_optimized")
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result, "Rebalance Failed")
        self.data_validation(
            tasks_info, scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.bucket_util.compare_failovers_logs(
            prev_failover_stats, self.cluster.nodes_in_cluster,
            self.bucket_util.buckets)
        self.bucket_util.data_analysis_active_replica_all(
            disk_active_dataset, disk_replica_dataset,
            self.cluster.servers[:self.nodes_in + self.nodes_init],
            self.bucket_util.buckets, path=None)
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
        self.bucket_util.vb_distribution_analysis(
            servers=nodes, buckets=self.bucket_util.buckets,
            num_replicas=2, std=std,
            total_vbuckets=self.cluster_util.vbuckets)
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(self.bucket)
        ###################################################################
        self.log.info("Step 13: Failover a node and DeltaRecovery that \
            node with loading in parallel")
        self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
        std = self.std_vbucket_dist or 1.0
        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        disk_replica_dataset, disk_active_dataset = self.bucket_util.\
            get_and_compare_active_replica_data_set_all(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets,
                path=None)
        self.rest = RestConnection(self.cluster.master)
        self.nodes = self.cluster_util.get_nodes(self.cluster.master)
        self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                   howmany=1)
        self.generate_docs()
        tasks_info = self.data_load(
            scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        # Mark Node for failover
        self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                       graceful=True)
        self.sleep(10)
        self.rest.monitorRebalance()
        if self.success_failed_over:
            self.rest.set_recovery_type(otpNode=self.chosen[0].id,
                                        recoveryType="delta")
        self.set_num_writer_and_reader_threads(
            num_writer_threads=self.new_num_writer_threads,
            num_reader_threads=self.new_num_reader_threads)
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init], [], [])
        self.set_num_writer_and_reader_threads(
            num_writer_threads="disk_io_optimized",
            num_reader_threads="disk_io_optimized")
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result, "Rebalance Failed")
        self.data_validation(
            tasks_info, scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.bucket_util.compare_failovers_logs(
            prev_failover_stats, self.cluster.nodes_in_cluster,
            self.bucket_util.buckets)
        self.bucket_util.data_analysis_active_replica_all(
            disk_active_dataset, disk_replica_dataset,
            self.cluster.servers[:self.nodes_in + self.nodes_init],
            self.bucket_util.buckets, path=None)
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
        self.bucket_util.vb_distribution_analysis(
            servers=nodes, buckets=self.bucket_util.buckets,
            num_replicas=2, std=std,
            total_vbuckets=self.cluster_util.vbuckets)
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(self.bucket)
        #######################################################################
        self.log.info("Step 14: Updating the bucket replica to 1")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.bucket_util.buckets)):
            bucket_helper.change_bucket_props(self.bucket_util.buckets[i],
                                              replicaNumber=1)
        self.generate_docs()
        self.set_num_writer_and_reader_threads(
            num_writer_threads=self.new_num_writer_threads,
            num_reader_threads=self.new_num_reader_threads)
        rebalance_task = self.task.async_rebalance(self.cluster.servers,
                                                   [], [])
        tasks_info = self.data_load(
            scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.set_num_writer_and_reader_threads(
            num_writer_threads="disk_io_optimized",
            num_reader_threads="disk_io_optimized")
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result, "Rebalance Failed")
        self.data_validation(
            tasks_info, scope=scope_name,
            collections=self.bucket.scopes[scope_name].collections.keys())
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(self.bucket)
        #######################################################################
        self.log.info("Step 15: Flush the bucket and \
            start the entire process again")
        self.loop += 1
        if self.loop < self.iterations:
            # Flush the bucket
            self.bucket_util.flush_all_buckets(self.cluster.master)
            self.sleep(10)
            if len(self.cluster.nodes_in_cluster) > self.nodes_init:
                # Shrink cluster back to initial size for next iteration
                nodes_cluster = self.cluster.nodes_in_cluster[:]
                nodes_cluster.remove(self.cluster.master)
                servs_out = random.sample(
                    nodes_cluster,
                    int(
                        len(self.cluster.nodes_in_cluster) -
                        self.nodes_init))
                rebalance_task = self.task.async_rebalance(
                    self.cluster.servers[:self.nodes_init], [], servs_out)
                self.task.jython_task_manager.get_task_result(
                    rebalance_task)
                self.available_servers += servs_out
                self.cluster.nodes_in_cluster = list(
                    set(self.cluster.nodes_in_cluster) - set(servs_out))
            self.get_bucket_dgm(self.bucket)
        else:
            self.log.info("Volume Test Run Complete")
            self.get_bucket_dgm(self.bucket)
def test_volume_taf(self):
    """Volume test: loop rebalance/failover scenarios with doc loads.

    Each iteration runs steps 5-18: rebalance in/out/in-out/swap,
    replica bump to 2, memcached restart, a Graceful/Hard failover
    matrix crossed with RebalanceOut/FullRecovery/DeltaRecovery,
    replica back to 1, then bucket flush before the next iteration.

    Doc loading is gated by ``self.data_load_stage``:
    "before" -> synchronous load before each rebalance,
    "during" -> async load in parallel with the rebalance.
    """
    self.loop = 0
    # self.set_metadata_purge_interval()
    while self.loop < self.iterations:
        self.log.info("Finished steps 1-4 successfully in setup")
        self.log.info("Step 5: Rebalance in with Loading of docs")
        if self.data_load_stage == "before":
            task = self.data_load_collection(async_load=False)
            if task.result is False:
                self.fail("Doc loading failed")
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        if self.data_load_stage == "during":
            task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        if self.data_load_stage == "during":
            self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info("Step 6: Rebalance Out with Loading of docs")
        if self.data_load_stage == "before":
            task = self.data_load_collection(async_load=False)
            if task.result is False:
                self.fail("Doc loading failed")
        rebalance_task = self.rebalance(nodes_in=0, nodes_out=1)
        if self.data_load_stage == "during":
            task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        if self.data_load_stage == "during":
            self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info("Step 7: Rebalance In_Out with Loading of docs")
        if self.data_load_stage == "before":
            task = self.data_load_collection(async_load=False)
            if task.result is False:
                self.fail("Doc loading failed")
        rebalance_task = self.rebalance(nodes_in=2, nodes_out=1)
        if self.data_load_stage == "during":
            task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        if self.data_load_stage == "during":
            self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info("Step 8: Swap with Loading of docs")
        if self.data_load_stage == "before":
            task = self.data_load_collection(async_load=False)
            if task.result is False:
                self.fail("Doc loading failed")
        # Swap rebalance: one node in, one node out in the same rebalance
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=1)
        if self.data_load_stage == "during":
            task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        if self.data_load_stage == "during":
            self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info("Step 9: Updating the bucket replica to 2")
        if self.data_load_stage == "before":
            task = self.data_load_collection(async_load=False)
            if task.result is False:
                self.fail("Doc loading failed")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.bucket_util.buckets)):
            bucket_helper.change_bucket_props(self.bucket_util.buckets[i],
                                              replicaNumber=2)
        # A rebalance is required for the replica change to take effect
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        if self.data_load_stage == "during":
            task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        if self.data_load_stage == "during":
            self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ################################################################
        if self.contains_ephemeral:
            # Ephemeral buckets lose data on memcached restart, so skip
            self.log.info("No Memcached kill for ephemeral bucket")
        else:
            self.log.info(
                "Step 10: Stopping and restarting memcached process")
            if self.data_load_stage == "before":
                task = self.data_load_collection(async_load=False)
                if task.result is False:
                    self.fail("Doc loading failed")
            # No-op rebalance (no nodes in/out) running while memcached
            # is killed, to exercise rebalance resilience
            rebalance_task = self.task.async_rebalance(
                self.cluster.servers, [], [], retry_get_process_num=100)
            if self.data_load_stage == "during":
                task = self.data_load_collection()
            self.wait_for_rebalance_to_complete(rebalance_task)
            self.stop_process()
            if self.data_load_stage == "during":
                self.wait_for_async_data_load_to_complete(task)
            self.data_validation_collection()
            self.bucket_util.print_bucket_stats()
        ################################################################
        # Steps 11-16: every combination of failover type x recovery
        # action, each with data load in parallel
        step_count = 10
        for failover in ["Graceful", "Hard"]:
            for action in [
                    "RebalanceOut", "FullRecovery", "DeltaRecovery"
            ]:
                step_count = step_count + 1
                self.log.info(
                    "Step {0}: {1} Failover a node and {2} that node with data load in parallel"
                    .format(step_count, failover, action))
                if self.data_load_stage == "before":
                    task = self.data_load_collection(async_load=False)
                    if task.result is False:
                        self.fail("Doc loading failed")
                self.std_vbucket_dist = self.input.param(
                    "std_vbucket_dist", None)
                std = self.std_vbucket_dist or 1.0
                # Snapshot stats before failover for later comparison
                prev_failover_stats = self.bucket_util.get_failovers_logs(
                    self.cluster.nodes_in_cluster, self.bucket_util.buckets)
                prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
                    self.cluster.nodes_in_cluster, self.bucket_util.buckets)
                self.sleep(10)
                disk_replica_dataset, disk_active_dataset = self.bucket_util.get_and_compare_active_replica_data_set_all(
                    self.cluster.nodes_in_cluster,
                    self.bucket_util.buckets,
                    path=None)
                self.rest = RestConnection(self.cluster.master)
                self.nodes = self.cluster_util.get_nodes(
                    self.cluster.master)
                self.chosen = self.cluster_util.pick_nodes(
                    self.cluster.master, howmany=1)
                if self.data_load_stage == "during":
                    task = self.data_load_collection()
                # Mark Node for failover
                if failover == "Graceful":
                    self.success_failed_over = self.rest.fail_over(
                        self.chosen[0].id, graceful=True)
                else:
                    self.success_failed_over = self.rest.fail_over(
                        self.chosen[0].id, graceful=False)
                # NOTE(review): fixed 300s wait for failover to settle
                self.sleep(300)
                self.wait_for_failover_or_assert(1)
                # Perform the action
                if action == "RebalanceOut":
                    # Eject the failed-over node from the cluster
                    self.nodes = self.rest.node_statuses()
                    self.rest.rebalance(
                        otpNodes=[node.id for node in self.nodes],
                        ejectedNodes=[self.chosen[0].id])
                    # self.sleep(600)
                    self.assertTrue(
                        self.rest.monitorRebalance(stop_if_loop=False),
                        msg="Rebalance failed")
                    servs_out = [
                        node for node in self.cluster.servers
                        if node.ip == self.chosen[0].ip
                    ]
                    self.cluster.nodes_in_cluster = list(
                        set(self.cluster.nodes_in_cluster) - set(servs_out))
                    self.available_servers += servs_out
                    self.sleep(10)
                else:
                    # Add the node back with the requested recovery type,
                    # then rebalance to reintegrate it
                    if action == "FullRecovery":
                        if self.success_failed_over:
                            self.rest.set_recovery_type(
                                otpNode=self.chosen[0].id,
                                recoveryType="full")
                    elif action == "DeltaRecovery":
                        if self.success_failed_over:
                            self.rest.set_recovery_type(
                                otpNode=self.chosen[0].id,
                                recoveryType="delta")
                    rebalance_task = self.task.async_rebalance(
                        self.cluster.servers[:self.nodes_init], [], [],
                        retry_get_process_num=100)
                    self.wait_for_rebalance_to_complete(rebalance_task)
                    self.sleep(10)
                if self.data_load_stage == "during":
                    self.wait_for_async_data_load_to_complete(task)
                self.data_validation_collection()
                self.bucket_util.compare_failovers_logs(
                    prev_failover_stats, self.cluster.nodes_in_cluster,
                    self.bucket_util.buckets)
                self.sleep(10)
                self.bucket_util.data_analysis_active_replica_all(
                    disk_active_dataset, disk_replica_dataset,
                    self.cluster.servers[:self.nodes_in + self.nodes_init],
                    self.bucket_util.buckets,
                    path=None)
                nodes = self.cluster_util.get_nodes_in_cluster(
                    self.cluster.master)
                self.bucket_util.vb_distribution_analysis(
                    servers=nodes,
                    buckets=self.bucket_util.buckets,
                    num_replicas=2,
                    std=std,
                    total_vbuckets=self.cluster_util.vbuckets)
                self.sleep(10)
                self.tasks = []
                # Bring back the rebalance out node back to cluster for further steps
                if action == "RebalanceOut":
                    self.sleep(120)
                    rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
                    # self.sleep(600)
                    self.wait_for_rebalance_to_complete(rebalance_task)
                self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info("Step 17: Updating the bucket replica to 1")
        if self.data_load_stage == "before":
            task = self.data_load_collection(async_load=False)
            if task.result is False:
                self.fail("Doc loading failed")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.bucket_util.buckets)):
            bucket_helper.change_bucket_props(self.bucket_util.buckets[i],
                                              replicaNumber=1)
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers, [], [], retry_get_process_num=100)
        if self.data_load_stage == "during":
            task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        if self.data_load_stage == "during":
            self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info(
            "Step 18: Flush the bucket and start the entire process again")
        self.loop += 1
        if self.loop < self.iterations:
            # Flush the bucket
            self.bucket_util.flush_all_buckets(self.cluster.master)
            self.sleep(10)
            # Shrink the cluster back to nodes_init before the next loop
            if len(self.cluster.nodes_in_cluster) > self.nodes_init:
                self.nodes_cluster = self.cluster.nodes_in_cluster[:]
                self.nodes_cluster.remove(self.cluster.master)
                servs_out = random.sample(
                    self.nodes_cluster,
                    int(
                        len(self.cluster.nodes_in_cluster) -
                        self.nodes_init))
                rebalance_task = self.task.async_rebalance(
                    self.cluster.servers[:self.nodes_init], [], servs_out,
                    retry_get_process_num=100)
                self.wait_for_rebalance_to_complete(rebalance_task)
                self.available_servers += servs_out
                self.cluster.nodes_in_cluster = list(
                    set(self.cluster.nodes_in_cluster) - set(servs_out))
        else:
            self.log.info("Volume Test Run Complete")
def test_volume_taf(self):
    """Volume test: loop explicit rebalance/failover steps 5-15.

    Unlike the gated variant, every step here always runs an async
    doc load in parallel with the cluster operation, then waits for
    both and validates the data. Steps: rebalance in/out/in-out/swap,
    replica bump to 2, memcached restart, hard failover with
    RebalanceOut / FullRecovery / DeltaRecovery, replica back to 1,
    and a bucket flush before the next iteration.
    """
    self.loop = 0
    while self.loop < self.iterations:
        self.log.info("Finished steps 1-4 successfully in setup")
        self.log.info("Step 5: Rebalance in with Loading of docs")
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        task = self.data_load_collection()
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result,
                        "rebalance failed, stuck or did not complete")
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info("Step 6: Rebalance Out with Loading of docs")
        rebalance_task = self.rebalance(nodes_in=0, nodes_out=1)
        task = self.data_load_collection()
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result,
                        "rebalance failed, stuck or did not complete")
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info("Step 7: Rebalance In_Out with Loading of docs")
        rebalance_task = self.rebalance(nodes_in=2, nodes_out=1)
        task = self.data_load_collection()
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result,
                        "rebalance failed, stuck or did not complete")
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info("Step 8: Swap with Loading of docs")
        # Swap rebalance: one node in, one node out in the same rebalance
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=1)
        task = self.data_load_collection()
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result,
                        "rebalance failed, stuck or did not complete")
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info("Step 9: Updating the bucket replica to 2")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.bucket_util.buckets)):
            bucket_helper.change_bucket_props(self.bucket_util.buckets[i],
                                              replicaNumber=2)
        # A rebalance is required for the replica change to take effect
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        task = self.data_load_collection()
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result,
                        "rebalance failed, stuck or did not complete")
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.print_bucket_stats()
        ################################################################
        if "ephemeral" in self.bucket_type:
            # Ephemeral buckets lose data on memcached restart, so skip.
            # NOTE(review): "epehemral" typo in the log message below
            self.log.info("No Memcached kill for epehemral bucket")
        else:
            self.log.info(
                "Step 10: Stopping and restarting memcached process")
            # No-op rebalance (no nodes in/out) running while memcached
            # is killed, to exercise rebalance resilience
            rebalance_task = self.task.async_rebalance(
                self.cluster.servers, [], [])
            task = self.data_load_collection()
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result,
                            "rebalance failed, stuck or did not complete")
            self.stop_process()
            self.wait_for_async_data_load_to_complete(task)
            self.data_validation_collection()
            self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info(
            "Step 11: Failover a node and RebalanceOut that node with loading in parallel"
        )
        self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
        std = self.std_vbucket_dist or 1.0
        # Snapshot stats before failover for later comparison
        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        self.sleep(10)
        disk_replica_dataset, disk_active_dataset = self.bucket_util.get_and_compare_active_replica_data_set_all(
            self.cluster.nodes_in_cluster,
            self.bucket_util.buckets,
            path=None)
        self.rest = RestConnection(self.cluster.master)
        self.nodes = self.cluster_util.get_nodes(self.cluster.master)
        self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                   howmany=1)
        # Mark Node for failover
        task = self.data_load_collection()
        self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                       graceful=False)
        # NOTE(review): fixed 300s wait for failover to settle
        self.sleep(300)
        # Eject the failed-over node from the cluster
        self.nodes = self.rest.node_statuses()
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[self.chosen[0].id])
        # self.sleep(600)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                        msg="Rebalance failed")
        servs_out = [
            node for node in self.cluster.servers
            if node.ip == self.chosen[0].ip
        ]
        self.cluster.nodes_in_cluster = list(
            set(self.cluster.nodes_in_cluster) - set(servs_out))
        self.available_servers += servs_out
        self.sleep(10)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.compare_failovers_logs(
            prev_failover_stats, self.cluster.nodes_in_cluster,
            self.bucket_util.buckets)
        self.sleep(10)
        self.bucket_util.data_analysis_active_replica_all(
            disk_active_dataset, disk_replica_dataset,
            self.cluster.servers[:self.nodes_in + self.nodes_init],
            self.bucket_util.buckets,
            path=None)
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
        self.bucket_util.vb_distribution_analysis(
            servers=nodes,
            buckets=self.bucket_util.buckets,
            num_replicas=2,
            std=std,
            total_vbuckets=self.cluster_util.vbuckets)
        self.sleep(10)
        self.tasks = []
        # Bring the removed node back so the cluster size is restored
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        # self.sleep(600)
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result,
                        "rebalance failed, stuck or did not complete")
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info(
            "Step 12: Failover a node and FullRecovery that node")
        self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
        std = self.std_vbucket_dist or 1.0
        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        self.sleep(10)
        disk_replica_dataset, disk_active_dataset = self.bucket_util.get_and_compare_active_replica_data_set_all(
            self.cluster.nodes_in_cluster,
            self.bucket_util.buckets,
            path=None)
        self.rest = RestConnection(self.cluster.master)
        self.nodes = self.cluster_util.get_nodes(self.cluster.master)
        self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                   howmany=1)
        task = self.data_load_collection()
        # Mark Node for failover
        self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                       graceful=False)
        self.sleep(300)
        # Mark Node for full recovery
        if self.success_failed_over:
            self.rest.set_recovery_type(otpNode=self.chosen[0].id,
                                        recoveryType="full")
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init], [], [])
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result,
                        "rebalance failed, stuck or did not complete")
        self.sleep(10)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.compare_failovers_logs(
            prev_failover_stats, self.cluster.nodes_in_cluster,
            self.bucket_util.buckets)
        self.sleep(10)
        self.bucket_util.data_analysis_active_replica_all(
            disk_active_dataset, disk_replica_dataset,
            self.cluster.servers[:self.nodes_in + self.nodes_init],
            self.bucket_util.buckets,
            path=None)
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
        self.bucket_util.vb_distribution_analysis(
            servers=nodes,
            buckets=self.bucket_util.buckets,
            num_replicas=2,
            std=std,
            total_vbuckets=self.cluster_util.vbuckets)
        self.sleep(10)
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info(
            "Step 13: Failover a node and DeltaRecovery that node with loading in parallel"
        )
        self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
        std = self.std_vbucket_dist or 1.0
        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        self.sleep(10)
        disk_replica_dataset, disk_active_dataset = self.bucket_util.get_and_compare_active_replica_data_set_all(
            self.cluster.nodes_in_cluster,
            self.bucket_util.buckets,
            path=None)
        self.rest = RestConnection(self.cluster.master)
        self.nodes = self.cluster_util.get_nodes(self.cluster.master)
        self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                   howmany=1)
        task = self.data_load_collection()
        # Mark Node for failover
        self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                       graceful=False)
        self.sleep(300)
        # Delta recovery reuses the node's existing data on add-back
        if self.success_failed_over:
            self.rest.set_recovery_type(otpNode=self.chosen[0].id,
                                        recoveryType="delta")
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init], [], [])
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result,
                        "rebalance failed, stuck or did not complete")
        self.sleep(10)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.bucket_util.compare_failovers_logs(
            prev_failover_stats, self.cluster.nodes_in_cluster,
            self.bucket_util.buckets)
        self.sleep(10)
        self.bucket_util.data_analysis_active_replica_all(
            disk_active_dataset, disk_replica_dataset,
            self.cluster.servers[:self.nodes_in + self.nodes_init],
            self.bucket_util.buckets,
            path=None)
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
        self.bucket_util.vb_distribution_analysis(
            servers=nodes,
            buckets=self.bucket_util.buckets,
            num_replicas=2,
            std=std,
            total_vbuckets=self.cluster_util.vbuckets)
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info("Step 14: Updating the bucket replica to 1")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.bucket_util.buckets)):
            bucket_helper.change_bucket_props(self.bucket_util.buckets[i],
                                              replicaNumber=1)
        rebalance_task = self.task.async_rebalance(self.cluster.servers,
                                                   [], [])
        task = self.data_load_collection()
        self.task.jython_task_manager.get_task_result(rebalance_task)
        self.assertTrue(rebalance_task.result,
                        "rebalance failed, stuck or did not complete")
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        ################################################################
        self.log.info(
            "Step 15: Flush the bucket and start the entire process again")
        self.loop += 1
        if self.loop < self.iterations:
            # Flush the bucket
            self.bucket_util.flush_all_buckets(self.cluster.master)
            self.sleep(10)
            # Shrink the cluster back to nodes_init before the next loop
            if len(self.cluster.nodes_in_cluster) > self.nodes_init:
                self.nodes_cluster = self.cluster.nodes_in_cluster[:]
                self.nodes_cluster.remove(self.cluster.master)
                servs_out = random.sample(
                    self.nodes_cluster,
                    int(
                        len(self.cluster.nodes_in_cluster) -
                        self.nodes_init))
                rebalance_task = self.task.async_rebalance(
                    self.cluster.servers[:self.nodes_init], [], servs_out)
                self.task.jython_task_manager.get_task_result(
                    rebalance_task)
                self.available_servers += servs_out
                self.cluster.nodes_in_cluster = list(
                    set(self.cluster.nodes_in_cluster) - set(servs_out))
                self.assertTrue(
                    rebalance_task.result,
                    "rebalance failed, stuck or did not complete")
        else:
            self.log.info("Volume Test Run Complete")
def common_test_body(self, failover_reason, rebalance_type=None):
    """
    Main Test body which contains the flow of the failover basic steps
    1. Starts Operations if programmed into the test case(before/after)
    2. Start View and Index Building operations
    3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
    4.1 Rebalance the cluster is failover of K nodeStatuses
    4.2 Run Add-Back operation with recoveryType = (full/delta)
        with rebalance
    5. Verify all expected operations completed by checking
       stats, replicaiton, views, data correctness

    :param failover_reason: reason string passed through to the
        failover-operation helpers
    :param rebalance_type: forwarded to add-back verification
        (None by default)
    """
    # Pick the reference node for communication
    # We pick a node in the cluster which will NOT be failed over
    self.filter_list = []
    if self.failoverMaster:
        self.master = self.cluster.servers[1]
    else:
        self.master = self.cluster.master
    self.log.info(
        " Picking node {0} as reference node for test case".format(
            self.master.ip))
    self.print_test_params(failover_reason)
    self.rest = RestConnection(self.master)
    self.nodes = self.rest.node_statuses()
    # Set the data path for the cluster
    self.data_path = self.rest.get_data_path()
    # Variable to decide the durability outcome
    durability_will_fail = False
    # Variable to track the number of nodes failed
    num_nodes_failed = 1
    # Check if the test case has to be run for 3.0.0
    versions = self.rest.get_nodes_versions()
    self.version_greater_than_2_5 = True
    for version in versions:
        # NOTE(review): string comparison against version strings --
        # assumes versions are formatted so "3" > version detects < 3.x
        if "3" > version:
            self.version_greater_than_2_5 = False
    # Do not run this this test if graceful category is being used
    if not self.version_greater_than_2_5 \
            and (self.graceful or self.recoveryType is not None):
        self.log.error(
            "Can't apply graceful failover to nodes with version < 3.*")
        self.log.error("Please check configuration params: SKIPPING TEST")
        return
    # Find nodes that will under go failover
    if self.failoverMaster:
        self.chosen = RebalanceHelper.pick_nodes(
            self.master, howmany=1, target_node=self.servers[0])
    else:
        self.chosen = RebalanceHelper.pick_nodes(
            self.master, howmany=self.num_failed_nodes)
    # Perform operations - Create/Update/Delete
    # self.withMutationOps = True => Run Operations in parallel to failover
    # self.withMutationOps = False => Run Operations Before failover
    self.load_initial_data()
    if not self.withMutationOps:
        self.run_mutation_operations()
    # Perform View Creation Tasks and
    # check for completion if required before failover
    if self.withViewsOps:
        self.run_view_creation_operations(self.servers)
        if not self.createIndexesDuringFailover:
            self.query_and_monitor_view_tasks(self.servers)
    # Take snap-shot of data set used for validation
    record_static_data_set = {}
    prev_vbucket_stats = {}
    prev_failover_stats = {}
    if not self.withMutationOps:
        record_static_data_set = self.bucket_util.get_data_set_all(
            self.cluster.servers, self.buckets, path=None)
    # Capture vbucket and failover stats if test version >= 2.5.*
    if self.version_greater_than_2_5 and self.upr_check:
        prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
            self.servers, self.buckets)
        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.servers, self.buckets)
    # Perform Operations related to failover
    if self.withMutationOps or self.withViewsOps or self.compact:
        self.run_failover_operations_with_ops(self.chosen, failover_reason)
    else:
        self.run_failover_operations(self.chosen, failover_reason)
    target_bucket = self.bucket_util.buckets[0]
    # Update new_replica value, if provided in the conf
    if self.new_replica:
        self.num_replicas = self.new_replica
        bucket_helper = BucketHelper(self.master)
        bucket_helper.change_bucket_props(target_bucket.name,
                                          replicaNumber=self.num_replicas)
    # Decide whether the durability is going to fail or not
    if self.num_failed_nodes >= 1 and self.num_replicas > 1:
        durability_will_fail = True
    # Construct target vbucket list from the nodes
    # which are going to be failed over
    vbucket_list = list()
    for target_node in self.chosen:
        shell_conn = RemoteMachineShellConnection(target_node)
        cb_stats = Cbstats(shell_conn)
        vbuckets = cb_stats.vbucket_list(target_bucket.name,
                                         self.target_vbucket_type)
        shell_conn.disconnect()
        vbucket_list += vbuckets
    # Code to generate doc_loaders that will work on vbucket_type
    # based on targeted nodes. This will perform CRUD only on
    # vbuckets which will be affected by the failover
    self.gen_create = doc_generator(self.key, self.num_items,
                                    self.num_items * 1.5,
                                    target_vbucket=vbucket_list)
    self.gen_update = doc_generator(self.key, self.num_items / 2,
                                    self.num_items,
                                    target_vbucket=vbucket_list)
    self.gen_delete = doc_generator(self.key, self.num_items / 4,
                                    self.num_items / 2 - 1,
                                    target_vbucket=vbucket_list)
    self.afterfailover_gen_create = doc_generator(
        self.key, self.num_items * 1.6, self.num_items * 2,
        target_vbucket=vbucket_list)
    self.afterfailover_gen_update = doc_generator(
        self.key, 1, self.num_items / 4, target_vbucket=vbucket_list)
    self.afterfailover_gen_delete = doc_generator(
        self.key, self.num_items * 0.5, self.num_items * 0.75,
        target_vbucket=vbucket_list)
    # Perform Add Back Operation with Rebalance
    # or only Rebalance with verifications
    if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
        if self.failover_onebyone:
            # Reset it back to False
            durability_will_fail = False
            for node_chosen in self.chosen:
                # Durability can only fail once more than one node is down
                if num_nodes_failed > 1:
                    durability_will_fail = True
                if self.add_back_flag:
                    # In add-back case, durability should never fail, since
                    # the num_nodes in the cluster will remain the same
                    self.run_add_back_operation_and_verify(
                        [node_chosen], prev_vbucket_stats,
                        record_static_data_set, prev_failover_stats,
                        rebalance_type=rebalance_type)
                else:
                    self.run_rebalance_after_failover_and_verify(
                        [node_chosen], prev_vbucket_stats,
                        record_static_data_set, prev_failover_stats,
                        durability_will_fail=durability_will_fail)
                num_nodes_failed += 1
        else:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(
                    self.chosen, prev_vbucket_stats,
                    record_static_data_set, prev_failover_stats,
                    durability_will_fail=durability_will_fail,
                    rebalance_type=rebalance_type)
            else:
                self.run_rebalance_after_failover_and_verify(
                    self.chosen, prev_vbucket_stats,
                    record_static_data_set, prev_failover_stats,
                    durability_will_fail=durability_will_fail)
    else:
        return
    # Will verify_unacked_bytes only if the durability is not going to fail
    if self.during_ops is None and not durability_will_fail:
        self.bucket_util.verify_unacked_bytes_all_buckets(
            filter_list=self.filter_list)
def test_volume_taf(self):
    """Long-running volume test driving rebalance/failover scenarios with
    document loading running in parallel.

    Per iteration of the main loop: initial doc load (Step 4); rebalance-in,
    rebalance-out, rebalance in+out, swap rebalance (Steps 5-8); bucket
    replica bump to 2 (Step 9); memcached restart (Step 10, couchbase
    buckets only); hard failover followed by rebalance-out / full recovery /
    delta recovery (Steps 11-13); replica drop back to 1 (Step 14); bucket
    flush before the next iteration (Step 15).

    Relies on state prepared by setUp/helpers: ``self.cluster``,
    ``self.rest``, ``self.atomicity``, ``self.iterations`` and the
    data-load helpers (``generate_docs``/``data_load``/``data_validation_mode``).
    """
    ########################################################################
    self.log.info("Step1: Create a n node cluster")
    # Rebalance-in the remaining initial nodes (master is already a member)
    nodes_init = self.cluster.servers[1:self.nodes_init] if self.nodes_init != 1 else []
    self.task.rebalance([self.cluster.master], nodes_init, [])
    self.cluster.nodes_in_cluster.extend([self.cluster.master] + nodes_init)
    self.query_node = self.cluster.master
    ########################################################################
    self.log.info("Step 2 & 3: Create required buckets.")
    bucket = self.create_required_buckets()
    self.loop = 0
    ########################################################################
    while self.loop < self.iterations:
        self.log.info("Step 4: Pre-Requisites for Loading of docs")
        self.start = 0
        self.bucket_util.add_rbac_user()
        self.end = self.initial_load_count = self.input.param("initial_load", 1000)
        initial_load = doc_generator("Users", self.start,
                                     self.start + self.initial_load_count,
                                     doc_size=self.doc_size)
        self.initial_data_load(initial_load)
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        self.get_bucket_dgm(bucket)
        ####################################################################
        self.log.info("Step 5: Rebalance in with Loading of docs")
        self.generate_docs()
        self.gen_delete_users = None
        self._iter_count = 0
        # Steps 5-9 pattern: switch bucket writer/reader threads to the
        # "disk_io_optimized" preset before starting the rebalance, kick off
        # the parallel CRUD load, then restore the configured thread counts.
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        tasks_info = self.data_load()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
        # self.sleep(600, "Wait for Rebalance to start")
        self.task.jython_task_manager.get_task_result(rebalance_task)
        reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        self.data_validation_mode(tasks_info)
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(bucket)
        ####################################################################
        self.log.info("Step 6: Rebalance Out with Loading of docs")
        self.generate_docs()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
        rebalance_task = self.rebalance(nodes_in=0, nodes_out=1)
        tasks_info = self.data_load()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
        # self.sleep(600, "Wait for Rebalance to start")
        self.task.jython_task_manager.get_task_result(rebalance_task)
        reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        self.data_validation_mode(tasks_info)
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(bucket)
        ####################################################################
        self.log.info("Step 7: Rebalance In_Out with Loading of docs")
        self.generate_docs()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
        rebalance_task = self.rebalance(nodes_in=2, nodes_out=1)
        tasks_info = self.data_load()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
        # self.sleep(600, "Wait for Rebalance to start")
        self.task.jython_task_manager.get_task_result(rebalance_task)
        reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        self.data_validation_mode(tasks_info)
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(bucket)
        ####################################################################
        self.log.info("Step 8: Swap with Loading of docs")
        self.generate_docs()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
        # nodes_in == nodes_out == 1 -> swap rebalance
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=1)
        tasks_info = self.data_load()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
        # self.sleep(600, "Wait for Rebalance to start")
        self.task.jython_task_manager.get_task_result(rebalance_task)
        reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        self.data_validation_mode(tasks_info)
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(bucket)
        ####################################################################
        self.log.info("Step 9: Updating the bucket replica to 2")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.bucket_util.buckets)):
            bucket_helper.change_bucket_props(
                self.bucket_util.buckets[i], replicaNumber=2)
        self.generate_docs()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
        # Rebalance-in one node to host the extra replica copies
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        tasks_info = self.data_load()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
        # self.sleep(600, "Wait for Rebalance to start")
        self.task.jython_task_manager.get_task_result(rebalance_task)
        reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        self.data_validation_mode(tasks_info)
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(bucket)
        ####################################################################
        if "ephemeral" in self.bucket_type:
            self.log.info("No Memcached kill for epehemral bucket")
        else:
            self.log.info("Step 10: Stopping and restarting memcached process")
            self.generate_docs()
            # NOTE(review): thread-setting order here is inverted relative to
            # Steps 5-9 (configured counts first, disk_io_optimized after the
            # load starts) — confirm whether this is intentional.
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads=self.new_num_writer_threads,
                    num_reader_threads=self.new_num_reader_threads)
            rebalance_task = self.task.async_rebalance(self.cluster.servers, [], [])
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads="disk_io_optimized",
                    num_reader_threads="disk_io_optimized")
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached, "rebalance failed, stuck or did not complete")
            # Kill/restart memcached only after the rebalance settled
            self.stop_process()
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
        ####################################################################
        self.log.info("Step 11: Failover a node and RebalanceOut that node with loading in parallel")
        self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
        std = self.std_vbucket_dist or 1.0
        # Snapshot failover logs / vbucket seqnos / on-disk datasets so the
        # post-failover state can be compared against them.
        # NOTE(review): prev_vbucket_stats appears to be collected but never
        # consumed in this method — verify.
        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        self.sleep(10)
        disk_replica_dataset, disk_active_dataset = self.bucket_util.get_and_compare_active_replica_data_set_all(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets, path=None)
        self.rest = RestConnection(self.cluster.master)
        self.nodes = self.cluster_util.get_nodes(self.cluster.master)
        self.chosen = self.cluster_util.pick_nodes(self.cluster.master, howmany=1)
        # Mark Node for failover
        self.generate_docs()
        tasks_info = self.data_load()
        self.success_failed_over = self.rest.fail_over(self.chosen[0].id, graceful=False)
        # presumably allows the failover to settle before rebalancing — confirm
        self.sleep(300)
        self.nodes = self.rest.node_statuses()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
        # Rebalance the failed-over node out of the cluster
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[self.chosen[0].id])
        # self.sleep(600)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                        msg="Rebalance failed")
        servs_out = [node for node in self.cluster.servers
                     if node.ip == self.chosen[0].ip]
        self.cluster.nodes_in_cluster = list(set(self.cluster.nodes_in_cluster) - set(servs_out))
        self.available_servers += servs_out
        self.sleep(10)
        self.data_validation_mode(tasks_info)
        self.bucket_util.compare_failovers_logs(
            prev_failover_stats, self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        self.sleep(10)
        self.bucket_util.data_analysis_active_replica_all(
            disk_active_dataset, disk_replica_dataset,
            self.cluster.servers[:self.nodes_in + self.nodes_init],
            self.bucket_util.buckets, path=None)
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
        self.bucket_util.vb_distribution_analysis(
            servers=nodes, buckets=self.bucket_util.buckets, num_replicas=2,
            std=std, total_vbuckets=self.cluster_util.vbuckets)
        self.sleep(10)
        self.tasks = []
        # Bring the cluster back to size for the following steps
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        # self.sleep(600)
        self.task.jython_task_manager.get_task_result(rebalance_task)
        reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(bucket)
        ####################################################################
        self.log.info("Step 12: Failover a node and FullRecovery that node")
        self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
        std = self.std_vbucket_dist or 1.0
        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        self.sleep(10)
        disk_replica_dataset, disk_active_dataset = self.bucket_util.get_and_compare_active_replica_data_set_all(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets, path=None)
        self.rest = RestConnection(self.cluster.master)
        self.nodes = self.cluster_util.get_nodes(self.cluster.master)
        self.chosen = self.cluster_util.pick_nodes(self.cluster.master, howmany=1)
        self.generate_docs()
        tasks_info = self.data_load()
        # Mark Node for failover
        self.success_failed_over = self.rest.fail_over(self.chosen[0].id, graceful=False)
        self.sleep(300)
        # Mark Node for full recovery
        if self.success_failed_over:
            self.rest.set_recovery_type(otpNode=self.chosen[0].id, recoveryType="full")
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
        # Rebalance with no add/remove: re-integrates the recovered node
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init], [], [])
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
        # self.sleep(600)
        self.task.jython_task_manager.get_task_result(rebalance_task)
        reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        self.sleep(10)
        self.data_validation_mode(tasks_info)
        self.bucket_util.compare_failovers_logs(
            prev_failover_stats, self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        self.sleep(10)
        self.bucket_util.data_analysis_active_replica_all(
            disk_active_dataset, disk_replica_dataset,
            self.cluster.servers[:self.nodes_in + self.nodes_init],
            self.bucket_util.buckets, path=None)
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
        self.bucket_util.vb_distribution_analysis(
            servers=nodes, buckets=self.bucket_util.buckets, num_replicas=2,
            std=std, total_vbuckets=self.cluster_util.vbuckets)
        self.sleep(10)
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(bucket)
        ####################################################################
        self.log.info("Step 13: Failover a node and DeltaRecovery that node with loading in parallel")
        self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
        std = self.std_vbucket_dist or 1.0
        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        self.sleep(10)
        disk_replica_dataset, disk_active_dataset = self.bucket_util.get_and_compare_active_replica_data_set_all(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets, path=None)
        self.rest = RestConnection(self.cluster.master)
        self.nodes = self.cluster_util.get_nodes(self.cluster.master)
        self.chosen = self.cluster_util.pick_nodes(self.cluster.master, howmany=1)
        self.generate_docs()
        tasks_info = self.data_load()
        # Mark Node for failover
        self.success_failed_over = self.rest.fail_over(self.chosen[0].id, graceful=False)
        self.sleep(300)
        # Mark the failed-over node for delta recovery this time
        if self.success_failed_over:
            self.rest.set_recovery_type(otpNode=self.chosen[0].id, recoveryType="delta")
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init], [], [])
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
        # self.sleep(600)
        self.task.jython_task_manager.get_task_result(rebalance_task)
        reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        self.sleep(10)
        self.data_validation_mode(tasks_info)
        self.bucket_util.compare_failovers_logs(
            prev_failover_stats, self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        self.sleep(10)
        self.bucket_util.data_analysis_active_replica_all(
            disk_active_dataset, disk_replica_dataset,
            self.cluster.servers[:self.nodes_in + self.nodes_init],
            self.bucket_util.buckets, path=None)
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
        self.bucket_util.vb_distribution_analysis(
            servers=nodes, buckets=self.bucket_util.buckets, num_replicas=2,
            std=std, total_vbuckets=self.cluster_util.vbuckets)
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(bucket)
        ####################################################################
        self.log.info("Step 14: Updating the bucket replica to 1")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.bucket_util.buckets)):
            bucket_helper.change_bucket_props(
                self.bucket_util.buckets[i], replicaNumber=1)
        self.generate_docs()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
        rebalance_task = self.task.async_rebalance(self.cluster.servers, [], [])
        tasks_info = self.data_load()
        if not self.atomicity:
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
        # self.sleep(600, "Wait for Rebalance to start")
        self.task.jython_task_manager.get_task_result(rebalance_task)
        reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        self.data_validation_mode(tasks_info)
        self.tasks = []
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(bucket)
        ####################################################################
        self.log.info("Step 15: Flush the bucket and start the entire process again")
        self.loop += 1
        if self.loop < self.iterations:
            # Flush the bucket
            self.bucket_util.flush_all_buckets(self.cluster.master)
            self.sleep(10)
            # Shrink the cluster back to nodes_init before the next round
            if len(self.cluster.nodes_in_cluster) > self.nodes_init:
                self.nodes_cluster = self.cluster.nodes_in_cluster[:]
                self.nodes_cluster.remove(self.cluster.master)
                servs_out = random.sample(
                    self.nodes_cluster,
                    int(len(self.cluster.nodes_in_cluster) - self.nodes_init))
                rebalance_task = self.task.async_rebalance(
                    self.cluster.servers[:self.nodes_init], [], servs_out)
                # self.sleep(600)
                self.task.jython_task_manager.get_task_result(rebalance_task)
                self.available_servers += servs_out
                self.cluster.nodes_in_cluster = list(
                    set(self.cluster.nodes_in_cluster) - set(servs_out))
                reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
                self.assertTrue(reached, "rebalance failed, stuck or did not complete")
                self.get_bucket_dgm(bucket)
            self._iter_count = 0
        else:
            self.log.info("Volume Test Run Complete")
    self.get_bucket_dgm(bucket)
def test_volume_taf(self):
    """Collection-aware volume test: rebalance, process-failure, failover,
    recovery and (optionally) quorum-failover scenarios with collection
    data loading running in parallel.

    Optionally spawns background threads running N1QL select queries and
    UI stats queries while the scenarios execute (when
    ``self.number_of_indexes > 0``). Steps 1-4 (cluster + bucket setup and
    initial load) are done in setUp; each loop iteration covers Steps 5-19+.
    """
    self.loop = 0
    # self.cluster_utils.set_metadata_purge_interval()
    if self.number_of_indexes > 0:
        # start running select queries thread
        self.query_thread = threading.Thread(target=self.run_select_query)
        self.query_thread_flag = True
        self.query_thread.start()
        # Start running ui stats queries thread
        self.ui_stats_thread = threading.Thread(target=self.run_ui_stats_queries)
        self.ui_stats_thread_flag = True
        self.ui_stats_thread.start()
    self.log.info("Finished steps 1-4 successfully in setup")
    while self.loop < self.iterations:
        # Buckets are flushed at the end of each iteration, so reload
        if self.loop > 0 or self.flush_buckets_before_indexes_creation:
            self.log.info("Reloading items to buckets")
            self.reload_data_into_buckets()
        ####################################################################
        self.log.info("Step 5: Rebalance in with Loading of docs")
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        if self.fts_indexes_to_recreate > 0:
            self.create_and_drop_fts_indexes(count=self.fts_indexes_to_recreate)
        self.bucket_util.print_bucket_stats(self.cluster)
        self.check_logs()
        ####################################################################
        self.log.info("Step 6: Rebalance Out with Loading of docs")
        rebalance_task = self.rebalance(nodes_in=0, nodes_out=1)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        if self.fts_indexes_to_recreate > 0:
            self.create_and_drop_fts_indexes(count=self.fts_indexes_to_recreate)
        self.bucket_util.print_bucket_stats(self.cluster)
        self.check_logs()
        ####################################################################
        self.log.info("Step 7: Rebalance In_Out with Loading of docs")
        rebalance_task = self.rebalance(nodes_in=2, nodes_out=1)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        if self.fts_indexes_to_recreate > 0:
            self.create_and_drop_fts_indexes(count=self.fts_indexes_to_recreate)
        self.bucket_util.print_bucket_stats(self.cluster)
        self.check_logs()
        ####################################################################
        self.log.info("Step 8: Swap with Loading of docs")
        # nodes_in == nodes_out == 1 -> swap rebalance
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=1)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        if self.fts_indexes_to_recreate > 0:
            self.create_and_drop_fts_indexes(count=self.fts_indexes_to_recreate)
        self.bucket_util.print_bucket_stats(self.cluster)
        self.check_logs()
        ####################################################################
        self.log.info("Step 9: Updating the bucket replica to 2 and rebalance-in")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.cluster.buckets)):
            bucket_helper.change_bucket_props(
                self.cluster.buckets[i], replicaNumber=2)
        rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        if self.fts_indexes_to_recreate > 0:
            self.create_and_drop_fts_indexes(count=self.fts_indexes_to_recreate)
        self.bucket_util.print_bucket_stats(self.cluster)
        self.check_logs()
        ####################################################################
        self.log.info("Enabling autoreprovison before inducing failure to prevent data loss "
                      "for if there are ephemeral buckets")
        status = self.rest.update_autoreprovision_settings(True, maxNodes=1)
        if not status:
            self.fail("Failed to enable autoreprovison")
        # Steps 10-11: induce (and revert) process failures with load running
        step_count = 9
        for action in [CouchbaseError.STOP_MEMCACHED, CouchbaseError.STOP_PROMETHEUS]:
            step_count = step_count + 1
            self.log.info("Step {0}: {1}".format(step_count, action))
            # TODO Uncomment this after debugging CBQE-6721
            # self.log.info("Forcing durability level: MAJORITY")
            # self.durability_level = "MAJORITY"
            task = self.data_load_collection()
            self.induce_and_revert_failure(action)
            # Rebalance is required after error is reverted
            rebalance_task = self.task.async_rebalance(
                self.cluster.servers, [], [],
                retry_get_process_num=self.retry_get_process_num)
            self.wait_for_rebalance_to_complete(rebalance_task)
            self.wait_for_async_data_load_to_complete(task)
            self.data_validation_collection()
            if self.fts_indexes_to_recreate > 0:
                self.create_and_drop_fts_indexes(count=self.fts_indexes_to_recreate)
            self.bucket_util.print_bucket_stats(self.cluster)
            self.check_logs()
            self.durability_level = ""
        ####################################################################
        # Steps 12-17: every combination of (Graceful|Hard) failover with
        # (RebalanceOut|FullRecovery|DeltaRecovery)
        step_count = 11
        for failover in ["Graceful", "Hard"]:
            for action in ["RebalanceOut", "FullRecovery", "DeltaRecovery"]:
                step_count = step_count + 1
                self.log.info(
                    "Step {0}: {1} Failover a node and {2} that node with data load in parallel".
                    format(step_count, failover, action))
                self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
                std = self.std_vbucket_dist or 1.0
                kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)
                self.log.info("Collecting pre_failover_stats. KV nodes are {0}".format(kv_nodes))
                # NOTE(review): prev_vbucket_stats seems unused afterwards — verify
                prev_failover_stats = self.bucket_util.get_failovers_logs(
                    kv_nodes, self.cluster.buckets)
                prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
                    kv_nodes, self.cluster.buckets)
                self.sleep(10)
                disk_replica_dataset, disk_active_dataset = self.bucket_util. \
                    get_and_compare_active_replica_data_set_all(
                        kv_nodes, self.cluster.buckets, path=None)
                # Pick node(s) for failover
                failover_nodes = list()
                kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)
                for node in kv_nodes:
                    # pick the first non-master KV node
                    if node.ip != self.cluster.master.ip:
                        failover_nodes.append(node)
                        break
                reset_flag = False
                if (not self.durability_level) and failover == "Hard":
                    # Force a durability level to prevent data loss during hard failover
                    # TODO Uncomment this after debugging CBQE-6721
                    # self.log.info("Forcing durability level: MAJORITY")
                    # self.durability_level = "MAJORITY"
                    reset_flag = True
                task = self.data_load_collection()
                if reset_flag:
                    self.durability_level = ""
                # Failover the node(s)
                if failover == "Graceful":
                    failover_result = self.task.failover(
                        self.cluster.nodes_in_cluster,
                        failover_nodes=failover_nodes,
                        graceful=True, wait_for_pending=120,
                        all_at_once=True)
                else:
                    failover_result = self.task.failover(
                        self.cluster.nodes_in_cluster,
                        failover_nodes=failover_nodes,
                        graceful=False, wait_for_pending=120,
                        all_at_once=True)
                self.assertTrue(failover_result, "Failover Failed")
                # Perform the action
                if action == "RebalanceOut":
                    rebalance_task = self.task.async_rebalance(
                        self.cluster.nodes_in_cluster, [], failover_nodes,
                        retry_get_process_num=self.retry_get_process_num)
                    self.wait_for_rebalance_to_complete(rebalance_task)
                    self.cluster.nodes_in_cluster = list(
                        set(self.cluster.nodes_in_cluster) - set(failover_nodes))
                    for node in failover_nodes:
                        self.available_servers.append(node)
                    self.sleep(10)
                else:
                    if action == "FullRecovery":
                        for failover_node in failover_nodes:
                            self.rest.set_recovery_type(
                                otpNode='ns_1@' + failover_node.ip,
                                recoveryType="full")
                    elif action == "DeltaRecovery":
                        for failover_node in failover_nodes:
                            self.rest.set_recovery_type(
                                otpNode='ns_1@' + failover_node.ip,
                                recoveryType="delta")
                    # Rebalance with no add/remove re-integrates the node
                    rebalance_task = self.task.async_rebalance(
                        self.cluster.nodes_in_cluster, [], [],
                        retry_get_process_num=self.retry_get_process_num)
                    self.wait_for_rebalance_to_complete(rebalance_task)
                    self.sleep(10)
                self.wait_for_async_data_load_to_complete(task)
                self.data_validation_collection()
                kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)
                self.log.info("Collecting post_failover_stats. KV nodes are {0}".format(kv_nodes))
                self.bucket_util.compare_failovers_logs(
                    prev_failover_stats, kv_nodes, self.cluster.buckets)
                self.sleep(10)
                self.bucket_util.data_analysis_active_replica_all(
                    disk_active_dataset, disk_replica_dataset, kv_nodes,
                    self.cluster.buckets, path=None)
                self.bucket_util.vb_distribution_analysis(
                    self.cluster, servers=kv_nodes, buckets=self.cluster.buckets,
                    num_replicas=2, std=std, total_vbuckets=self.cluster.vbuckets)
                self.sleep(10)
                # Bring back the rebalance out node back to cluster for further steps
                if action == "RebalanceOut":
                    self.sleep(120)
                    self.log.info("Rebalancing-in a node")
                    rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
                    # self.sleep(600)
                    self.wait_for_rebalance_to_complete(rebalance_task)
                if self.fts_indexes_to_recreate > 0:
                    self.create_and_drop_fts_indexes(count=self.fts_indexes_to_recreate)
                self.bucket_util.print_bucket_stats(self.cluster)
                self.check_logs()
        ####################################################################
        self.log.info("Step 18: Updating the bucket replica to 1")
        bucket_helper = BucketHelper(self.cluster.master)
        for i in range(len(self.cluster.buckets)):
            bucket_helper.change_bucket_props(
                self.cluster.buckets[i], replicaNumber=1)
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers, [], [],
            retry_get_process_num=self.retry_get_process_num)
        task = self.data_load_collection()
        self.wait_for_rebalance_to_complete(rebalance_task)
        self.wait_for_async_data_load_to_complete(task)
        self.data_validation_collection()
        if self.fts_indexes_to_recreate > 0:
            self.create_and_drop_fts_indexes(count=self.fts_indexes_to_recreate)
        self.bucket_util.print_bucket_stats(self.cluster)
        self.check_logs()
        ####################################################################
        self.cluster.nodes_in_cluster = self.cluster.servers
        step_count = 19
        removed_nodes = list()  # total list of all nodes that will be removed
        if self.perform_quorum_failover:
            self.log.info("Step {0}: Quorum failover nodes".format(step_count))
            # keep performing QF until one node is left
            while len(self.cluster.nodes_in_cluster) != 1:
                # Fail over a majority of the remaining nodes (unsafe QF)
                majority_number = int(math.ceil(len(self.cluster.nodes_in_cluster) / 2.0))
                self.nodes_cluster = self.cluster.nodes_in_cluster[:]
                self.nodes_cluster.remove(self.cluster.master)
                remove_nodes = random.sample(self.nodes_cluster, majority_number)
                self.custom_induce_failure(remove_nodes, failover_action="stop_server")
                self.sleep(10, "Wait after inducing failure")
                self.log.info("Failing over nodes explicitly {0}".format(remove_nodes))
                result = self.task.failover(self.cluster.nodes_in_cluster,
                                            failover_nodes=remove_nodes,
                                            graceful=False, wait_for_pending=120,
                                            allow_unsafe=True, all_at_once=True)
                self.assertTrue(result, "Failover Failed")
                self.custom_remove_failure(nodes=remove_nodes, revert_failure="stop_server")
                self.sleep(15)
                self.wipe_config_on_removed_nodes(remove_nodes)
                for node in remove_nodes:
                    removed_nodes.append(node)
                self.cluster.nodes_in_cluster = [
                    node for node in self.cluster.nodes_in_cluster
                    if node not in remove_nodes]
            # add back all the nodes with kv service
            rebalance_task = self.task.async_rebalance(
                self.cluster.nodes_in_cluster, removed_nodes, [],
                retry_get_process_num=self.retry_get_process_num)
            self.wait_for_rebalance_to_complete(rebalance_task)
            self.cluster.nodes_in_cluster = self.cluster.servers[:]
            step_count = step_count + 1
            self.bucket_util.print_bucket_stats(self.cluster)
            self.check_logs()
        ####################################################################
        self.log.info("Step {0}: Flush bucket(s) and start the entire process again".format(step_count))
        self.loop += 1
        if self.loop < self.iterations:
            # Flush bucket(s)
            self.bucket_util.flush_all_buckets(
                self.cluster, skip_resetting_num_items=True)
            self.sleep(10)
            # Shrink the cluster back to nodes_init before the next round
            if len(self.cluster.nodes_in_cluster) > self.nodes_init:
                self.nodes_cluster = self.cluster.nodes_in_cluster[:]
                self.nodes_cluster.remove(self.cluster.master)
                servs_out = random.sample(
                    self.nodes_cluster,
                    int(len(self.cluster.nodes_in_cluster) - self.nodes_init))
                rebalance_task = self.task.async_rebalance(
                    self.cluster.servers[:self.nodes_init], [], servs_out,
                    retry_get_process_num=self.retry_get_process_num)
                self.wait_for_rebalance_to_complete(rebalance_task)
                self.available_servers += servs_out
                self.cluster.nodes_in_cluster = list(
                    set(self.cluster.nodes_in_cluster) - set(servs_out))
        else:
            # Stop the background query/stats threads before finishing
            if self.number_of_indexes > 0:
                self.close_all_threads()
            self.log.info("Volume Test Run Complete")