def create(self, action=None, bucket_name="default"):
    """
    Induce the requested error scenario on the node behind self.shell_conn.

    :param action: CouchbaseError action constant selecting the scenario
    :param bucket_name: Bucket to act on (read only for STOP_PERSISTENCE)
    """
    self.log.info("Simulating '{0}' in {1}".format(action,
                                                   self.shell_conn.ip))

    # (action constant, process name, interrupt type), matched in order
    process_interrupts = (
        (CouchbaseError.STOP_MEMCACHED, "memcached", "stop"),
        (CouchbaseError.KILL_MEMCACHED, "memcached", "kill"),
        (CouchbaseError.STOP_BEAMSMP, "beam.smp", "stop"),
        (CouchbaseError.STOP_PROMETHEUS, "prometheus", "stop"),
        (CouchbaseError.KILL_BEAMSMP, "beam.smp", "kill"),
        (CouchbaseError.KILL_PROMETHEUS, "prometheus", "kill"),
    )
    for known_action, process_name, interrupt_type in process_interrupts:
        if action == known_action:
            _, error = self.__interrupt_process(process_name,
                                                interrupt_type)
            self.__handle_shell_error(error)
            return

    if action == CouchbaseError.STOP_SERVER:
        self.shell_conn.stop_server()
    elif action == CouchbaseError.STOP_PERSISTENCE:
        Cbepctl(self.shell_conn).persistence(bucket_name, "stop")
    else:
        self.log.error("Unsupported action: '{0}'".format(action))
def test_rollback_and_persistence_race_condition(self):
    """
    Load docs, stop persistence on the first 'nodes_init' servers, load
    again, kill memcached on the first server and finally verify that the
    first two servers report the same 'curr_items_tot' stat.
    """
    cluster = self.cluster

    def load_all_buckets(doc_gen):
        # Run a synchronous "create" load of doc_gen against every bucket
        for target_bucket in self.bucket_util.buckets:
            task = self.task.async_load_gen_docs(
                self.cluster, target_bucket, doc_gen, "create", 0,
                batch_size=10, process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                timeout_secs=self.sdk_timeout,
                retries=self.sdk_retries)
            self.task.jython_task_manager.get_task_result(task)

    load_all_buckets(doc_generator(self.key, 0, self.num_items))

    # Stop persistence
    for server in cluster.servers[:self.nodes_init]:
        node_shell_conn = RemoteMachineShellConnection(server)
        try:
            cbepctl_obj = Cbepctl(node_shell_conn)
            for target_bucket in self.bucket_util.buckets:
                cbepctl_obj.persistence(target_bucket.name, "stop")
        finally:
            # Disconnect the shell_connection
            node_shell_conn.disconnect()

    self.sleep(10, "Wait after stop_persistence")

    # NOTE(review): the original comment called this a "non-intersecting"
    # load, but the key range is the same as the first load (only the
    # doc_size differs) - confirm the intent
    load_all_buckets(doc_generator(self.key, 0, self.num_items,
                                   doc_size=64))

    shell = RemoteMachineShellConnection(cluster.servers[0])
    try:
        shell.kill_memcached()
    finally:
        # This connection was previously left open
        shell.disconnect()

    self.sleep(10, "Wait after kill memcached")

    # The stats are read for the last bucket; the original relied on the
    # loop variable leaking out of the load loop for this
    bucket = self.bucket_util.buckets[-1]

    node1_shell_conn = RemoteMachineShellConnection(cluster.servers[0])
    node2_shell_conn = RemoteMachineShellConnection(cluster.servers[1])
    try:
        node1_items = Cbstats(node1_shell_conn).all_stats(
            bucket, "curr_items_tot")
        node2_items = Cbstats(node2_shell_conn).all_stats(
            bucket, "curr_items_tot")
    finally:
        # Disconnect the opened connections
        node1_shell_conn.disconnect()
        node2_shell_conn.disconnect()

    self.assertEqual(node1_items, node2_items,
                     'Node items not equal. Node 1:{0}, node 2:{1}'
                     .format(node1_items, node2_items))
def test_rollback_to_zero(self):
    """
    Stop persistence on the master node, load memory-only docs for
    'num_rollbacks' rounds while validating stats and seq_nos per round,
    then kill memcached on the master and verify the item count equals
    the count present before the test started loading.
    """
    expected_items = self.num_items
    rollback_items = self.input.param("rollback_items", 10000)

    if self.nodes_init < 2 or self.num_replicas < 1:
        self.fail("Not enough nodes/replicas to test rollback")

    # Fetch vbucket stats for validation
    self.get_vb_details_cbstats_for_all_nodes("pre_rollback")

    next_start = self.num_items
    shell = self.node_shells[self.cluster.master]["shell"]
    cbstats = self.node_shells[self.cluster.master]["cbstat"]
    self.target_vbucket = cbstats.vbucket_list(self.bucket.name)

    # Stopping persistence on NodeA
    Cbepctl(shell).persistence(self.bucket.name, "stop")

    for rollback_round in xrange(1, self.num_rollbacks + 1):
        self.gen_create = doc_generator(
            self.key, next_start, rollback_items,
            doc_size=self.doc_size,
            doc_type=self.doc_type,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets,
            randomize_doc_size=self.randomize_doc_size,
            randomize_value=self.randomize_value)
        self.load_docs()
        if self.rollback_with_multiple_mutation:
            self.doc_ops = "update"
            self.load_docs()
        next_start = self.gen_create.key_counter

        # First node is expected to hold all non-persisted items of all
        # rounds so far, every other node is expected to report zero
        stat_map = {
            self.cluster.nodes_in_cluster[0]:
                rollback_items * rollback_round
        }
        stat_map.update(
            (other_node, 0)
            for other_node in self.cluster.nodes_in_cluster[1:])

        for bucket in self.bucket_util.buckets:
            self.bucket_util._wait_for_stat(bucket, stat_map)
        self.sleep(60)
        self.get_vb_details_cbstats_for_all_nodes("post_rollback")
        self.validate_seq_no_post_rollback("pre_rollback",
                                           "post_rollback")

    shell.kill_memcached()
    warmup_done = self.bucket_util._wait_warmup_completed(
        [self.cluster_util.cluster.master],
        self.bucket,
        wait_time=self.wait_timeout * 10)
    self.assertTrue(warmup_done)
    self.bucket_util.verify_stats_all_buckets(expected_items)
    shell.disconnect()
    self.validate_test_failure()
def revert(self, action=None, bucket_name="default"):
    """
    Undo an error scenario on the node behind self.shell_conn.

    Stopped processes (memcached, beam.smp, prometheus) are resumed,
    KILL_BEAMSMP / STOP_SERVER start the server again and STOP_PERSISTENCE
    restarts persistence for the given bucket. Any other action is only
    logged as unsupported.

    :param action: CouchbaseError action constant that has to be reverted
    :param bucket_name: Bucket to act on (read only for STOP_PERSISTENCE)
    """
    self.log.info("Reverting '{0}' in {1}".format(action,
                                                  self.shell_conn.ip))
    if action == CouchbaseError.STOP_MEMCACHED:
        _, error = self.__interrupt_process("memcached", "resume")
        self.__handle_shell_error(error)
    elif action == CouchbaseError.STOP_BEAMSMP:
        _, error = self.__interrupt_process("beam.smp", "resume")
        self.__handle_shell_error(error)
    elif action == CouchbaseError.STOP_PROMETHEUS:
        # create() can stop prometheus, so it must be resumable as well;
        # previously this action fell through to the "unsupported" branch
        _, error = self.__interrupt_process("prometheus", "resume")
        self.__handle_shell_error(error)
    elif action == CouchbaseError.KILL_BEAMSMP \
            or action == CouchbaseError.STOP_SERVER:
        self.shell_conn.start_server()
    elif action == CouchbaseError.STOP_PERSISTENCE:
        cbepctl_obj = Cbepctl(self.shell_conn)
        cbepctl_obj.persistence(bucket_name, "start")
    else:
        self.log.error(
            "Unsupported action to revert: '{0}'".format(action))
def test_flush_bucket_during_data_persistence(self):
    """
    For 'test_itr' iterations: stop persistence on all shells, load docs
    into every scope/collection, restart persistence and flush the
    buckets shortly afterwards.
    """
    autofailover_disabled = self.rest.update_autofailover_settings(False,
                                                                   600)
    self.assertTrue(autofailover_disabled, "AutoFailover disabling failed")

    def set_persistence(state):
        # Apply the cbepctl persistence state on every shell and bucket
        for shell in self.shell_conn:
            for bucket in self.cluster.buckets:
                Cbepctl(shell).persistence(bucket.name, state)

    start = copy.deepcopy(self.init_items_per_collection)
    iteration = 0
    while iteration < self.test_itr:
        self.log.info("Iteration {}".format(iteration + 1))
        self.compute_docs(start, start)
        set_persistence("stop")
        self.generate_docs()

        # Kick off loading on all collections, then wait for all tasks
        tasks_info = dict()
        for scope in self.scopes:
            for collection in self.collections:
                load_info = self.loadgen_docs(
                    self.retry_exceptions,
                    self.ignore_exceptions,
                    scope=scope,
                    collection=collection,
                    suppress_error_table=True,
                    skip_read_on_error=True,
                    _sync=False,
                    doc_ops=self.doc_ops,
                    track_failures=False,
                    sdk_retry_strategy=SDKConstants.RetryStrategy.FAIL_FAST
                )
                tasks_info.update(load_info.items())
        for task in tasks_info:
            self.task_manager.get_task_result(task)

        set_persistence("start")
        self.sleep(10, "sleep before flush thread")
        for bucket in self.buckets:
            self.bucket_util.flush_bucket(self.cluster, bucket)
        iteration += 1
def test_rollback_to_zero(self):
    """
    Collection-aware rollback test: on a randomly chosen KV node stop
    persistence, load docs, kill memcached and validate vBucket seq_nos
    after warmup. Repeated 'num_rollbacks' times, after which every
    collection is validated against an expected count of zero.
    """
    if self.nodes_init < 2 or self.num_replicas < 1:
        self.fail("Not enough nodes/replicas to test rollback")

    def set_num_items_for_all_collections(count):
        # Update the expected num_items of every collection in all buckets
        for bucket in self.bucket_util.buckets:
            for scope in bucket.scopes.values():
                for collection in scope.collections.values():
                    collection.num_items = count

    keys_to_verify = ["high_completed_seqno", "purge_seqno"]
    update_load_task = None

    # Override num_items to load data into each collection
    self.num_items = 10000

    # Set values to num_items to support loading through
    # collection loading task
    set_num_items_for_all_collections(self.num_items)

    # Fetch vbucket stats for validation
    self.get_vb_details_cbstats_for_all_nodes("pre_rollback")

    target_node = choice(self.cluster_util.get_kv_nodes())
    shell = self.node_shells[target_node]["shell"]
    cbstats = self.node_shells[target_node]["cbstat"]
    self.target_vbuckets = cbstats.vbucket_list(self.bucket.name)

    for _ in xrange(1, self.num_rollbacks + 1):
        self.total_rollback_items = 0
        self.log.info("Stopping persistence on %s" % target_node.ip)
        Cbepctl(shell).persistence(self.bucket.name, "stop")

        initial_load_task = self.load_docs(self.doc_ops)
        if self.rollback_with_multiple_mutation:
            update_load_task = self.load_docs("update")

        stat_map = dict()
        for node in self.cluster.nodes_in_cluster:
            if node.ip == target_node.ip:
                expected_val = self.total_rollback_items
                if self.sync_write_enabled:
                    # Includes prepare+commit mutation
                    expected_val *= 2
            else:
                expected_val = 0
            stat_map[node] = expected_val

        for bucket in self.bucket_util.buckets:
            self.bucket_util._wait_for_stat(bucket, stat_map,
                                            timeout=self.wait_timeout)

        if update_load_task:
            self.__rewind_doc_index(update_load_task)
        self.__rewind_doc_index(initial_load_task)

        self.log.info("Killing memcached to trigger rollback")
        shell.kill_memcached()
        warmup_done = self.bucket_util._wait_warmup_completed(
            [target_node],
            self.bucket,
            wait_time=300)
        self.assertTrue(warmup_done)

        self.sleep(10, "Wait after bucket warmup for cbstats to work")
        self.get_vb_details_cbstats_for_all_nodes("post_rollback")
        self.validate_seq_no_post_rollback("pre_rollback", "post_rollback",
                                           keys_to_verify)

    # Reset expected values to '0' for validation
    set_num_items_for_all_collections(0)
    self.bucket_util.validate_docs_per_collections_all_buckets()
    self.validate_test_failure()
def test_rollback_during_compaction(self):
    """
    Trigger rollback from the master node while bucket compaction is
    started before and/or run after the memcached kill.

    Repeated 'num_rollbacks' times:
    -- Stop persistence on the master node
    -- Load 'rollback_items' docs per doc_op against the vBuckets returned
       by cbstats.vbucket_list for the first bucket
    -- If compact_before is set, start async compaction of every bucket
    -- Kill memcached on the master node
    -- If compact_after is set, run compaction once
    -- Wait for warmup, restart persistence, wait for queues to drain
    -- Load 100000 new docs without vBucket targeting
    """
    self.assertTrue(self.rest.update_autofailover_settings(False, 600),
                    "AutoFailover disabling failed")
    items = copy.deepcopy(self.num_items)
    mem_only_items = self.input.param("rollback_items", 10000)
    ops_len = len(self.doc_ops.split(":"))

    if self.nodes_init < 2 or self.num_replicas < 1:
        # Implicit literal concatenation instead of a backslash inside
        # the string, which leaked stray characters into the message
        self.fail("Not enough nodes/replicas in the cluster/bucket "
                  "to test rollback")

    self.num_rollbacks = self.input.param("num_rollbacks", 1)

    for i in range(1, self.num_rollbacks+1):
        self.log.info("Roll back Iteration == {}".format(i))
        start = items
        shell = RemoteMachineShellConnection(self.cluster.master)
        try:
            ###########################################################
            # STEP - 1, Stop persistence on node - x
            ###########################################################
            cbstats = Cbstats(self.cluster.master)
            self.target_vbucket = cbstats.vbucket_list(
                self.cluster.buckets[0].name)
            mem_item_count = 0
            self.log.debug("Stopping persistence on Node-{}"
                           .format(self.cluster.master))
            Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                       "stop")

            ###########################################################
            # STEP - 3, Load documents on node x
            ###########################################################
            self.compute_docs(start, mem_only_items)
            self.gen_create = None
            self.gen_update = None
            self.gen_delete = None
            self.gen_expiry = None
            mem_item_count += mem_only_items * ops_len
            self.generate_docs(doc_ops=self.doc_ops,
                               target_vbucket=self.target_vbucket)
            self.loadgen_docs(_sync=True,
                              retry_exceptions=self.retry_exceptions,
                              ignore_exceptions=self.ignore_exceptions)
            if self.gen_create is not None:
                self.create_start = self.gen_create.key_counter
            if self.gen_update is not None:
                self.update_start = self.gen_update.key_counter
            if self.gen_delete is not None:
                self.delete_start = self.gen_delete.key_counter
            if self.gen_expiry is not None:
                self.expiry_start = self.gen_expiry.key_counter

            first_node = self.cluster.nodes_in_cluster[0]
            ep_queue_size_map = {first_node: mem_item_count}
            if self.durability_level:
                self.log.info("updating the num_items on disk check to double due to durability")
                ep_queue_size_map = {first_node: mem_item_count * 2}
            vb_replica_queue_size_map = {first_node: 0}
            for node in self.cluster.nodes_in_cluster[1:]:
                ep_queue_size_map[node] = 0
                vb_replica_queue_size_map[node] = 0
            # NOTE: waiting on these maps (and logging failover stats)
            # before the kill was commented out in the original and
            # stays disabled; the maps are only used after STEP - 5

            ###########################################################
            # STEP - 4, Kill Memcached on master node and trigger
            # rollback on other nodes
            ###########################################################
            compaction_tasks = []
            if self.compact_before:
                compaction_tasks = [
                    self.task.async_compact_bucket(self.cluster.master,
                                                   bucket)
                    for bucket in self.cluster.buckets]
            shell.kill_memcached()

            if self.compact_after:
                self.bucket_util._run_compaction(self.cluster,
                                                 number_of_times=1)
            for task in compaction_tasks:
                self.task_manager.get_task_result(task)

            self.assertTrue(self.bucket_util._wait_warmup_completed(
                [self.cluster.master],
                self.cluster.buckets[0],
                wait_time=self.wait_timeout * 10))

            ###########################################################
            # STEP - 5, Restarting persistence on Node -- x
            ###########################################################
            self.log.debug("RollBack Iteration=={}, Re-Starting persistence on Node -- {}".format(i, self.cluster.master))
            Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                       "start")
            self.sleep(5, "Sleep after re-starting persistence, Iteration{}".format(i))

            # After rollback every queue is expected to drain to zero
            for nod in self.cluster.nodes_in_cluster:
                ep_queue_size_map[nod] = 0
                vb_replica_queue_size_map[nod] = 0
            for bucket in self.cluster.buckets:
                self.bucket_util._wait_for_stat(bucket,
                                                ep_queue_size_map,
                                                timeout=600)
                self.bucket_util._wait_for_stat(
                    bucket, vb_replica_queue_size_map,
                    cbstat_cmd="all",
                    stat_name="vb_replica_queue_size",
                    timeout=600)
        finally:
            # Do not leak the SSH connection when an iteration fails
            shell.disconnect()

        ###############################################################
        # STEP - 6, Load Docs on all the nodes
        ###############################################################
        self.create_start = items
        self.create_end = items + 100000
        self.generate_docs(doc_ops="create", target_vbucket=None)
        self.loadgen_docs(self.retry_exceptions,
                          self.ignore_exceptions,
                          _sync=True,
                          doc_ops="create")
        self.bucket_util._wait_for_stats_all_buckets(
            self.cluster, self.cluster.buckets, timeout=1200)
        items = items + 100000
        self.log.debug("Iteration == {}, Total num_items {}".format(i, items))
def replicate_correct_data_after_rollback(self):
    """
    Verify that values active on the first server are unchanged after a
    non-persisted second update is lost through a memcached kill and the
    second server is failed over.

    @attention: This test case has some issue with docker runs. It
                passes without any issue on VMs.
    """
    bucket = self.bucket_util.buckets[0]
    cluster = self.cluster

    gen_load = doc_generator(self.key, 0, self.num_items)
    # NOTE: 'bucket' is deliberately rebound by the loops below, so the
    # code after them works on the last bucket (as the original did)
    for bucket in self.bucket_util.buckets:
        task = self.task.async_load_gen_docs(
            self.cluster, bucket, gen_load, "create", 0,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            timeout_secs=self.sdk_timeout, retries=self.sdk_retries)
        self.task.jython_task_manager.get_task_result(task)

    # store the KVs which were modified and active on node 1
    modified_kvs_active_on_node1 = dict()
    vbucket_client = VBucketAwareMemcached(
        RestConnection(cluster.master), bucket.name)
    client = MemcachedClientHelper.direct_client(cluster.servers[0],
                                                 bucket.name)
    # '//' keeps this an integer on every Python version
    for i in range(self.num_items // 100):
        keyname = 'keyname-' + str(i)
        vbId = self.bucket_util.get_vbucket_num_for_key(keyname,
                                                        self.vbuckets)
        if vbucket_client.vBucketMap[vbId].split(':')[0] \
                == cluster.servers[0].ip:
            rc = client.get(keyname)
            modified_kvs_active_on_node1[keyname] = rc[2]

    # Stop persistence
    for server in cluster.servers[:self.nodes_init]:
        node_shell_conn = RemoteMachineShellConnection(server)
        try:
            cbepctl_obj = Cbepctl(node_shell_conn)
            for bucket in self.bucket_util.buckets:
                cbepctl_obj.persistence(bucket.name, "stop")
        finally:
            # Disconnect the shell_connection
            node_shell_conn.disconnect()

    # modify less than 1/2 of the keys
    gen_load = doc_generator(self.key, 0, self.num_items // 100)
    self.cluster.load_gen_docs(
        cluster.servers[0], bucket.name, gen_load,
        bucket.kvs[1], "create", exp=0, flag=0, batch_size=10,
        compression=self.sdk_compression)

    # kill memcached, when it comes back because persistence is disabled
    # it will have lost the second set of mutations
    shell = RemoteMachineShellConnection(cluster.servers[0])
    try:
        shell.kill_memcached()
    finally:
        # This connection was previously left open
        shell.disconnect()
    self.sleep(10, "Sleep after kill memcached")

    # Start persistence on the second node
    node_shell_conn = RemoteMachineShellConnection(cluster.servers[1])
    try:
        cbepctl_obj = Cbepctl(node_shell_conn)
        for bucket in self.bucket_util.buckets:
            cbepctl_obj.persistence(bucket.name, "start")
    finally:
        # Disconnect the shell_connection
        node_shell_conn.disconnect()

    self.sleep(10, "Sleep after start persistence")

    # failover to the second node
    self.cluster.failover(cluster.servers, cluster.servers[1:2],
                          graceful=True)
    self.sleep(30, "Sleep after node failover triggered")

    # Values should be what they were prior to the second update
    client = MemcachedClientHelper.direct_client(
        cluster.servers[0], bucket.name)
    for k, v in modified_kvs_active_on_node1.iteritems():
        rc = client.get(k)
        self.assertTrue(v == rc[2], 'Expected {0}, actual {1}'
                                    .format(v, rc[2]))

    # need to rebalance the node back into the cluster
    rest_obj = RestConnection(cluster.servers[0])
    for node in rest_obj.node_statuses():
        if node.ip == cluster.servers[1].ip:
            break
    else:
        # Previously the last listed node was silently recovered instead
        self.fail("Node {0} not found in node statuses"
                  .format(cluster.servers[1].ip))

    node_id_for_recovery = node.id
    status = rest_obj.add_back_node(node_id_for_recovery)
    if status:
        rest_obj.set_recovery_type(node_id_for_recovery,
                                   recoveryType='delta')
    self.cluster.rebalance(cluster.servers[:self.nodes_init], [], [])
def test_flush_bucket_during_rollback(self):
    """
    Test focus: Stopping persistence one by one on all nodes,
                and trigger roll back on other nodes,
                During rollback flush the data
                Above step will be done num_rollback
                (variable defined in test) times

    STEPS:
     -- Ensure creation of at least a single state file
     -- Below steps will be repeated on all nodes, with stopping
        persistence on one at a time
     -- Stop persistence on node x
     -- Start load on node x for a given duration
        (self.duration * 60 seconds)
     -- Above step ensures creation of new state files
        (# equal to self.duration)
     -- Kill MemCached on Node x
     -- Trigger roll back on other/replica nodes
     -- ReStart persistence on Node -x
     -- Repeat all the above steps for num_rollback times
    """
    self.assertTrue(self.rest.update_autofailover_settings(False, 600),
                    "AutoFailover disabling failed")
    items = copy.deepcopy(self.init_items_per_collection)
    mem_only_items = self.input.param("rollback_items", 10000)
    ops_len = len(self.doc_ops.split(":"))

    if self.nodes_init < 2 or self.num_replicas < 1:
        # Implicit literal concatenation instead of a backslash inside
        # the string, which leaked stray characters into the message
        self.fail("Not enough nodes/replicas in the cluster/bucket "
                  "to test rollback")

    self.duration = self.input.param("duration", 2)
    self.num_rollbacks = self.input.param("num_rollbacks", 3)

    #######################################################################
    # STEP - 1, Ensures creation of at least one snapshot
    # To ensure at least one snapshot should get created before rollback
    # starts, we need to sleep for 60 seconds as per magma design which
    # create state file every 60s
    #######################################################################
    self.sleep(60, "Ensures creation of at least one snapshot")

    #######################################################################
    # STEP - 2, Stop persistence on node - x
    #######################################################################
    for i in range(1, self.num_rollbacks+1):
        self.log.info("Roll back Iteration == {}".format(i))
        start = items
        for x, node in enumerate(self.cluster.nodes_in_cluster):
            shell = RemoteMachineShellConnection(node)
            try:
                cbstats = Cbstats(shell)
                self.target_vbucket = cbstats.vbucket_list(
                    self.cluster.buckets[0].name)
                mem_item_count = 0
                self.log.debug("Iteration == {}, Stopping persistence on Node-{}, ip ={}"
                               .format(i, x+1, node))
                Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                           "stop")

                #######################################################
                # STEP - 3
                # -- Load documents on node x for
                #    self.duration * 60 seconds
                # -- This step ensures new state files
                #    (number equal to self.duration)
                #######################################################
                self.compute_docs(start, mem_only_items)
                self.gen_create = None
                self.gen_update = None
                self.gen_delete = None
                self.gen_expiry = None
                time_end = time.time() + 60 * self.duration
                itr = 0
                while time.time() < time_end:
                    itr += 1
                    time_start = time.time()
                    mem_item_count += mem_only_items * ops_len
                    self.generate_docs(doc_ops=self.doc_ops,
                                       target_vbucket=self.target_vbucket)
                    self.loadgen_docs(
                        _sync=True,
                        retry_exceptions=self.retry_exceptions)
                    if self.gen_create is not None:
                        self.create_start = self.gen_create.key_counter
                    if self.gen_update is not None:
                        self.update_start = self.gen_update.key_counter
                    if self.gen_delete is not None:
                        self.delete_start = self.gen_delete.key_counter
                    if self.gen_expiry is not None:
                        self.expiry_start = self.gen_expiry.key_counter

                    # Pad each round to 60s; computed once so the sleep
                    # duration can never turn negative
                    remaining = time_start + 60 - time.time()
                    if remaining > 0:
                        self.log.info("Rollback Iteration== {}, itr== {}, Active-Node=={}, Node=={}".format(i, itr, x+1, node))
                        self.sleep(remaining,
                                   "Sleep to ensure creation of state files for roll back")
                    self.log.info("state files == {}".format(
                        self.get_state_files(self.buckets[0])))

                ep_queue_size_map = {node: mem_item_count}
                if self.durability_level:
                    self.log.info("updating the num_items on disk check to double due to durability")
                    ep_queue_size_map = {node: mem_item_count * 2}
                vb_replica_queue_size_map = {node: 0}
                for nod in self.cluster.nodes_in_cluster:
                    if nod != node:
                        ep_queue_size_map[nod] = 0
                        vb_replica_queue_size_map[nod] = 0

                for bucket in self.cluster.buckets:
                    self.bucket_util._wait_for_stat(bucket,
                                                    ep_queue_size_map,
                                                    timeout=1200)
                    self.bucket_util._wait_for_stat(
                        bucket, vb_replica_queue_size_map,
                        cbstat_cmd="all",
                        stat_name="vb_replica_queue_size",
                        timeout=1200)

                # replica vBuckets
                for bucket in self.cluster.buckets:
                    self.log.debug(cbstats.failover_stats(bucket.name))

                #######################################################
                # STEP - 4
                # -- Kill Memcached on Node - x and trigger rollback
                #    on other nodes
                # -- After 20 seconds , flush bucket
                #######################################################
                shell.kill_memcached()
                self.sleep(20, "sleep after killing memcached")
                self.bucket_util.flush_bucket(self.cluster,
                                              self.cluster.buckets[0])

                #######################################################
                # STEP - 5
                # -- Restarting persistence on Node -- x
                #######################################################
                # Wait for the node whose memcached was killed; the
                # original always waited on cluster.master instead
                self.assertTrue(self.bucket_util._wait_warmup_completed(
                    [node],
                    self.cluster.buckets[0],
                    wait_time=self.wait_timeout * 10))
                self.log.debug("Iteration=={}, Re-Starting persistence on Node -- {}".format(i, node))
                Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                           "start")
                self.sleep(5, "Sleep after re-starting persistence, Iteration{}".format(i))
            finally:
                # Do not leak the SSH connection when a step fails
                shell.disconnect()

        ###################################################################
        # STEP - 6
        # -- Load Docs on all the nodes
        # -- Loading of doc for 60 seconds
        # -- Ensures creation of new state file
        ###################################################################
        self.create_start = 0
        self.create_end = self.init_items_per_collection
        self.generate_docs(doc_ops="create", target_vbucket=None)
        self.loadgen_docs(self.retry_exceptions,
                          self.ignore_exceptions,
                          _sync=True,
                          doc_ops="create")
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets,
                                                     timeout=1200)
def test_rollback_n_times(self):
    """
    Perform 'num_rollbacks' rollbacks from the master node: stop
    persistence, load memory-only docs, kill memcached and validate
    item count, doc data and vBucket seq_nos after every warmup.
    """
    expected_items = self.num_items
    rollback_items = self.input.param("rollback_items", 100)

    if self.nodes_init < 2 or self.num_replicas < 1:
        self.fail("Not enough nodes/replicas to test rollback")

    # Fetch vbucket stats for validation
    self.get_vb_details_cbstats_for_all_nodes("pre_rollback")

    shell = self.node_shells[self.cluster.master]["shell"]
    cbstats = self.node_shells[self.cluster.master]["cbstat"]
    self.target_vbucket = cbstats.vbucket_list(self.bucket.name)
    next_start = self.num_items
    self.gen_validate = self.gen_create

    def log_failover_stats():
        # Dump failover stats of every bucket at debug level
        for bucket in self.bucket_util.buckets:
            self.log.debug(cbstats.failover_stats(bucket.name))

    for _ in xrange(1, self.num_rollbacks + 1):
        # Stopping persistence on NodeA
        Cbepctl(shell).persistence(self.bucket.name, "stop")

        self.gen_create = doc_generator(
            self.key, next_start, rollback_items,
            doc_size=self.doc_size,
            doc_type=self.doc_type,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets,
            randomize_doc_size=self.randomize_doc_size,
            randomize_value=self.randomize_value)
        self.load_docs()
        if self.rollback_with_multiple_mutation:
            self.doc_ops = "update"
            self.load_docs()
        next_start = self.gen_create.key_counter

        # First node holds the non-persisted items, others report zero
        ep_queue_size_map = {
            self.cluster.nodes_in_cluster[0]: rollback_items
        }
        vb_replica_queue_size_map = {self.cluster.nodes_in_cluster[0]: 0}
        for other_node in self.cluster.nodes_in_cluster[1:]:
            ep_queue_size_map[other_node] = 0
            vb_replica_queue_size_map[other_node] = 0

        for bucket in self.bucket_util.buckets:
            self.bucket_util._wait_for_stat(bucket, ep_queue_size_map)
            self.bucket_util._wait_for_stat(
                bucket,
                vb_replica_queue_size_map,
                stat_name="vb_replica_queue_size")

        # Kill memcached on NodeA to trigger rollback on other Nodes
        # replica vBuckets
        log_failover_stats()
        shell.kill_memcached()
        warmup_done = self.bucket_util._wait_warmup_completed(
            [self.cluster_util.cluster.master],
            self.bucket,
            wait_time=self.wait_timeout * 10)
        self.assertTrue(warmup_done)
        self.sleep(10, "Wait after warmup complete. Not required !!")
        self.bucket_util.verify_stats_all_buckets(expected_items,
                                                  timeout=300)
        log_failover_stats()

        data_validation = self.task.async_validate_docs(
            self.cluster, self.bucket, self.gen_validate, "create", 0,
            batch_size=10)
        self.task.jython_task_manager.get_task_result(data_validation)
        self.get_vb_details_cbstats_for_all_nodes("post_rollback")
        self.validate_seq_no_post_rollback("pre_rollback",
                                           "post_rollback")

    self.validate_test_failure()
def test_ttl_less_than_durability_timeout(self):
    """
    MB-43238
    1. Disable expiry pager (to prevent raciness)
    2. Regular write with a short TTL ('doc_ttl' seconds) for some key
    3. Wait TTL period
    4. Disable persistence on the node with the replica vBucket
       for that key
    5. SyncWrite PersistMajority to active vBucket for that key
       (should hang)
    6. Access key on other thread to trigger expiry
    7. Observe DCP connection being torn down without fix
    """
    def perform_sync_write():
        client.crud(DocLoading.Bucket.DocOps.CREATE, key, {},
                    durability=Bucket.DurabilityLevel.PERSIST_TO_MAJORITY,
                    timeout=60)

    doc_ttl = 5
    key = "test_ttl_doc"
    vb_for_key = self.bucket_util.get_vbucket_num_for_key(key)
    bucket = self.cluster.buckets[0]

    # Find target node for replica VB
    for target_node in self.cluster.nodes_in_cluster:
        cb_stats = Cbstats(target_node)
        if vb_for_key in cb_stats.vbucket_list(bucket.name, "replica"):
            break
    else:
        # Previously the last node (or None) was silently used
        self.fail("No node found with replica vBucket %s for key %s"
                  % (vb_for_key, key))

    self.log.info("Target node: %s, Key: %s" % (target_node.ip, key))
    failure = None
    shell = RemoteMachineShellConnection(target_node)
    try:
        self.log.info("Disabling expiry_pager")
        cb_ep_ctl = Cbepctl(shell)
        cb_ep_ctl.set(bucket.name, "flush_param", "exp_pager_stime", 0)

        # Create SDK client
        client = SDKClient([self.cluster.master], bucket)
        try:
            self.log.info("Non-sync write with TTL=%s" % doc_ttl)
            client.crud(DocLoading.Bucket.DocOps.CREATE, key, {},
                        exp=doc_ttl)

            self.sleep(doc_ttl, "Wait for document to expire")
            self.bucket_util._wait_for_stats_all_buckets(
                self.cluster, self.cluster.buckets)

            self.log.info(
                "Stopping persistence on replica VB node using cbepctl")
            cb_ep_ctl.persistence(bucket.name, "stop")

            # Start doc_load with lesser ttl
            doc_create_thread = Thread(target=perform_sync_write)
            try:
                doc_create_thread.start()
                self.sleep(2, "Wait for sync_write thread to start")

                self.log.info(
                    "Read key from another thread to trigger expiry")
                result = client.crud(DocLoading.Bucket.DocOps.READ, key)
                if SDKException.DocumentNotFoundException \
                        not in str(result["error"]):
                    failure = "Invalid exception: %s" % result["error"]
            finally:
                # Resume persistence before joining: the sync write is
                # expected to hang while persistence is stopped
                self.log.info("Resuming persistence on target node")
                cb_ep_ctl.persistence(bucket.name, "start")
                # Wait for doc_create_thread to complete
                doc_create_thread.join()
        finally:
            # Close SDK client even if a step above raised
            client.close()
    finally:
        shell.disconnect()

    if failure:
        self.fail(failure)

    for node in self.cluster.nodes_in_cluster:
        cb_stats = Cbstats(node).all_stats(bucket.name)
        self.log.info("Node: %s, ep_expired_access: %s"
                      % (node.ip, cb_stats["ep_expired_access"]))
        self.assertEqual(int(cb_stats["ep_expired_access"]), 0,
                         "%s: ep_expired_access != 0" % node.ip)
def test_stream_during_rollback(self):
    """
     -- Ensure creation of at least a single state file
     -- Stop persistence on master node
     -- Start load on master node(say Node A) for a given duration
        (self.duration * 60 seconds)
     -- Above step ensures creation of new state files
        (# equal to self.duration)
     -- Kill MemCached on master node(Node A)
     -- Trigger roll back on other/replica nodes
     -- START STREAMING DATA USING DCP
     -- ReStart persistence on master node
     -- Start doc loading on all the nodes(ensure creation of state file)
     -- Above two steps ensure, roll back to new snapshot
     -- Repeat all the above steps for num_rollback times
    """
    items = self.num_items
    mem_only_items = self.input.param("rollback_items", 10000)
    ops_len = len(self.doc_ops.split(":"))
    self.assertTrue(self.rest.update_autofailover_settings(False, 600),
                    "AutoFailover disabling failed")

    if self.nodes_init < 2 or self.num_replicas < 1:
        # Implicit literal concatenation instead of a backslash inside
        # the string, which leaked stray characters into the message
        self.fail("Not enough nodes/replicas in the cluster/bucket "
                  "to test rollback")

    self.duration = self.input.param("duration", 2)
    self.num_rollbacks = self.input.param("num_rollbacks", 3)

    shell = RemoteMachineShellConnection(self.cluster.master)
    try:
        cbstats = Cbstats(self.cluster.master)
        self.target_vbucket = cbstats.vbucket_list(
            self.cluster.buckets[0].name)

        ###################################################################
        # STEP - 1, Stop persistence on master node
        ###################################################################
        master_itr = 0
        for i in range(1, self.num_rollbacks + 1):
            start = items
            self.log.info("Roll back Iteration == {}".format(i))

            mem_item_count = 0

            # Stopping persistence on NodeA
            self.log.debug(
                "Iteration == {}, stopping persistence".format(i))
            Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                       "stop")

            ###############################################################
            # STEP - 2
            # -- Doc ops on master node for self.duration * 60 seconds
            # -- This step ensures new state files
            #    (number equal to self.duration)
            ###############################################################
            self.log.info(
                "Just before compute docs, iteration {}".format(i))
            self.compute_docs(start, mem_only_items)
            self.gen_create = None
            self.gen_update = None
            self.gen_delete = None
            self.gen_expiry = None
            time_end = time.time() + 60 * self.duration
            while time.time() < time_end:
                master_itr += 1
                time_start = time.time()
                mem_item_count += mem_only_items * ops_len
                self.generate_docs(doc_ops=self.doc_ops,
                                   target_vbucket=self.target_vbucket)
                self.loadgen_docs(_sync=True,
                                  retry_exceptions=self.retry_exceptions)
                if self.gen_create is not None:
                    self.create_start = self.gen_create.key_counter
                if self.gen_update is not None:
                    self.update_start = self.gen_update.key_counter
                if self.gen_delete is not None:
                    self.delete_start = self.gen_delete.key_counter
                if self.gen_expiry is not None:
                    self.expiry_start = self.gen_expiry.key_counter

                # Pad each round to 60s; computed once so the sleep
                # duration can never turn negative
                remaining = time_start + 60 - time.time()
                if remaining > 0:
                    self.sleep(
                        remaining,
                        "master_itr == {}, Sleep to ensure creation of state files for roll back,"
                        .format(master_itr))
                self.log.info("master_itr == {}, state files== {}".format(
                    master_itr, self.get_state_files(self.buckets[0])))

            ep_queue_size_map = {
                self.cluster.nodes_in_cluster[0]: mem_item_count
            }
            vb_replica_queue_size_map = {
                self.cluster.nodes_in_cluster[0]: 0
            }
            for node in self.cluster.nodes_in_cluster[1:]:
                ep_queue_size_map[node] = 0
                vb_replica_queue_size_map[node] = 0

            for bucket in self.cluster.buckets:
                self.bucket_util._wait_for_stat(bucket,
                                                ep_queue_size_map,
                                                timeout=300)
                self.bucket_util._wait_for_stat(
                    bucket, vb_replica_queue_size_map,
                    cbstat_cmd="all",
                    stat_name="vb_replica_queue_size",
                    timeout=300)

            # replica vBuckets
            for bucket in self.cluster.buckets:
                self.log.debug(cbstats.failover_stats(bucket.name))

            ###############################################################
            # STEP - 3
            # -- Kill Memcached on master node(Node A) and trigger
            #    rollback on replica/other nodes
            # -- Start streaming data (through DCP)
            ###############################################################
            shell.kill_memcached()

            self.assertTrue(
                self.bucket_util._wait_warmup_completed(
                    [self.cluster.master],
                    self.cluster.buckets[0],
                    wait_time=self.wait_timeout * 10))

            output_string = self.dcp_util.get_dcp_event()
            actual_item_count = sum(
                1 for event in output_string if 'CMD_MUTATION' in event)
            self.log.info(
                "actual_item_count is {}".format(actual_item_count))
            msg = "item count mismatch, expected {} actual {}"
            self.assertEqual(actual_item_count, self.num_items,
                             msg.format(self.num_items,
                                        actual_item_count))

            ###############################################################
            # STEP - 4
            # -- Restarting persistence on master node(Node A)
            ###############################################################
            self.log.debug(
                "Iteration=={}, Re-Starting persistence".format(i))
            Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                       "start")
            self.sleep(
                5,
                "Iteration=={}, sleep after restarting persistence"
                .format(i))

            ###############################################################
            # STEP - 5
            # -- Load Docs on all the nodes
            # -- Loading of doc for 60 seconds
            # -- Ensures creation of new state file
            ###############################################################
            if i != self.num_rollbacks:
                self.create_start = items
                self.create_end = items + 50000
                self.generate_docs(doc_ops="create", target_vbucket=None)
                _ = self.loadgen_docs(self.retry_exceptions,
                                      self.ignore_exceptions,
                                      _sync=True,
                                      doc_ops="create")
                self.bucket_util._wait_for_stats_all_buckets(
                    self.cluster, self.cluster.buckets, timeout=1200)
                items = items + 50000
                self.log.debug(
                    "Iteration == {}, Total num_items {}".format(i, items))
    finally:
        # Do not leak the SSH connection when an assertion fails
        shell.disconnect()