def create(self, action=None, bucket_name="default"):
    self.log.info("Simulating '{0}' in {1}".format(action,
                                                   self.shell_conn.ip))
    if action == CouchbaseError.STOP_MEMCACHED:
        _, error = self.__interrupt_process("memcached", "stop")
        self.__handle_shell_error(error)
    elif action == CouchbaseError.KILL_MEMCACHED:
        _, error = self.__interrupt_process("memcached", "kill")
        self.__handle_shell_error(error)
    elif action == CouchbaseError.STOP_BEAMSMP:
        _, error = self.__interrupt_process("beam.smp", "stop")
        self.__handle_shell_error(error)
    elif action == CouchbaseError.STOP_PROMETHEUS:
        _, error = self.__interrupt_process("prometheus", "stop")
        self.__handle_shell_error(error)
    elif action == CouchbaseError.KILL_BEAMSMP:
        _, error = self.__interrupt_process("beam.smp", "kill")
        self.__handle_shell_error(error)
    elif action == CouchbaseError.KILL_PROMETHEUS:
        _, error = self.__interrupt_process("prometheus", "kill")
        self.__handle_shell_error(error)
    elif action == CouchbaseError.STOP_SERVER:
        self.shell_conn.stop_server()
    elif action == CouchbaseError.STOP_PERSISTENCE:
        cbepctl_obj = Cbepctl(self.shell_conn)
        cbepctl_obj.persistence(bucket_name, "stop")
    else:
        self.log.error("Unsupported action: '{0}'".format(action))
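# Illustrative sketch only: __interrupt_process (the private helper used above)
# is assumed to map "stop"/"kill"/"resume" onto SIGSTOP/SIGKILL/SIGCONT sent to
# the named process over the same shell connection, returning the usual
# (output, error) pair. The helper body and command below are assumptions about
# the semantics, not the actual implementation, so they are left commented out.
#
# def __interrupt_process(self, process_name, action):
#     signal_map = {"stop": "SIGSTOP", "kill": "SIGKILL", "resume": "SIGCONT"}
#     cmd = "kill -s {0} $(pgrep {1})".format(signal_map[action], process_name)
#     return self.shell_conn.execute_command(cmd)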
def test_rollback_and_persistence_race_condition(self):
    cluster = self.cluster
    gen_load = doc_generator(self.key, 0, self.num_items)
    for bucket in self.bucket_util.buckets:
        task = self.task.async_load_gen_docs(
            self.cluster, bucket, gen_load, "create", 0,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            timeout_secs=self.sdk_timeout, retries=self.sdk_retries)
        self.task.jython_task_manager.get_task_result(task)

    # Stop persistence
    for server in cluster.servers[:self.nodes_init]:
        # Create cbepctl command object
        node_shell_conn = RemoteMachineShellConnection(server)
        cbepctl_obj = Cbepctl(node_shell_conn)

        for bucket in self.bucket_util.buckets:
            cbepctl_obj.persistence(bucket.name, "stop")

        # Disconnect the shell_connection
        node_shell_conn.disconnect()

    self.sleep(10, "Wait after stop_persistence")

    # More (non-intersecting) load
    gen_load = doc_generator(self.key, 0, self.num_items, doc_size=64)
    for bucket in self.bucket_util.buckets:
        task = self.task.async_load_gen_docs(
            self.cluster, bucket, gen_load, "create", 0,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            timeout_secs=self.sdk_timeout, retries=self.sdk_retries)
        self.task.jython_task_manager.get_task_result(task)

    shell = RemoteMachineShellConnection(cluster.servers[0])
    shell.kill_memcached()
    self.sleep(10, "Wait after kill memcached")

    node1_shell_conn = RemoteMachineShellConnection(cluster.servers[0])
    node2_shell_conn = RemoteMachineShellConnection(cluster.servers[1])
    node1_cb_stat_obj = Cbstats(node1_shell_conn)
    node2_cb_stat_obj = Cbstats(node2_shell_conn)

    node1_items = node1_cb_stat_obj.all_stats(bucket, "curr_items_tot")
    node2_items = node2_cb_stat_obj.all_stats(bucket, "curr_items_tot")

    # Disconnect the opened connections
    node1_shell_conn.disconnect()
    node2_shell_conn.disconnect()

    self.assertTrue(node1_items == node2_items,
                    'Node items not equal. Node 1:{0}, node 2:{1}'
                    .format(node1_items, node2_items))
def test_rollback_to_zero(self):
    items = self.num_items
    mem_only_items = self.input.param("rollback_items", 10000)
    if self.nodes_init < 2 or self.num_replicas < 1:
        self.fail("Not enough nodes/replicas to test rollback")

    # Fetch vbucket stats for validation
    self.get_vb_details_cbstats_for_all_nodes("pre_rollback")

    start = self.num_items
    shell = self.node_shells[self.cluster.master]["shell"]
    cbstats = self.node_shells[self.cluster.master]["cbstat"]
    self.target_vbucket = cbstats.vbucket_list(self.bucket.name)

    # Stopping persistence on NodeA
    cbepctl = Cbepctl(shell)
    cbepctl.persistence(self.bucket.name, "stop")

    for i in xrange(1, self.num_rollbacks + 1):
        self.gen_create = doc_generator(
            self.key, start, mem_only_items,
            doc_size=self.doc_size, doc_type=self.doc_type,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets,
            randomize_doc_size=self.randomize_doc_size,
            randomize_value=self.randomize_value)
        self.load_docs()
        if self.rollback_with_multiple_mutation:
            self.doc_ops = "update"
            self.load_docs()
        start = self.gen_create.key_counter

        stat_map = {self.cluster.nodes_in_cluster[0]: mem_only_items * i}
        for node in self.cluster.nodes_in_cluster[1:]:
            stat_map.update({node: 0})

        for bucket in self.bucket_util.buckets:
            self.bucket_util._wait_for_stat(bucket, stat_map)
        self.sleep(60)

    self.get_vb_details_cbstats_for_all_nodes("post_rollback")
    self.validate_seq_no_post_rollback("pre_rollback", "post_rollback")

    shell.kill_memcached()
    self.assertTrue(self.bucket_util._wait_warmup_completed(
        [self.cluster_util.cluster.master],
        self.bucket,
        wait_time=self.wait_timeout * 10))
    self.bucket_util.verify_stats_all_buckets(items)
    shell.disconnect()
    self.validate_test_failure()
def revert(self, action=None, bucket_name="default"):
    self.log.info("Reverting '{0}' in {1}".format(action,
                                                  self.shell_conn.ip))
    if action == CouchbaseError.STOP_MEMCACHED:
        _, error = self.__interrupt_process("memcached", "resume")
        self.__handle_shell_error(error)
    elif action == CouchbaseError.STOP_BEAMSMP:
        _, error = self.__interrupt_process("beam.smp", "resume")
        self.__handle_shell_error(error)
    elif action == CouchbaseError.KILL_BEAMSMP \
            or action == CouchbaseError.STOP_SERVER:
        self.shell_conn.start_server()
    elif action == CouchbaseError.STOP_PERSISTENCE:
        cbepctl_obj = Cbepctl(self.shell_conn)
        cbepctl_obj.persistence(bucket_name, "start")
    else:
        self.log.error(
            "Unsupported action to revert: '{0}'".format(action))
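# Minimal usage sketch (assumed wiring, not taken from this file): create() and
# revert() are intended to be used as a pair so the simulated failure is always
# cleaned up. The constructor arguments shown here are assumptions.
#
#   shell = RemoteMachineShellConnection(node)
#   error_sim = CouchbaseError(self.log, shell)
#   error_sim.create(CouchbaseError.STOP_PERSISTENCE, bucket_name=bucket.name)
#   try:
#       pass  # run mutations while persistence is stopped
#   finally:
#       error_sim.revert(CouchbaseError.STOP_PERSISTENCE,
#                        bucket_name=bucket.name)
#       shell.disconnect()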
def replicate_correct_data_after_rollback(self):
    '''
    @attention: This test case has some issue with docker runs.
                It passes without any issue on VMs.
    '''

    bucket = self.bucket_util.buckets[0]
    cluster = self.cluster

    gen_load = doc_generator(self.key, 0, self.num_items)
    for bucket in self.bucket_util.buckets:
        task = self.task.async_load_gen_docs(
            self.cluster, bucket, gen_load, "create", 0,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            timeout_secs=self.sdk_timeout, retries=self.sdk_retries)
        self.task.jython_task_manager.get_task_result(task)

    # Store the KVs which were modified and active on node 1
    modified_kvs_active_on_node1 = dict()
    vbucket_client = VBucketAwareMemcached(
        RestConnection(cluster.master), bucket.name)
    client = MemcachedClientHelper.direct_client(cluster.servers[0],
                                                 bucket.name)
    for i in range(self.num_items/100):
        keyname = 'keyname-' + str(i)
        vbId = self.bucket_util.get_vbucket_num_for_key(keyname,
                                                        self.vbuckets)
        if vbucket_client.vBucketMap[vbId].split(':')[0] \
                == cluster.servers[0].ip:
            rc = client.get(keyname)
            modified_kvs_active_on_node1[keyname] = rc[2]

    # Stop persistence
    for server in cluster.servers[:self.nodes_init]:
        # Create cbepctl command object
        node_shell_conn = RemoteMachineShellConnection(server)
        cbepctl_obj = Cbepctl(node_shell_conn)

        for bucket in self.bucket_util.buckets:
            cbepctl_obj.persistence(bucket.name, "stop")

        # Disconnect the shell_connection
        node_shell_conn.disconnect()

    # Modify less than 1/2 of the keys
    gen_load = doc_generator(self.key, 0, self.num_items/100)
    rc = self.cluster.load_gen_docs(
        cluster.servers[0], bucket.name, gen_load,
        bucket.kvs[1], "create", exp=0, flag=0, batch_size=10,
        compression=self.sdk_compression)

    # Kill memcached. When it comes back, because persistence is disabled,
    # it will have lost the second set of mutations
    shell = RemoteMachineShellConnection(cluster.servers[0])
    shell.kill_memcached()
    self.sleep(10, "Sleep after kill memcached")

    # Start persistence on the second node
    # Create cbepctl command object
    node_shell_conn = RemoteMachineShellConnection(cluster.servers[1])
    cbepctl_obj = Cbepctl(node_shell_conn)

    for bucket in self.bucket_util.buckets:
        cbepctl_obj.persistence(bucket.name, "start")

    # Disconnect the shell_connection
    node_shell_conn.disconnect()

    self.sleep(10, "Sleep after start persistence")

    # Failover to the second node
    rc = self.cluster.failover(cluster.servers, cluster.servers[1:2],
                               graceful=True)
    self.sleep(30, "Sleep after node failover triggered")

    # Values should be what they were prior to the second update
    client = MemcachedClientHelper.direct_client(cluster.servers[0],
                                                 bucket.name)
    for k, v in modified_kvs_active_on_node1.iteritems():
        rc = client.get(k)
        self.assertTrue(v == rc[2], 'Expected {0}, actual {1}'
                        .format(v, rc[2]))

    # Need to rebalance the node back into the cluster
    # def rebalance(self, servers, to_add, to_remove, timeout=None,
    #               use_hostnames=False, services=None):
    rest_obj = RestConnection(cluster.servers[0])
    nodes_all = rest_obj.node_statuses()
    for node in nodes_all:
        if node.ip == cluster.servers[1].ip:
            break

    node_id_for_recovery = node.id
    status = rest_obj.add_back_node(node_id_for_recovery)
    if status:
        rest_obj.set_recovery_type(node_id_for_recovery,
                                   recoveryType='delta')
    rc = self.cluster.rebalance(cluster.servers[:self.nodes_init], [], [])
def test_rollback_n_times(self):
    items = self.num_items
    mem_only_items = self.input.param("rollback_items", 100)
    if self.nodes_init < 2 or self.num_replicas < 1:
        self.fail("Not enough nodes/replicas to test rollback")

    # Fetch vbucket stats for validation
    self.get_vb_details_cbstats_for_all_nodes("pre_rollback")

    shell = self.node_shells[self.cluster.master]["shell"]
    cbstats = self.node_shells[self.cluster.master]["cbstat"]
    self.target_vbucket = cbstats.vbucket_list(self.bucket.name)
    start = self.num_items
    self.gen_validate = self.gen_create

    for _ in xrange(1, self.num_rollbacks + 1):
        # Stopping persistence on NodeA
        cbepctl = Cbepctl(shell)
        cbepctl.persistence(self.bucket.name, "stop")

        self.gen_create = doc_generator(
            self.key, start, mem_only_items,
            doc_size=self.doc_size, doc_type=self.doc_type,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets,
            randomize_doc_size=self.randomize_doc_size,
            randomize_value=self.randomize_value)
        self.load_docs()
        if self.rollback_with_multiple_mutation:
            self.doc_ops = "update"
            self.load_docs()
        start = self.gen_create.key_counter

        ep_queue_size_map = {
            self.cluster.nodes_in_cluster[0]: mem_only_items}
        vb_replica_queue_size_map = {self.cluster.nodes_in_cluster[0]: 0}
        for node in self.cluster.nodes_in_cluster[1:]:
            ep_queue_size_map.update({node: 0})
            vb_replica_queue_size_map.update({node: 0})

        for bucket in self.bucket_util.buckets:
            self.bucket_util._wait_for_stat(bucket, ep_queue_size_map)
            self.bucket_util._wait_for_stat(
                bucket, vb_replica_queue_size_map,
                stat_name="vb_replica_queue_size")

        # Kill memcached on NodeA to trigger rollback on other Nodes'
        # replica vBuckets
        for bucket in self.bucket_util.buckets:
            self.log.debug(cbstats.failover_stats(bucket.name))

        shell.kill_memcached()
        self.assertTrue(self.bucket_util._wait_warmup_completed(
            [self.cluster_util.cluster.master],
            self.bucket,
            wait_time=self.wait_timeout * 10))
        self.sleep(10, "Wait after warmup complete. Not required !!")
        self.bucket_util.verify_stats_all_buckets(items, timeout=300)
        for bucket in self.bucket_util.buckets:
            self.log.debug(cbstats.failover_stats(bucket.name))

    data_validation = self.task.async_validate_docs(
        self.cluster, self.bucket, self.gen_validate, "create", 0,
        batch_size=10)
    self.task.jython_task_manager.get_task_result(data_validation)

    self.get_vb_details_cbstats_for_all_nodes("post_rollback")
    self.validate_seq_no_post_rollback("pre_rollback", "post_rollback")

    self.validate_test_failure()
def test_ttl_less_than_durability_timeout(self):
    """
    MB-43238
    1. Regular write with a short TTL for some key
    2. Disable expiry pager (to prevent raciness)
    3. Wait TTL period
    4. Disable persistence on the node with the replica vBucket
       for that key
    5. SyncWrite PersistMajority to active vBucket for that key
       (should hang)
    6. Access key on other thread to trigger expiry
    7. Observe DCP connection being torn down without fix
    """

    def perform_sync_write():
        client.crud(DocLoading.Bucket.DocOps.CREATE, key, {},
                    durability=Bucket.DurabilityLevel.PERSIST_TO_MAJORITY,
                    timeout=60)

    doc_ttl = 5
    target_node = None
    key = "test_ttl_doc"
    vb_for_key = self.bucket_util.get_vbucket_num_for_key(key)
    bucket = self.cluster.buckets[0]

    # Find target node hosting the replica vBucket for the key
    for target_node in self.cluster.nodes_in_cluster:
        cb_stats = Cbstats(target_node)
        if vb_for_key in cb_stats.vbucket_list(bucket.name, "replica"):
            break

    self.log.info("Target node: %s, Key: %s" % (target_node.ip, key))
    self.log.info("Disabling expiry_pager")
    shell = RemoteMachineShellConnection(target_node)
    cb_ep_ctl = Cbepctl(shell)
    cb_ep_ctl.set(bucket.name, "flush_param", "exp_pager_stime", 0)

    # Create SDK client
    client = SDKClient([self.cluster.master], bucket)

    self.log.info("Non-sync write with TTL=%s" % doc_ttl)
    client.crud(DocLoading.Bucket.DocOps.CREATE, key, {}, exp=doc_ttl)
    self.sleep(doc_ttl, "Wait for document to expire")

    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)

    self.log.info("Stopping persistence on replica VB node using cbepctl")
    cb_ep_ctl.persistence(bucket.name, "stop")

    # Start the sync_write on a separate thread
    # (expected to block until persistence resumes)
    doc_create_thread = Thread(target=perform_sync_write)
    doc_create_thread.start()
    self.sleep(2, "Wait for sync_write thread to start")

    self.log.info("Read key from another thread to trigger expiry")
    failure = None
    result = client.crud(DocLoading.Bucket.DocOps.READ, key)
    if SDKException.DocumentNotFoundException not in str(result["error"]):
        failure = "Invalid exception: %s" % result["error"]

    self.log.info("Resuming persistence on target node")
    cb_ep_ctl.persistence(bucket.name, "start")

    # Wait for doc_create_thread to complete
    doc_create_thread.join()

    # Close SDK client and shell connections
    client.close()
    shell.disconnect()

    if failure:
        self.fail(failure)

    for node in self.cluster.nodes_in_cluster:
        cb_stats = Cbstats(node).all_stats(bucket.name)
        self.log.info("Node: %s, ep_expired_access: %s"
                      % (node.ip, cb_stats["ep_expired_access"]))
        self.assertEqual(int(cb_stats["ep_expired_access"]), 0,
                         "%s: ep_expired_access != 0" % node.ip)