def collect_failovers_stats(self, buckets, servers, perNode=True):
    """
    Extract the failover stats reported by the cbstats tool.

    Parameters:
        buckets: bucket objects; only ``bucket.name`` is read
        servers: server objects; each one is handed to ``Cbstats`` and its
                 ``ip`` attribute is used as the per-node key
        perNode: if set, keep the stats of every node separate,
                 else merge the stats of all nodes into one map per bucket
                 (a later server overwrites an earlier one for the same
                 vbucket key)

    Returns:
        Failover stats as follows:
        if collecting per node     :: {bucket: {node_ip: {vb: {key: value}}}}
        if not collecting per node :: {bucket: {vb: {key: value}}}
        where ``vb`` is the vbucket id prefixed with "vb_".
    """
    bucketMap = {}
    for bucket in buckets:
        dataMap = {}
        for server in servers:
            cbstat = Cbstats(server)
            # Unpacked below as {vb_id: {"<index>:<field>" | "<field>": text}}
            stats = cbstat.failover_stats(bucket.name)
            map_data = {}
            # Index recorded from the first entry seen for each vbucket.
            # NOTE(review): it is never advanced afterwards - confirm that
            # this is the intended selection rule.
            num_map = {}
            for vb_id, vb_stats in stats.items():
                vb = 'vb_' + vb_id
                for stat_name, stat_text in vb_stats.items():
                    tokens = stat_name.split(":")
                    if len(tokens) == 2:
                        # "<index>:<field>" style key
                        num = int(tokens[0])
                        key = tokens[1]
                    else:
                        # Un-indexed key such as "num_entries"
                        num = -1
                        key = tokens[0]
                    value = stat_text.split()
                    if vb not in map_data:
                        # First entry for this vbucket seeds its map
                        map_data[vb] = {key: value[0]}
                        num_map[vb] = num
                    elif num >= num_map[vb] or key == "num_entries":
                        # Only the first whitespace-separated token is kept
                        map_data[vb][key] = value[0]
            if perNode:
                dataMap[server.ip] = map_data
            else:
                dataMap.update(map_data)
        bucketMap[bucket.name] = dataMap
    return bucketMap
def test_flush_bucket_during_rollback(self):
    '''
    Test focus: Stopping persistence one by one on all nodes,
                and trigger roll back on other nodes,
                During rollback flush the data
                Above step will be done num_rollback
                (variable defined in test) times

    STEPS:
     -- Ensure creation of at least a single state file
     -- Below steps will be repeated on all nodes, with
        stopping persistence on one at a time
     -- Stop persistence on node x
     -- Start load on node x for a given duration
        (self.duration * 60 seconds)
     -- Above step ensures creation of new state files
        (# equal to self.duration)
     -- Kill MemCached on Node x
     -- Trigger roll back on other/replica nodes
     -- ReStart persistence on Node -x
     -- Repeat all the above steps for num_rollback times
    '''
    self.assertTrue(self.rest.update_autofailover_settings(False, 600),
                    "AutoFailover disabling failed")
    items = copy.deepcopy(self.init_items_per_collection)
    mem_only_items = self.input.param("rollback_items", 10000)
    ops_len = len(self.doc_ops.split(":"))

    if self.nodes_init < 2 or self.num_replicas < 1:
        # Adjacent literals rather than a backslash continuation inside the
        # string, which leaked the source indentation into the message.
        self.fail("Not enough nodes/replicas in the cluster/bucket "
                  "to test rollback")

    self.duration = self.input.param("duration", 2)
    self.num_rollbacks = self.input.param("num_rollbacks", 3)

    #######################################################################
    # STEP - 1, Ensures creation of at least one snapshot
    #
    # To ensure at least one snapshot should get created before rollback
    # starts, we need to sleep for 60 seconds as per magma design which
    # create state file every 60s
    self.sleep(60, "Ensures creation of at least one snapshot")

    #######################################################################
    # STEP - 2, Stop persistence on node - x
    for i in range(1, self.num_rollbacks+1):
        self.log.info("Roll back Iteration == {}".format(i))
        start = items
        for x, node in enumerate(self.cluster.nodes_in_cluster):
            shell = RemoteMachineShellConnection(node)
            # try/finally so the shell is released even when an assertion
            # or a stat wait fails part-way through the iteration
            try:
                cbstats = Cbstats(shell)
                self.target_vbucket = cbstats.vbucket_list(
                    self.cluster.buckets[0].name)
                mem_item_count = 0
                # Stopping persistence on Node-x
                self.log.debug(
                    "Iteration == {}, Stopping persistence on Node-{}, ip ={}"
                    .format(i, x+1, node))
                Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                           "stop")

                ###########################################################
                # STEP - 3
                #  -- Load documents on node x for
                #     self.duration * 60 seconds
                #  -- This step ensures new state files
                #     (number equal to self.duration)
                self.compute_docs(start, mem_only_items)
                self.gen_create = None
                self.gen_update = None
                self.gen_delete = None
                self.gen_expiry = None
                time_end = time.time() + 60 * self.duration
                itr = 0
                while time.time() < time_end:
                    itr += 1
                    time_start = time.time()
                    mem_item_count += mem_only_items * ops_len
                    self.generate_docs(doc_ops=self.doc_ops,
                                       target_vbucket=self.target_vbucket)
                    self.loadgen_docs(
                        _sync=True,
                        retry_exceptions=self.retry_exceptions)
                    # Continue from where each generator stopped so the
                    # next round produces fresh keys
                    if self.gen_create is not None:
                        self.create_start = self.gen_create.key_counter
                    if self.gen_update is not None:
                        self.update_start = self.gen_update.key_counter
                    if self.gen_delete is not None:
                        self.delete_start = self.gen_delete.key_counter
                    if self.gen_expiry is not None:
                        self.expiry_start = self.gen_expiry.key_counter

                    # Pad every round to a full 60s. The remaining time is
                    # computed once so the sleep can never be negative.
                    remaining = time_start + 60 - time.time()
                    if remaining > 0:
                        self.log.info("Rollback Iteration== {}, itr== {}, Active-Node=={}, Node=={}".format(i, itr, x+1, node))
                        self.sleep(remaining,
                                   "Sleep to ensure creation of state files for roll back")
                        # NOTE(review): reads self.buckets while the rest of
                        # this test uses self.cluster.buckets - confirm both
                        # refer to the same bucket list.
                        self.log.info("state files == {}".format(
                            self.get_state_files(self.buckets[0])))

                # Node x is expected to hold every un-persisted item in its
                # disk queue; all other nodes are expected to have none
                expected_queue_size = mem_item_count
                if self.durability_level:
                    self.log.info("updating the num_items on disk check to double due to durability")
                    expected_queue_size = mem_item_count * 2
                ep_queue_size_map = {node: expected_queue_size}
                vb_replica_queue_size_map = {node: 0}

                for nod in self.cluster.nodes_in_cluster:
                    if nod != node:
                        ep_queue_size_map.update({nod: 0})
                        vb_replica_queue_size_map.update({nod: 0})

                for bucket in self.cluster.buckets:
                    self.bucket_util._wait_for_stat(bucket,
                                                    ep_queue_size_map,
                                                    timeout=1200)
                    self.bucket_util._wait_for_stat(
                        bucket, vb_replica_queue_size_map,
                        cbstat_cmd="all",
                        stat_name="vb_replica_queue_size",
                        timeout=1200)

                # Record the failover stats before memcached is killed
                for bucket in self.cluster.buckets:
                    self.log.debug(cbstats.failover_stats(bucket.name))

                ###########################################################
                # STEP - 4
                #  -- Kill Memcached on Node - x and trigger rollback
                #     on other nodes
                #  -- After 20 seconds , flush bucket
                shell.kill_memcached()
                self.sleep(20, "sleep after killing memcached")
                self.bucket_util.flush_bucket(self.cluster,
                                              self.cluster.buckets[0])

                ###########################################################
                # STEP -5
                #  -- Restarting persistence on Node -- x
                # NOTE(review): warm-up is awaited on the cluster master
                # even when memcached was killed on another node - confirm
                # this is intended.
                self.assertTrue(self.bucket_util._wait_warmup_completed(
                    [self.cluster.master],
                    self.cluster.buckets[0],
                    wait_time=self.wait_timeout * 10))

                self.log.debug("Iteration=={}, Re-Starting persistence on Node -- {}".format(i, node))
                Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                           "start")

                self.sleep(5, "Sleep after re-starting persistence, Iteration{}".format(i))
            finally:
                shell.disconnect()

        ###################################################################
        # STEP - 6
        #  -- Load Docs on all the nodes
        #  -- Loading of doc for 60 seconds
        #  -- Ensures creation of new state file
        self.create_start = 0
        self.create_end = self.init_items_per_collection
        self.generate_docs(doc_ops="create", target_vbucket=None)
        self.loadgen_docs(self.retry_exceptions, self.ignore_exceptions,
                          _sync=True, doc_ops="create")
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets,
                                                     timeout=1200)
def test_failover_log_table_updated(self):
    """
    Verifies failover table entries are updated
    when vbucket ownership changes.

    Ownership of the first four vbuckets is moved nodeA -> nodeB -> nodeA
    by stopping and failing over the owning node, loading
    ``self.num_items`` docs into each of those vbuckets on every owner.
    The test then asserts that each of those vbuckets reports two failover
    table entries and that a DCP stream from nodeA ends at seqno
    ``self.num_items * 3``.
    """
    nodeA = self.servers[0]
    nodeB = self.servers[1]

    def load_first_vbuckets(node):
        # Load self.num_items docs into each of the first four vbuckets
        # reported by `node`; return the full vbucket list it reported.
        node_vbuckets = RestConnection(node).get_vbuckets()
        for vb_info in node_vbuckets[0:4]:
            self.load_docs(node, vb_info.id, self.num_items)
        return node_vbuckets

    # load nodeA only
    load_first_vbuckets(nodeA)

    # add nodeB
    self.cluster.rebalance([nodeA], [nodeB], [])

    # stop nodeA and failover
    assert self.stop_node(0)
    self.stopped_nodes.append(0)
    self.master = nodeB
    assert self.cluster.failover([nodeB], [nodeA])
    assert self.cluster.rebalance([nodeB], [], [])

    # load nodeB only
    load_first_vbuckets(nodeB)

    # add nodeA back
    assert self.start_node(0)
    del self.stopped_nodes[0]
    rest = RestHelper(RestConnection(nodeA))
    assert rest.is_ns_server_running()
    time.sleep(10)
    self.cluster.rebalance([nodeB], [nodeA], [])

    # stop nodeB and failover
    assert self.stop_node(1)
    self.master = nodeA
    self.stopped_nodes.append(1)
    assert self.cluster.failover([nodeA], [nodeB])
    assert self.cluster.rebalance([nodeA], [], [])

    # load nodeA only
    vbuckets = load_first_vbuckets(nodeA)

    # Create connection for CbStats
    shell_conn = RemoteMachineShellConnection(self.cluster.master)
    try:
        cb_stat_obj = Cbstats(shell_conn)

        # Fetch bucket's failover stats
        bucket = self.bucket_util.buckets[0]
        stats = cb_stat_obj.failover_stats(bucket.name)

        # Fetch vbucket seqno stats. This must happen while the shell is
        # still connected: cb_stat_obj was built on top of shell_conn.
        vb_stat = cb_stat_obj.vbucket_seqno(bucket.name)
    finally:
        # Disconnect the Cbstats shell_conn only after both fetches
        shell_conn.disconnect()

    # Check failover table entries
    # NOTE(review): assumes failover_stats returns a flat mapping keyed
    # 'vb_<id>:num_entries' - confirm against the Cbstats implementation.
    for vb_info in vbuckets[0:4]:
        vb = vb_info.id
        # int() instead of the Python-2-only long(); same result here
        assert int(stats['vb_'+str(vb)+':num_entries']) == 2

        dcp_client = self.dcp_client(nodeA, PRODUCER)
        stream = dcp_client.stream_req(vb, 0, 0, self.num_items*3,
                                       vb_stat[vb]["uuid"])
        _ = stream.run()
        assert stream.last_by_seqno == self.num_items*3, \
            stream.last_by_seqno
def test_magma_rollback_n_times(self):
    """
    Trigger a rollback on the replica nodes ``num_rollbacks`` times.

    Each iteration:
     -- Stop persistence on NodeA (self.input.servers[0])
     -- Load ``rollback_items`` docs which therefore stay un-persisted
     -- Wait until the disk queue of NodeA holds those items and the
        replica queues of all nodes are drained
     -- Kill memcached on the master and wait for warm-up to complete
     -- Verify the bucket item count equals the initial self.num_items
        and validate the initially loaded docs
    """
    items = self.num_items
    mem_only_items = self.input.param("rollback_items", 100000)
    if self.nodes_init < 2 or self.num_replicas < 1:
        # Adjacent literals rather than a backslash continuation inside the
        # string, which leaked the source indentation into the message.
        self.fail("Not enough nodes/replicas in the cluster/bucket "
                  "to test rollback")
    self.num_rollbacks = self.input.param("num_rollbacks", 10)

    shell = RemoteMachineShellConnection(self.cluster_util.cluster.master)
    # try/finally so the shell is released even when a check fails
    try:
        cbstats = Cbstats(shell)
        self.target_vbucket = cbstats.vbucket_list(
            self.bucket_util.buckets[0].name)
        start = self.num_items
        # Snapshot of the initial-load generator, used for validation
        self.gen_read = copy.deepcopy(self.gen_create)

        # range (not the Python-2-only xrange), as in the sibling tests
        for _ in range(1, self.num_rollbacks + 1):
            # Stopping persistence on NodeA
            # NOTE(review): mem_client is never closed here - confirm
            # whether the helper expects an explicit close.
            mem_client = MemcachedClientHelper.direct_client(
                self.input.servers[0], self.bucket_util.buckets[0])
            mem_client.stop_persistence()

            self.gen_create = doc_generator(
                self.key, start, mem_only_items,
                doc_size=self.doc_size, doc_type=self.doc_type,
                target_vbucket=self.target_vbucket,
                vbuckets=self.cluster_util.vbuckets,
                randomize_doc_size=self.randomize_doc_size,
                randomize_value=self.randomize_value)
            self.loadgen_docs(_sync=True)
            # Next iteration continues after the keys just generated
            start = self.gen_create.key_counter

            # NodeA is expected to queue every un-persisted item; all
            # other nodes are expected to have empty queues
            ep_queue_size_map = {
                self.cluster.nodes_in_cluster[0]: mem_only_items
            }
            vb_replica_queue_size_map = {
                self.cluster.nodes_in_cluster[0]: 0
            }
            for node in self.cluster.nodes_in_cluster[1:]:
                ep_queue_size_map.update({node: 0})
                vb_replica_queue_size_map.update({node: 0})

            for bucket in self.bucket_util.buckets:
                self.bucket_util._wait_for_stat(bucket, ep_queue_size_map)
                self.bucket_util._wait_for_stat(
                    bucket, vb_replica_queue_size_map,
                    stat_name="vb_replica_queue_size")

            # Kill memcached on NodeA to trigger rollback on other Nodes
            # replica vBuckets
            for bucket in self.bucket_util.buckets:
                self.log.debug(cbstats.failover_stats(bucket.name))
            shell.kill_memcached()
            self.assertTrue(
                self.bucket_util._wait_warmup_completed(
                    [self.cluster_util.cluster.master],
                    self.bucket_util.buckets[0],
                    wait_time=self.wait_timeout * 10))
            self.sleep(10,
                       "Not Required, but waiting for 10s after warm up")

            # Only the initially loaded items are expected to remain
            self.bucket_util.verify_stats_all_buckets(items, timeout=300)
            for bucket in self.bucket_util.buckets:
                self.log.debug(cbstats.failover_stats(bucket.name))

            data_validation = self.task.async_validate_docs(
                self.cluster, self.bucket_util.buckets[0],
                self.gen_read, "create", 0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                pause_secs=5, timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(data_validation)
    finally:
        shell.disconnect()
def test_stream_during_rollback(self):
    '''
     -- Ensure creation of at least a single state file
     -- Stop persistence on master node
     -- Start load on master node(say Node A) for a given duration
        (self.duration * 60 seconds)
     -- Above step ensures creation of new state files
        (# equal to self.duration)
     -- Kill MemCached on master node(Node A)
     -- Trigger roll back on other/replica nodes
     -- START STREAMING DATA USING DCP
     -- ReStart persistence on master node
     -- Start doc loading on all the nodes(ensure creation of state file)
     -- Above two steps ensure, roll back to new snapshot
     -- Repeat all the above steps for num_rollback times
    '''
    items = self.num_items
    mem_only_items = self.input.param("rollback_items", 10000)
    ops_len = len(self.doc_ops.split(":"))
    self.assertTrue(self.rest.update_autofailover_settings(False, 600),
                    "AutoFailover disabling failed")

    if self.nodes_init < 2 or self.num_replicas < 1:
        # Adjacent literals rather than a backslash continuation inside the
        # string, which leaked the source indentation into the message.
        self.fail("Not enough nodes/replicas in the cluster/bucket "
                  "to test rollback")

    self.duration = self.input.param("duration", 2)
    self.num_rollbacks = self.input.param("num_rollbacks", 3)

    shell = RemoteMachineShellConnection(self.cluster.master)
    # try/finally so the shell is released even when a check fails
    try:
        # NOTE(review): Cbstats is built from the server object here, not
        # from `shell` - confirm which form this Cbstats version expects.
        cbstats = Cbstats(self.cluster.master)
        self.target_vbucket = cbstats.vbucket_list(
            self.cluster.buckets[0].name)

        ###################################################################
        # STEP - 1, Stop persistence on master node
        master_itr = 0
        for i in range(1, self.num_rollbacks + 1):
            start = items
            self.log.info("Roll back Iteration == {}".format(i))

            mem_item_count = 0

            # Stopping persistence on NodeA
            self.log.debug(
                "Iteration == {}, stopping persistence".format(i))
            Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                       "stop")

            ###############################################################
            # STEP - 2
            #  -- Doc ops on master node for self.duration * 60 seconds
            #  -- This step ensures new state files
            #     (number equal to self.duration)
            self.log.info(
                "Just before compute docs, iteration {}".format(i))
            self.compute_docs(start, mem_only_items)
            self.gen_create = None
            self.gen_update = None
            self.gen_delete = None
            self.gen_expiry = None
            time_end = time.time() + 60 * self.duration
            while time.time() < time_end:
                master_itr += 1
                time_start = time.time()
                mem_item_count += mem_only_items * ops_len
                self.generate_docs(doc_ops=self.doc_ops,
                                   target_vbucket=self.target_vbucket)
                self.loadgen_docs(_sync=True,
                                  retry_exceptions=self.retry_exceptions)
                # Continue from where each generator stopped so the next
                # round produces fresh keys
                if self.gen_create is not None:
                    self.create_start = self.gen_create.key_counter
                if self.gen_update is not None:
                    self.update_start = self.gen_update.key_counter
                if self.gen_delete is not None:
                    self.delete_start = self.gen_delete.key_counter
                if self.gen_expiry is not None:
                    self.expiry_start = self.gen_expiry.key_counter

                # Pad every round to a full 60s. The remaining time is
                # computed once so the sleep can never be negative.
                remaining = time_start + 60 - time.time()
                if remaining > 0:
                    self.sleep(
                        remaining,
                        "master_itr == {}, Sleep to ensure creation of state files for roll back,"
                        .format(master_itr))
                    # NOTE(review): reads self.buckets while the rest of
                    # this test uses self.cluster.buckets - confirm both
                    # refer to the same bucket list.
                    self.log.info(
                        "master_itr == {}, state files== {}".format(
                            master_itr,
                            self.get_state_files(self.buckets[0])))

            # The first node is expected to queue every un-persisted item;
            # all other nodes are expected to have empty queues
            ep_queue_size_map = {
                self.cluster.nodes_in_cluster[0]: mem_item_count
            }
            vb_replica_queue_size_map = {
                self.cluster.nodes_in_cluster[0]: 0
            }
            for node in self.cluster.nodes_in_cluster[1:]:
                ep_queue_size_map.update({node: 0})
                vb_replica_queue_size_map.update({node: 0})

            for bucket in self.cluster.buckets:
                self.bucket_util._wait_for_stat(bucket, ep_queue_size_map,
                                                timeout=300)
                self.bucket_util._wait_for_stat(
                    bucket, vb_replica_queue_size_map,
                    cbstat_cmd="all",
                    stat_name="vb_replica_queue_size",
                    timeout=300)

            # Record the failover stats before memcached is killed
            for bucket in self.cluster.buckets:
                self.log.debug(cbstats.failover_stats(bucket.name))

            ###############################################################
            # STEP - 3
            #  -- Kill Memcached on master node(Node A) and trigger
            #     rollback on replica/other nodes
            #  -- Start streaming data (through DCP)
            shell.kill_memcached()

            self.assertTrue(
                self.bucket_util._wait_warmup_completed(
                    [self.cluster.master],
                    self.cluster.buckets[0],
                    wait_time=self.wait_timeout * 10))

            output_string = self.dcp_util.get_dcp_event()
            # Count the DCP events that are mutations
            actual_item_count = sum(
                1 for event in output_string if 'CMD_MUTATION' in event)
            self.log.info(
                "actual_item_count is {}".format(actual_item_count))
            msg = "item count mismatch, expected {} actual {}"
            self.assertEqual(actual_item_count, self.num_items,
                             msg.format(self.num_items, actual_item_count))

            ###############################################################
            # STEP -4
            #  -- Restarting persistence on master node(Node A)
            self.log.debug(
                "Iteration=={}, Re-Starting persistence".format(i))
            Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                       "start")
            self.sleep(
                5,
                "Iteration=={}, sleep after restarting persistence"
                .format(i))

            ###############################################################
            # STEP - 5
            #  -- Load Docs on all the nodes
            #  -- Loading of doc for 60 seconds
            #  -- Ensures creation of new state file
            # Skipped after the last rollback: nothing follows it.
            if i != self.num_rollbacks:
                self.create_start = items
                self.create_end = items + 50000
                self.generate_docs(doc_ops="create", target_vbucket=None)
                self.loadgen_docs(self.retry_exceptions,
                                  self.ignore_exceptions,
                                  _sync=True, doc_ops="create")
                self.bucket_util._wait_for_stats_all_buckets(
                    self.cluster, self.cluster.buckets, timeout=1200)
                items = items + 50000
                self.log.debug(
                    "Iteration == {}, Total num_items {}".format(i, items))
    finally:
        shell.disconnect()