def setUp(self):
    super(OpsChangeCasTests, self).setUp()
    self.key = "test_cas"
    self.expire_time = self.input.param("expire_time", 35)
    self.item_flag = self.input.param("item_flag", 0)
    self.load_gen = doc_generator(self.key, 0, self.num_items,
                                  doc_size=self.doc_size)
    self.node_data = dict()
    for node in self.cluster_util.get_kv_nodes():
        shell = RemoteMachineShellConnection(node)
        cb_stat = Cbstats(shell)
        self.node_data[node.ip] = dict()
        self.node_data[node.ip]["shell"] = shell
        # Reuse the Cbstats object instead of creating a second one
        self.node_data[node.ip]["cb_stat"] = cb_stat
        self.node_data[node.ip]["active"] = cb_stat.vbucket_list(
            self.bucket.name, "active")
        self.node_data[node.ip]["replica"] = cb_stat.vbucket_list(
            self.bucket.name, "replica")
    if self.sdk_client_pool:
        self.client = self.sdk_client_pool.get_client_for_bucket(
            self.bucket)
    else:
        self.client = SDKClient([self.cluster.master], self.bucket)
def collect_vbucket_num_stats(self, servers, buckets):
    """
    Method to extract the active/replica vbucket counts
    reported by the cbstats tool

    Parameters:
      buckets: bucket information
      servers: server information

    Returns:
      Two maps (active, replica), each of the form:
        {bucket_name: {server_ip: vbucket_count}}
    """
    active_bucketMap = {}
    replica_bucketMap = {}
    for bucket in buckets:
        active_map_data = {}
        replica_map_data = {}
        for server in servers:
            cbstat = Cbstats(server)
            stats = cbstat.vbucket_list(bucket.name)
            active_map_data[server.ip] = len(stats)
            stats = cbstat.vbucket_list(bucket.name,
                                        vbucket_type="replica")
            replica_map_data[server.ip] = len(stats)
        active_bucketMap[bucket.name] = active_map_data
        replica_bucketMap[bucket.name] = replica_map_data
    return active_bucketMap, replica_bucketMap
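# Illustrative usage of the helper above (hypothetical IPs, counts and
# bucket name, not part of the suite): with two KV nodes and 1024
# vbuckets split evenly, the returned maps would look like:
#   active, replica = self.collect_vbucket_num_stats(servers, buckets)
#   # active  == {"default": {"10.112.0.1": 512, "10.112.0.2": 512}}
#   # replica == {"default": {"10.112.0.1": 512, "10.112.0.2": 512}}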
def online_swap(self, node_to_upgrade, version,
                install_on_spare_node=True):
    vb_details = dict()
    vb_verification = dict()
    vb_types = ["active", "replica"]

    # Fetch active services on node_to_upgrade
    rest = self.__get_rest_node(node_to_upgrade)
    services = rest.get_nodes_services()
    services_on_target_node = services[
        node_to_upgrade.ip + ":" + node_to_upgrade.port]

    # Record vbuckets in swap_node
    if CbServer.Services.KV in services_on_target_node:
        cbstats = Cbstats(node_to_upgrade)
        for vb_type in vb_types:
            vb_details[vb_type] = \
                cbstats.vbucket_list(self.bucket.name, vb_type)

    if install_on_spare_node:
        # Install target version on spare node
        self.install_version_on_node([self.spare_node], version)

    # Perform swap rebalance for node_to_upgrade <-> spare_node
    rebalance_passed = self.task.rebalance(
        self.cluster_util.get_nodes(self.cluster.master),
        to_add=[self.spare_node],
        to_remove=[node_to_upgrade],
        check_vbucket_shuffling=False,
        services=[",".join(services_on_target_node)])
    if not rebalance_passed:
        self.log_failure("Swap rebalance failed during upgrade of {0}"
                         .format(node_to_upgrade))

    # VBuckets shuffling verification
    if CbServer.Services.KV in services_on_target_node:
        # Fetch vbucket stats after swap rebalance for verification
        cbstats = Cbstats(self.spare_node)
        for vb_type in vb_types:
            vb_verification[vb_type] = \
                cbstats.vbucket_list(self.bucket.name, vb_type)

        # Check whether vbuckets got shuffled.
        # Note: list.sort() returns None, so comparing its return
        # values would always pass; compare sorted copies instead.
        for vb_type in vb_types:
            if sorted(vb_details[vb_type]) \
                    != sorted(vb_verification[vb_type]):
                self.log_failure("%s vbuckets shuffled post swap_rebalance"
                                 % vb_type)
                self.log.error("%s vbuckets before vs after: %s != %s"
                               % (vb_type,
                                  vb_details[vb_type],
                                  vb_verification[vb_type]))

    # Update master node
    self.cluster.master = self.spare_node
    self.cluster.nodes_in_cluster.append(self.spare_node)

    # Update spare_node to the rebalanced-out node
    self.spare_node = node_to_upgrade
    self.cluster.nodes_in_cluster.remove(node_to_upgrade)
def get_vbucket_type_mapping(self, bucket_name):
    for node in self.vbs_in_node.keys():
        cb_stat = Cbstats(self.vbs_in_node[node]["shell"])
        self.vbs_in_node[node]["active"] = \
            cb_stat.vbucket_list(bucket_name, "active")
        self.vbs_in_node[node]["replica"] = \
            cb_stat.vbucket_list(bucket_name, "replica")
def setUp(self):
    super(OutOfOrderReturns, self).setUp()

    self.ooo_order = 0
    self.test_lock = Lock()
    self.doc_ops = self.input.param("doc_ops", "update;update").split(";")

    # Initialize cluster using given nodes
    nodes_init = self.cluster.servers[1:self.nodes_init] \
        if self.nodes_init != 1 else []
    self.task.rebalance([self.cluster.master], nodes_init, [])
    self.cluster.nodes_in_cluster.extend([self.cluster.master]
                                         + nodes_init)

    # Disable auto-failover to avoid failover of nodes
    status = RestConnection(self.cluster.master) \
        .update_autofailover_settings(False, 120, False)
    self.assertTrue(status, msg="Failure during disabling auto-failover")

    # Create default bucket and add rbac user
    self.bucket_util.create_default_bucket(
        bucket_type=self.bucket_type,
        storage=self.bucket_storage,
        ram_quota=self.bucket_size,
        replica=self.num_replicas,
        compression_mode=self.compression_mode,
        eviction_policy=self.bucket_eviction_policy)
    # Note: master is already part of nodes_in_cluster (added above),
    # so it is not appended a second time here
    self.bucket = self.bucket_util.buckets[0]

    # Create sdk_clients for pool
    if self.sdk_client_pool:
        self.log.info("Creating SDK client pool")
        self.sdk_client_pool.create_clients(
            self.bucket,
            self.cluster.nodes_in_cluster,
            req_clients=self.sdk_pool_capacity,
            compression_settings=self.sdk_compression)

    # Create shell connection to each kv_node for cbstat object
    self.kv_nodes = self.cluster_util.get_kv_nodes()
    self.node_data = dict()
    for node in self.kv_nodes:
        shell = RemoteMachineShellConnection(node)
        cb_stat = Cbstats(shell)
        self.node_data[node] = dict()
        self.node_data[node]["shell"] = shell
        self.node_data[node]["cb_stat"] = cb_stat
        self.node_data[node]["active_vbs"] = \
            cb_stat.vbucket_list(self.bucket.name,
                                 vbucket_type="active")
        self.node_data[node]["replica_vbs"] = \
            cb_stat.vbucket_list(self.bucket.name,
                                 vbucket_type="replica")

    # Print cluster & bucket stats
    self.cluster_util.print_cluster_stats()
    self.bucket_util.print_bucket_stats()
def rebalance_out_with_warming_up(self):
    master_restart = self.input.param("master_restart", False)
    if master_restart:
        warmup_node = self.cluster.master
    else:
        warmup_node = self.cluster.servers[
            len(self.cluster.servers) - self.nodes_out - 1]
    servs_out = self.cluster.servers[
        len(self.cluster.servers) - self.nodes_out:]

    if self.test_abort_snapshot:
        self.log.info("Creating sync_write abort scenario for replica vbs")
        for server in self.cluster_util.get_kv_nodes(self.cluster):
            ssh_shell = RemoteMachineShellConnection(server)
            cbstats = Cbstats(ssh_shell)
            replica_vbs = cbstats.vbucket_list(
                self.cluster.buckets[0].name, "replica")
            load_gen = doc_generator(self.key, 0, 5000,
                                     target_vbucket=replica_vbs)
            success = self.bucket_util.load_durable_aborts(
                ssh_shell, [load_gen], self.cluster.buckets[0],
                self.durability_level, "update", "all_aborts")
            if not success:
                self.log_failure("Simulating aborts failed")
            ssh_shell.disconnect()
        self.validate_test_failure()

    shell = RemoteMachineShellConnection(warmup_node)
    shell.stop_couchbase()
    self.sleep(20)
    shell.start_couchbase()
    shell.disconnect()

    # Workaround for Eph case (MB-44682 - Not a bug)
    if self.bucket_type == Bucket.Type.EPHEMERAL:
        self.sleep(15, "Wait for couchbase server to start")

    rebalance = self.task.async_rebalance(
        self.cluster.servers, [], servs_out)
    self.task.jython_task_manager.get_task_result(rebalance)
    self.cluster.nodes_in_cluster = \
        list(set(self.cluster.nodes_in_cluster) - set(servs_out))
    # The first rebalance may fail while the node is still warming up.
    # Retry once after warmup completes instead of asserting on the
    # first attempt (asserting here would make the retry unreachable).
    if rebalance.result is False:
        self.log.info("Rebalance failed as expected")
        self.assertTrue(self.bucket_util._wait_warmup_completed(
            self.cluster_util.get_kv_nodes(self.cluster),
            self.cluster.buckets[0],
            wait_time=self.wait_timeout * 10))

        self.log.info("Second attempt to rebalance")
        rebalance = self.task.async_rebalance(
            self.cluster.servers, [], servs_out)
        self.task.jython_task_manager.get_task_result(rebalance)
        self.assertTrue(rebalance.result, "Rebalance attempt failed again")
        self.cluster.nodes_in_cluster = \
            list(set(self.cluster.nodes_in_cluster) - set(servs_out))

    if not self.atomicity:
        self.bucket_util.verify_cluster_stats(
            self.cluster, self.num_items, timeout=self.wait_timeout)
        self.bucket_util.verify_unacked_bytes_all_buckets(self.cluster)
def load_docs_in_cb_bucket_before_and_after_cbas_connect(self):
    self.setup_for_test()

    # Load more docs in Couchbase bucket.
    self.perform_doc_ops_in_all_cb_buckets("create", self.num_items,
                                           self.num_items * 2)
    self.bucket_util.verify_stats_all_buckets(self.num_items * 2)

    if self.test_abort_snapshot:
        self.log.info("Creating sync_write aborts after dataset connect")
        for server in self.cluster_util.get_kv_nodes():
            ssh_shell = RemoteMachineShellConnection(server)
            cbstats = Cbstats(ssh_shell)
            replica_vbs = cbstats.vbucket_list(
                self.bucket_util.buckets[0].name, "replica")
            load_gen = doc_generator("test_abort_key", self.num_items,
                                     self.num_items,
                                     target_vbucket=replica_vbs)
            success = self.bucket_util.load_durable_aborts(
                ssh_shell, [load_gen], self.bucket_util.buckets[0],
                self.durability_level, "update", "all_aborts")
            if not success:
                self.log_failure("Simulating aborts failed")
            ssh_shell.disconnect()
        self.validate_test_failure()

    # Validate no. of items in CBAS dataset
    if not self.cbas_util.validate_cbas_dataset_items_count(
            self.cbas_dataset_name, self.num_items * 2):
        self.fail("No. of items in CBAS dataset does not match "
                  "that in the CB bucket")
def get_vbucket_info_from_failover_nodes(self):
    """
    Fetch active/replica vbucket list from the nodes
    which are going to be failed over
    """
    bucket = self.bucket_util.buckets[0]
    # Reset the values
    self.active_vb_in_failover_nodes = list()
    self.replica_vb_in_failover_nodes = list()

    # Fetch new vbucket list
    for node in self.server_to_fail:
        shell_conn = RemoteMachineShellConnection(node)
        cbstat = Cbstats(shell_conn)
        self.active_vb_in_failover_nodes += cbstat.vbucket_list(
            bucket.name, "active")
        self.replica_vb_in_failover_nodes += cbstat.vbucket_list(
            bucket.name, "replica")
        # Close the shell connection opened for this node
        shell_conn.disconnect()
def setUp(self):
    super(OutOfOrderReturns, self).setUp()

    # Create default bucket
    self.create_bucket(self.cluster)

    self.ooo_order = 0
    self.test_lock = Lock()
    self.doc_ops = self.input.param("doc_ops", "update;update").split(";")

    # Disable auto-failover to avoid failover of nodes
    status = RestConnection(self.cluster.master) \
        .update_autofailover_settings(False, 120, False)
    self.assertTrue(status, msg="Failure during disabling auto-failover")

    self.cluster.nodes_in_cluster.extend([self.cluster.master])
    self.bucket = self.cluster.buckets[0]

    # Create sdk_clients for pool
    if self.sdk_client_pool:
        self.log.info("Creating SDK client pool")
        self.sdk_client_pool.create_clients(
            self.bucket,
            self.cluster.nodes_in_cluster,
            req_clients=self.sdk_pool_capacity,
            compression_settings=self.sdk_compression)

    # Create shell connection to each kv_node for cbstat object
    self.kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)
    self.node_data = dict()
    for node in self.kv_nodes:
        shell = RemoteMachineShellConnection(node)
        cb_stat = Cbstats(shell)
        self.node_data[node] = dict()
        self.node_data[node]["shell"] = shell
        self.node_data[node]["cb_stat"] = cb_stat
        self.node_data[node]["active_vbs"] = \
            cb_stat.vbucket_list(self.bucket.name,
                                 vbucket_type="active")
        self.node_data[node]["replica_vbs"] = \
            cb_stat.vbucket_list(self.bucket.name,
                                 vbucket_type="replica")

    # Print cluster & bucket stats
    self.cluster_util.print_cluster_stats(self.cluster)
    self.bucket_util.print_bucket_stats(self.cluster)
def test_failover_expired_items_in_vB(self):
    self.maxttl = 120
    self.doc_ops = "expiry"
    self.expiry_perc = self.input.param("expiry_perc", 100)

    shell_conn = RemoteMachineShellConnection(
        self.cluster.nodes_in_cluster[-1])
    cbstats = Cbstats(shell_conn)
    self.target_vbucket = cbstats.vbucket_list(
        self.bucket_util.buckets[0].name)

    self.generate_docs(target_vbucket=self.target_vbucket)
    _ = self.loadgen_docs(self.retry_exceptions,
                          self.ignore_exceptions,
                          _sync=True)
    self.bucket_util._wait_for_stats_all_buckets()

    # exp_pager_stime
    self.bucket_util._expiry_pager(self.exp_pager_stime)
    self.sleep(self.exp_pager_stime,
               "Wait until exp_pager_stime for kv_purger to kickoff")
    self.sleep(self.exp_pager_stime * 10,
               "Wait for KV purger to scan expired docs and "
               "add tombstones.")

    self.task.async_failover(self.cluster.nodes_in_cluster,
                             self.cluster.nodes_in_cluster[-1],
                             graceful=True)
    self.nodes = self.rest.node_statuses()
    self.task.rebalance(self.cluster.nodes_in_cluster,
                        to_add=[],
                        to_remove=[self.cluster.nodes_in_cluster[-1]])

    # Metadata Purge Interval
    self.meta_purge_interval = 60
    self.bucket_util.cbepctl_set_metadata_purge_interval(
        value=self.meta_purge_interval, buckets=self.buckets)
    self.sleep(self.meta_purge_interval * 2,
               "Wait for Metadata Purge Interval to drop "
               "tomb-stones from storage")

    self.log.info("Starting compaction for each bucket")
    self.run_compaction()

    # All docs and tomb-stones should be dropped from the storage
    ts = self.get_tombstone_count_key(self.cluster.nodes_in_cluster)
    self.log.info("Tombstones after full compaction: {}".format(ts))
def rebalance_out_after_ops(self):
    self.gen_delete = self.get_doc_generator(self.items / 2, self.items)
    self.gen_create = self.get_doc_generator(
        self.num_items, self.num_items + self.items / 2)
    # Define which doc ops will be performed during rebalancing.
    # Multiple ops are allowed, but only one at a time.
    self.check_temporary_failure_exception = False
    self.loadgen_docs(task_verification=True)

    if self.test_abort_snapshot:
        self.log.info("Creating sync_write abort scenario for replica vbs")
        for server in self.cluster_util.get_kv_nodes(self.cluster):
            ssh_shell = RemoteMachineShellConnection(server)
            cbstats = Cbstats(ssh_shell)
            replica_vbs = cbstats.vbucket_list(
                self.cluster.buckets[0].name, "replica")
            load_gen = doc_generator(self.key, 0, 5000,
                                     target_vbucket=replica_vbs)
            success = self.bucket_util.load_durable_aborts(
                ssh_shell, [load_gen], self.cluster.buckets[0],
                self.durability_level, "update", "all_aborts")
            if not success:
                self.log_failure("Simulating aborts failed")
            ssh_shell.disconnect()
        self.validate_test_failure()

    servs_out = [self.cluster.servers[self.nodes_init - i - 1]
                 for i in range(self.nodes_out)]

    if not self.atomicity:
        self.bucket_util._wait_for_stats_all_buckets(
            self.cluster, self.cluster.buckets)
        self.bucket_util.validate_docs_per_collections_all_buckets(
            self.cluster, timeout=self.wait_timeout)

    prev_failover_stats = self.bucket_util.get_failovers_logs(
        self.cluster.servers[:self.nodes_init], self.cluster.buckets)
    prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
        self.cluster.servers[:self.nodes_init], self.cluster.buckets)
    # record_data_set = self.bucket_util.get_data_set_all(
    #     self.cluster.servers[:self.nodes_init], self.cluster.buckets)
    self.bucket_util.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                                     prev_failover_stats)

    self.add_remove_servers_and_rebalance([], servs_out)

    if not self.atomicity:
        self.bucket_util.validate_docs_per_collections_all_buckets(
            self.cluster)
        self.bucket_util.verify_cluster_stats(
            self.cluster, self.num_items,
            check_ep_items_remaining=True,
            timeout=self.wait_timeout)

    new_failover_stats = self.bucket_util.compare_failovers_logs(
        self.cluster, prev_failover_stats,
        self.cluster.servers[:self.nodes_init - self.nodes_out],
        self.cluster.buckets)
    new_vbucket_stats = self.bucket_util.compare_vbucket_seqnos(
        self.cluster, prev_vbucket_stats,
        self.cluster.servers[:self.nodes_init - self.nodes_out],
        self.cluster.buckets, perNode=False)
    self.sleep(60)
    # self.bucket_util.data_analysis_all(
    #     record_data_set,
    #     self.cluster.servers[:self.nodes_init - self.nodes_out],
    #     self.cluster.buckets)
    self.bucket_util.compare_vbucketseq_failoverlogs(new_vbucket_stats,
                                                     new_failover_stats)
    self.bucket_util.verify_unacked_bytes_all_buckets(self.cluster)

    nodes = self.cluster_util.get_nodes_in_cluster(self.cluster)
    self.bucket_util.vb_distribution_analysis(
        self.cluster,
        servers=nodes, buckets=self.cluster.buckets,
        std=1.0, total_vbuckets=self.cluster.vbuckets,
        num_replicas=self.num_replicas)
def test_fts_index_with_aborts(self):
    """
    1. Create index (2i/view) on default bucket
    2. Load multiple docs such that all sync_writes will be aborted
    3. Verify nothing went into indexing
    4. Load sync_write docs such that they are successful
    5. Validate the mutated docs are taken into indexing
    :return:
    """
    self.key = "test_query_doc"
    self.index_name = "fts_test_index"
    self.sync_write_abort_pattern = self.input.param(
        "sync_write_abort_pattern", "all_aborts")
    self.create_index_during = self.input.param("create_index_during",
                                                "before_doc_ops")
    self.restServer = self.cluster_util.get_nodes_from_services_map(
        cluster=self.cluster,
        service_type=CbServer.Services.FTS)
    self.rest = RestConnection(self.restServer)
    crud_batch_size = 1000
    def_bucket = self.cluster.buckets[0]
    kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)
    replica_vbs = dict()
    verification_dict = dict()
    index_item_count = dict()
    expected_num_indexed = dict()
    load_gen = dict()
    load_gen["ADD"] = dict()
    load_gen["SET"] = dict()
    partial_aborts = ["initial_aborts", "aborts_at_end"]

    durability_helper = DurabilityHelper(
        self.log, len(self.cluster.nodes_in_cluster),
        durability=self.durability_level,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to)

    if self.create_index_during == "before_doc_ops":
        self.create_fts_indexes(def_bucket.name, self.index_name)

    curr_items = self.bucket_util.get_bucket_current_item_count(
        self.cluster, def_bucket)
    if self.sync_write_abort_pattern in ["all_aborts", "initial_aborts"]:
        self.bucket_util.flush_bucket(self.cluster, def_bucket)
        self.num_items = 0
    else:
        self.num_items = curr_items

    self.log.info("Disabling auto_failover to avoid node failures")
    status = RestConnection(self.cluster.master) \
        .update_autofailover_settings(False, 120, False)
    self.assertTrue(status, msg="Failure during disabling auto-failover")

    # Validate vbucket stats
    verification_dict["ops_create"] = self.num_items
    verification_dict["ops_update"] = 0
    # verification_dict["ops_delete"] = 0
    verification_dict["rollback_item_count"] = 0
    verification_dict["sync_write_aborted_count"] = 0
    verification_dict["sync_write_committed_count"] = 0

    if self.create_index_during == "before_doc_ops":
        self.validate_indexed_doc_count(self.index_name,
                                        verification_dict["ops_create"])

    self.log.info("Loading docs such that all sync_writes will be aborted")
    for server in kv_nodes:
        ssh_shell = RemoteMachineShellConnection(server)
        cbstats = Cbstats(ssh_shell)
        replica_vbs[server] = cbstats.vbucket_list(def_bucket.name,
                                                   "replica")
        load_gen["ADD"][server] = list()
        load_gen["ADD"][server].append(
            doc_generator(self.key, 0, crud_batch_size,
                          target_vbucket=replica_vbs[server],
                          mutation_type="ADD"))
        if self.sync_write_abort_pattern in partial_aborts:
            load_gen["ADD"][server].append(
                doc_generator(self.key, 10000, crud_batch_size,
                              target_vbucket=replica_vbs[server],
                              mutation_type="ADD"))
            verification_dict["ops_create"] += crud_batch_size
            verification_dict["sync_write_committed_count"] += \
                crud_batch_size

        task_success = self.bucket_util.load_durable_aborts(
            ssh_shell, load_gen["ADD"][server], def_bucket,
            self.durability_level, "create",
            self.sync_write_abort_pattern)
        if not task_success:
            self.log_failure("Failure during load_abort task")

        verification_dict["sync_write_aborted_count"] += \
            crud_batch_size
        if self.create_index_during == "before_doc_ops":
            self.validate_indexed_doc_count(
                self.index_name, verification_dict["ops_create"])

        load_gen["SET"][server] = list()
        load_gen["SET"][server].append(
            doc_generator(self.key, 0, crud_batch_size,
                          target_vbucket=replica_vbs[server],
                          mutation_type="SET"))
        if self.sync_write_abort_pattern in partial_aborts:
            load_gen["SET"][server].append(
                doc_generator(self.key, 10000, crud_batch_size,
                              target_vbucket=replica_vbs[server],
                              mutation_type="SET"))
            verification_dict["ops_update"] += crud_batch_size
            verification_dict["sync_write_committed_count"] += \
                crud_batch_size

        verification_dict["sync_write_aborted_count"] += \
            crud_batch_size
        task_success = self.bucket_util.load_durable_aborts(
            ssh_shell, load_gen["SET"][server], def_bucket,
            self.durability_level, "update",
            self.sync_write_abort_pattern)
        if not task_success:
            self.log_failure("Failure during load_abort task")

        ssh_shell.disconnect()

    if self.create_index_during == "before_doc_ops":
        self.validate_indexed_doc_count(
            self.index_name, verification_dict["ops_create"])

    failed = durability_helper.verify_vbucket_details_stats(
        def_bucket, kv_nodes,
        vbuckets=self.cluster.vbuckets,
        expected_val=verification_dict)
    # if failed:
    #     self.sleep(6000)
    #     self.log_failure("Cbstat vbucket-details verification failed")
    self.validate_test_failure()

    if self.create_index_during == "after_doc_ops":
        self.create_fts_indexes(def_bucket.name, self.index_name)
        self.validate_indexed_doc_count(self.index_name,
                                        verification_dict["ops_create"])

    self.log.info("Verify aborts are not indexed")
    self.validate_indexed_doc_count(self.index_name,
                                    verification_dict["ops_create"])

    for server in kv_nodes:
        if self.sync_write_abort_pattern == "initial_aborts":
            load_gen["ADD"][server] = load_gen["ADD"][server][:1]
            load_gen["SET"][server] = load_gen["SET"][server][:1]
        elif self.sync_write_abort_pattern == "aborts_at_end":
            load_gen["ADD"][server] = load_gen["ADD"][server][-1:]
            load_gen["SET"][server] = load_gen["SET"][server][-1:]

    self.log.info("Load sync_write docs such that they are successful")
    for server in kv_nodes:
        for gen_load in load_gen["ADD"][server]:
            task = self.task.async_load_gen_docs(
                self.cluster, def_bucket, gen_load, "create", 0,
                batch_size=50, process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)
            if len(task.fail.keys()) != 0:
                self.log_failure("Some failures seen during doc_ops")
            verification_dict["ops_create"] += crud_batch_size
            self.validate_indexed_doc_count(
                self.index_name, verification_dict["ops_create"])
        for gen_load in load_gen["SET"][server]:
            task = self.task.async_load_gen_docs(
                self.cluster, def_bucket, gen_load, "update", 0,
                batch_size=50, process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)
            if len(task.fail.keys()) != 0:
                self.log_failure("Some failures seen during doc_ops")
            verification_dict["ops_update"] += crud_batch_size
            self.validate_indexed_doc_count(
                self.index_name, verification_dict["ops_create"])

    self.log.info("Validate the mutated docs are taken into indexing")
    self.validate_indexed_doc_count(self.index_name,
                                    verification_dict["ops_create"])
    self.validate_test_failure()
def test_durability_abort(self):
    """
    Test to validate durability abort is triggered properly
    with proper rollback on active vbucket
    :return:
    """
    load_task = dict()

    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()

    kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)
    for server in kv_nodes:
        ssh_shell = RemoteMachineShellConnection(server)
        cbstats = Cbstats(server)
        cb_err = CouchbaseError(self.log, ssh_shell)

        target_vb_type = "replica"
        if self.durability_level \
                == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
            target_vb_type = "active"
        target_vbs = cbstats.vbucket_list(self.bucket.name,
                                          target_vb_type)

        doc_load_spec = dict()
        doc_load_spec["doc_crud"] = dict()
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 2
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 2
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 2
        doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
            = "test_collections"
        doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbs
        doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] \
            = self.durability_level
        doc_load_spec[MetaCrudParams.RETRY_EXCEPTIONS] = [
            SDKException.DurabilityAmbiguousException
        ]
        doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 2
        doc_load_spec[MetaCrudParams.SKIP_READ_ON_ERROR] = True
        doc_load_spec[MetaCrudParams.SUPPRESS_ERROR_TABLE] = True

        cb_err.create(self.simulate_error, self.cluster.buckets[0].name)
        load_task[server] = \
            self.bucket_util.run_scenario_from_spec(
                self.task, self.cluster, self.cluster.buckets,
                doc_load_spec, batch_size=1, validate_task=False)
        cb_err.revert(self.simulate_error, self.cluster.buckets[0].name)
        ssh_shell.disconnect()
    self.validate_test_failure()

    failed = self.durability_helper.verify_vbucket_details_stats(
        self.bucket, kv_nodes,
        vbuckets=self.cluster.vbuckets,
        expected_val=self.verification_dict)
    if failed:
        self.log_failure("Cbstat vbucket-details verification failed "
                         "after aborts")
    self.validate_test_failure()

    # Retry aborted keys with healthy cluster
    self.log.info("Performing CRUDs on healthy cluster")
    for server in kv_nodes:
        self.bucket_util.validate_doc_loading_results(load_task[server])
        if load_task[server].result is False:
            self.log_failure("Doc retry task failed on %s" % server.ip)

        # Update cbstat vb-details verification counters
        for bucket, s_dict in load_task[server].loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, _ in c_dict["collections"].items():
                    c_crud_data = load_task[server].loader_spec[
                        bucket]["scopes"][s_name]["collections"][c_name]
                    for op_type in c_crud_data.keys():
                        total_mutation = \
                            c_crud_data[op_type]["doc_gen"].end \
                            - c_crud_data[op_type]["doc_gen"].start
                        if op_type in DocLoading.Bucket.DOC_OPS:
                            self.verification_dict["ops_%s" % op_type] \
                                += total_mutation
                            self.verification_dict[
                                "sync_write_committed_count"] \
                                += total_mutation

        failed = self.durability_helper.verify_vbucket_details_stats(
            self.bucket, self.cluster_util.get_kv_nodes(self.cluster),
            vbuckets=self.cluster.vbuckets,
            expected_val=self.verification_dict)
        if failed:
            self.log_failure("Cbstat vbucket-details verification "
                             "failed after ops on server: %s" % server.ip)
    self.validate_test_failure()
def collect_vbucket_stats(self, buckets, servers,
                          collect_vbucket=True,
                          collect_vbucket_seqno=True,
                          collect_vbucket_details=True,
                          perNode=True):
    """
    Method to extract the vbucket stats given by the cbstats tool

    Parameters:
      buckets: bucket information
      servers: server information
      collect_vbucket: take vbucket type stats
      collect_vbucket_seqno: take vbucket-seqno type stats
      collect_vbucket_details: take vbucket-details type stats
      perNode: if True collects data per node,
               else takes a union across nodes

    Returns:
      The output can be in two formats.

      If we are doing per node data collection:
        Vbucket Information::
          {bucket: {node: [vbucket_seqno {key:value}
                           U vbucket_details {key:value}
                           U vbucket {key:value}]}}

      If we are not doing per node data collection:
        Vbucket Information::
          {bucket: [vbucket_seqno {key:value}
                    U vbucket_details {key:value}
                    U vbucket {key:value}]}
    """
    bucketMap = dict()
    for bucket in buckets:
        if bucket.bucketType == Bucket.Type.MEMCACHED:
            continue
        dataMap = dict()
        for server in servers:
            map_data = dict()
            cbstat = Cbstats(server)

            if collect_vbucket:
                result = dict()
                for vb_type in ["active", "replica"]:
                    vb_list = cbstat.vbucket_list(bucket.name, vb_type)
                    for vb_num in vb_list:
                        result['vb_%s' % vb_num] = dict()
                        result['vb_%s' % vb_num]["state"] = vb_type
                map_data.update(result)
            if collect_vbucket_seqno:
                result = cbstat.vbucket_seqno(bucket.name)
                for key in result.keys():
                    result['vb_' + key] = result.pop(key)
                map_data.update(result)
            if collect_vbucket_details:
                result = cbstat.vbucket_details(bucket.name)
                for key in result.keys():
                    result['vb_' + key] = result.pop(key)
                map_data.update(result)

            if perNode:
                dataMap[server.ip] = map_data
            else:
                dataMap.update(map_data)
        bucketMap[bucket.name] = dataMap
    return bucketMap
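# Illustrative shape of the map returned above with perNode=True
# (hypothetical node IP and stat keys; the exact key names come from
# the Cbstats output and may differ):
#   {"default": {"10.112.0.1": {"vb_0": {"state": "active"},
#                               "vb_0:high_seqno": "12",
#                               ...}}}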
def test_create_remove_scope_with_node_crash(self):
    """
    1. Select an error scenario to simulate at random
    2. Create error scenario either before or after scope create/delete
    3. Initiate scope creation/deletion under the bucket
    4. Validate the outcome of scope creation/deletion
    """
    def create_scope(client_type, bucket_obj, scope):
        if client_type == "sdk":
            client.create_scope(scope)
        elif client_type == "rest":
            self.bucket_util.create_scope(self.cluster.master,
                                          bucket_obj,
                                          {"name": scope})
        else:
            self.log_failure("Invalid client_type provided")

    def remove_scope(client_type, bucket_obj, scope):
        if client_type == "sdk":
            client.drop_scope(scope)
        elif client_type == "rest":
            self.bucket_util.drop_scope(self.cluster.master,
                                        bucket_obj,
                                        scope)
        else:
            self.log_failure("Invalid client_type provided")

    kv_nodes = self.cluster_util.get_kv_nodes()
    if len(kv_nodes) == 1:
        self.fail("Need at least two KV nodes to run this test")

    client = None
    action = self.input.param("action", "create")
    crash_during = self.input.param("crash_during", "pre_action")
    data_load_option = self.input.param("data_load_option", None)
    crash_type = self.input.param("simulate_error",
                                  CouchbaseError.KILL_MEMCACHED)

    # Always use a random scope name to create/remove
    # since CREATE/DROP not supported for default scope
    self.scope_name = BucketUtils.get_random_name()

    # Select a KV node other than master node from the cluster
    node_to_crash = kv_nodes[sample(range(1, len(kv_nodes)), 1)[0]]

    # Create the required client object
    if self.client_type == "sdk":
        client = SDKClient([self.cluster.master], self.bucket)

    if action == "remove":
        # Create a scope to be removed
        use_client = sample(["sdk", "rest"], 1)[0]
        create_scope(use_client, self.bucket, self.scope_name)

    # Create an error scenario
    shell = RemoteMachineShellConnection(node_to_crash)
    cb_error = CouchbaseError(self.log, shell)
    cbstat_obj = Cbstats(shell)
    active_vbs = cbstat_obj.vbucket_list(self.bucket.name,
                                         vbucket_type="active")
    target_vbuckets = list(
        set(range(0, 1024)).difference(set(active_vbs)))
    doc_gen = doc_generator(self.key, 0, 1000,
                            target_vbucket=target_vbuckets)

    if crash_during == "pre_action":
        cb_error.create(crash_type)

    if action == "create":
        create_scope(self.client_type, self.bucket, self.scope_name)
    elif action == "remove":
        remove_scope(self.client_type, self.bucket, self.scope_name)

    if crash_during == "post_action":
        cb_error.create(crash_type)

    if data_load_option == "mutate_default_collection":
        task = self.task.async_load_gen_docs(
            self.cluster, self.bucket, doc_gen,
            "update", exp=self.maxttl,
            batch_size=200, process_concurrency=8,
            compression=self.sdk_compression,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(task)

    self.sleep(60, "Wait before reverting the error scenario")
    cb_error.revert(crash_type)

    # Close SSH and SDK connections
    shell.disconnect()
    if self.client_type == "sdk":
        client.close()

    self.bucket_util.validate_docs_per_collections_all_buckets()
    self.validate_test_failure()
def test_rollback_after_disk_full(self):
    self.doc_ops = "create"
    self.create_start = self.init_items_per_collection
    self.create_end = self.init_items_per_collection * 2
    start = self.num_items
    items = self.num_items
    mem_only_items = self.input.param("rollback_items", 100000)
    self.gen_read = copy.deepcopy(self.gen_create)

    # Fill disk on NodeB, leaving only 100MB free
    self.fill_disk(self.cluster.nodes_in_cluster[-1], free=100)

    # Stopping persistence on NodeA
    shell = RemoteMachineShellConnection(self.cluster.master)
    cbstats = Cbstats(self.cluster.master)
    self.target_vbucket = cbstats.vbucket_list(
        self.cluster.buckets[0].name)
    mem_client = MemcachedClientHelper.direct_client(
        self.cluster.master, self.cluster.buckets[0])
    mem_client.stop_persistence()

    self.gen_create = self.genrate_docs_basic(start, mem_only_items,
                                              self.target_vbucket)
    self.loadgen_docs(_sync=True,
                      retry_exceptions=self.retry_exceptions)
    start = self.gen_create.key_counter

    ep_queue_size_map = {self.cluster.nodes_in_cluster[0]:
                         mem_only_items}
    # ep_data_write_failed = {self.cluster.nodes_in_cluster[-1]: 0}
    for bucket in self.cluster.buckets:
        self.bucket_util._wait_for_stat(bucket, ep_queue_size_map)
        # self.bucket_util._wait_for_stat(
        #     bucket,
        #     ep_data_write_failed,
        #     cbstat_cmd="all",
        #     stat_name="ep_data_write_failed",
        #     stat_cond=">",
        #     timeout=300)

    # Kill memcached on NodeA to trigger rollback on other nodes'
    # replica vBuckets
    self.sleep(120)
    shell.kill_memcached()
    self.sleep(10, "sleep after memcached kill on node {}"
                   .format(shell.ip))
    self.free_disk(self.cluster.nodes_in_cluster[-1])
    self.assertTrue(
        self.bucket_util._wait_warmup_completed(
            self.cluster.nodes_in_cluster,
            self.cluster.buckets[0],
            wait_time=self.wait_timeout * 10))
    self.sleep(10, "Not required, but waiting for 10s after warm up")

    self.bucket_util.verify_stats_all_buckets(self.cluster, items,
                                              timeout=300)
    data_validation = self.task.async_validate_docs(
        self.cluster, self.cluster.buckets[0],
        self.gen_read, "create", 0,
        batch_size=self.batch_size,
        process_concurrency=self.process_concurrency,
        sdk_client_pool=self.sdk_client_pool,
        timeout_secs=self.sdk_timeout)
    self.task.jython_task_manager.get_task_result(data_validation)
    shell.disconnect()
def common_test_body(self, failover_reason, rebalance_type=None):
    """
    Main test body which contains the flow of the failover basic steps
    1. Start operations if programmed into the test case (before/after)
    2. Start view and index building operations
    3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
    4.1 Rebalance the cluster after failover of K nodes
    4.2 Run add-back operation with recoveryType = (full/delta)
        with rebalance
    5. Verify all expected operations completed by checking stats,
       replication, views, data correctness
    """
    # Pick the reference node for communication
    # We pick a node in the cluster which will NOT be failed over
    self.filter_list = []
    if self.failoverMaster:
        self.master = self.cluster.servers[1]
    else:
        self.master = self.cluster.master
    self.log.info(
        " Picking node {0} as reference node for test case".format(
            self.master.ip))
    self.print_test_params(failover_reason)
    self.rest = RestConnection(self.master)
    self.nodes = self.rest.node_statuses()

    # Set the data path for the cluster
    self.data_path = self.rest.get_data_path()

    # Variable to decide the durability outcome
    durability_will_fail = False
    # Variable to track the number of nodes failed
    num_nodes_failed = 1

    # Find nodes that will undergo failover
    if self.failoverMaster:
        self.chosen = self.cluster_util.pick_nodes(
            self.master, howmany=1, target_node=self.servers[0])
    else:
        self.chosen = self.cluster_util.pick_nodes(
            self.master, howmany=self.num_failed_nodes)

    # Perform operations - Create/Update/Delete
    # self.withMutationOps = True => Run operations in parallel to failover
    # self.withMutationOps = False => Run operations before failover
    self.load_initial_data()
    if not self.withMutationOps:
        self.run_mutation_operations()

    if self.test_abort_snapshot:
        self.log.info("Creating abort scenarios for vbs")
        for server in self.cluster_util.get_kv_nodes():
            ssh_shell = RemoteMachineShellConnection(server)
            cbstats = Cbstats(ssh_shell)
            replica_vbs = cbstats.vbucket_list(
                self.bucket_util.buckets[0].name, "replica")
            load_gen = doc_generator(self.key, 0, 5000,
                                     target_vbucket=replica_vbs)
            success = self.bucket_util.load_durable_aborts(
                ssh_shell, [load_gen],
                self.bucket_util.buckets[0],
                self.durability_level,
                "update", "all_aborts")
            if not success:
                self.log_failure("Simulating aborts failed")
            ssh_shell.disconnect()
        self.validate_test_failure()

    # Perform view creation tasks and check for completion
    # if required before failover
    if self.withViewsOps:
        self.run_view_creation_operations(self.servers)
        if not self.createIndexesDuringFailover:
            self.query_and_monitor_view_tasks(self.servers)

    # Take snap-shot of data set used for validation
    record_static_data_set = {}
    if not self.withMutationOps:
        record_static_data_set = self.bucket_util.get_data_set_all(
            self.cluster.servers, self.bucket_util.buckets,
            path=None)

    prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
        self.servers[:self.nodes_init], self.bucket_util.buckets)
    prev_failover_stats = self.bucket_util.get_failovers_logs(
        self.servers[:self.nodes_init], self.bucket_util.buckets)

    # Perform operations related to failover
    if self.withMutationOps or self.withViewsOps or self.compact:
        self.run_failover_operations_with_ops(self.chosen,
                                              failover_reason)
    else:
        self.run_failover_operations(self.chosen, failover_reason)

    # Decide whether the durability is going to fail or not
    if self.num_failed_nodes >= 1 and self.num_replicas > 1:
        durability_will_fail = True

    # Construct target vbucket list from the nodes
    # which are going to be failed over
    vbucket_list = list()
    for target_node in self.chosen:
        for server in self.servers:
            if server.ip == target_node.ip:
                # Comment out the break once vbucket_list method is fixed
                break
                shell_conn = RemoteMachineShellConnection(server)
                cb_stats = Cbstats(shell_conn)
                vbuckets = cb_stats.vbucket_list(
                    self.bucket_util.buckets[0].name,
                    self.target_vbucket_type)
                shell_conn.disconnect()
                vbucket_list += vbuckets

    # Code to generate doc_loaders that will work on vbucket_type
    # based on targeted nodes. This will perform CRUD only on
    # vbuckets which will be affected by the failover
    self.gen_create = doc_generator(self.key, self.num_items,
                                    self.num_items * 1.5,
                                    target_vbucket=vbucket_list)
    self.gen_update = doc_generator(self.key, self.num_items / 2,
                                    self.num_items,
                                    target_vbucket=vbucket_list)
    self.gen_delete = doc_generator(self.key, self.num_items / 4,
                                    self.num_items / 2 - 1,
                                    target_vbucket=vbucket_list)
    self.afterfailover_gen_create = doc_generator(
        self.key, self.num_items * 1.6, self.num_items * 2,
        target_vbucket=vbucket_list)
    self.afterfailover_gen_update = doc_generator(
        self.key, 1, self.num_items / 4,
        target_vbucket=vbucket_list)
    self.afterfailover_gen_delete = doc_generator(
        self.key, self.num_items * 0.5, self.num_items * 0.75,
        target_vbucket=vbucket_list)

    # Perform add-back operation with rebalance,
    # or only rebalance with verifications
    if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
        if self.failover_onebyone:
            # Reset it back to False
            durability_will_fail = False
            for node_chosen in self.chosen:
                if num_nodes_failed > 1:
                    durability_will_fail = True

                if self.add_back_flag:
                    # In add-back case, durability should never fail,
                    # since the num_nodes in the cluster will remain
                    # the same
                    self.run_add_back_operation_and_verify(
                        [node_chosen], prev_vbucket_stats,
                        record_static_data_set, prev_failover_stats,
                        rebalance_type=rebalance_type)
                else:
                    self.run_rebalance_after_failover_and_verify(
                        [node_chosen], prev_vbucket_stats,
                        record_static_data_set, prev_failover_stats,
                        durability_will_fail=durability_will_fail)
                num_nodes_failed += 1
        else:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(
                    self.chosen, prev_vbucket_stats,
                    record_static_data_set, prev_failover_stats,
                    durability_will_fail=durability_will_fail,
                    rebalance_type=rebalance_type)
            else:
                self.run_rebalance_after_failover_and_verify(
                    self.chosen, prev_vbucket_stats,
                    record_static_data_set, prev_failover_stats,
                    durability_will_fail=durability_will_fail)
    else:
        return

    # Verify unacked_bytes only if the durability is not going to fail
    if self.during_ops is None and not durability_will_fail:
        self.bucket_util.verify_unacked_bytes_all_buckets(
            filter_list=self.filter_list)
def test_magma_rollback_n_times(self):
    items = self.num_items
    mem_only_items = self.input.param("rollback_items", 100000)
    if self.nodes_init < 2 or self.num_replicas < 1:
        self.fail("Not enough nodes/replicas in the cluster/bucket "
                  "to test rollback")
    self.num_rollbacks = self.input.param("num_rollbacks", 10)

    shell = RemoteMachineShellConnection(self.cluster_util.cluster.master)
    cbstats = Cbstats(shell)
    self.target_vbucket = cbstats.vbucket_list(
        self.bucket_util.buckets[0].name)
    start = self.num_items
    self.gen_read = copy.deepcopy(self.gen_create)

    for _ in xrange(1, self.num_rollbacks + 1):
        # Stopping persistence on NodeA
        mem_client = MemcachedClientHelper.direct_client(
            self.input.servers[0], self.bucket_util.buckets[0])
        mem_client.stop_persistence()

        self.gen_create = doc_generator(
            self.key, start, mem_only_items,
            doc_size=self.doc_size,
            doc_type=self.doc_type,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets,
            randomize_doc_size=self.randomize_doc_size,
            randomize_value=self.randomize_value)
        self.loadgen_docs(_sync=True)
        start = self.gen_create.key_counter

        ep_queue_size_map = {
            self.cluster.nodes_in_cluster[0]: mem_only_items
        }
        vb_replica_queue_size_map = {
            self.cluster.nodes_in_cluster[0]: 0
        }
        for node in self.cluster.nodes_in_cluster[1:]:
            ep_queue_size_map.update({node: 0})
            vb_replica_queue_size_map.update({node: 0})

        for bucket in self.bucket_util.buckets:
            self.bucket_util._wait_for_stat(bucket, ep_queue_size_map)
            self.bucket_util._wait_for_stat(
                bucket, vb_replica_queue_size_map,
                stat_name="vb_replica_queue_size")

        # Kill memcached on NodeA to trigger rollback on other nodes'
        # replica vBuckets
        for bucket in self.bucket_util.buckets:
            self.log.debug(cbstats.failover_stats(bucket.name))
        shell.kill_memcached()

        self.assertTrue(
            self.bucket_util._wait_warmup_completed(
                [self.cluster_util.cluster.master],
                self.bucket_util.buckets[0],
                wait_time=self.wait_timeout * 10))
        self.sleep(10, "Not required, but waiting for 10s after warm up")
        self.bucket_util.verify_stats_all_buckets(items, timeout=300)
        for bucket in self.bucket_util.buckets:
            self.log.debug(cbstats.failover_stats(bucket.name))

    data_validation = self.task.async_validate_docs(
        self.cluster, self.bucket_util.buckets[0],
        self.gen_read, "create", 0,
        batch_size=self.batch_size,
        process_concurrency=self.process_concurrency,
        pause_secs=5,
        timeout_secs=self.sdk_timeout)
    self.task.jython_task_manager.get_task_result(data_validation)
    shell.disconnect()
def test_MB_40531(self):
    """
    Test to validate,
    1. Active resident ratio on the nodes never goes down below
       the replica_rr value
    2. 'evictable' (vb_replica_itm_mem - vb_replica_meta_data_mem)
       value never goes below wm_threshold of total bucket
       memory (ep_max_size)
    :return:
    """
    def check_replica_eviction():
        tbl = TableView(self.log.info)
        tbl.set_headers(["Node", "Memory", "WM_Threshold",
                         "Itm_mem", "Meta_mem", "Evictable_mem",
                         "A_rr", "R_rr"])
        while self.test_failure is None and run_eviction_check:
            tbl.rows = []
            for kv_node in node_data.keys():
                all_stats = \
                    node_data[kv_node]["cbstat"].all_stats(bucket.name)
                bucket_mem = int(all_stats["ep_max_size"])
                wm_threshold = \
                    (float(all_stats["ep_mem_high_wat_percent"])
                     - float(all_stats["ep_mem_low_wat_percent"])) * 100
                evictable_mem = \
                    int(all_stats["vb_replica_itm_memory"]) \
                    - int(all_stats["vb_replica_meta_data_memory"])
                active_rr = int(all_stats["vb_active_perc_mem_resident"])
                replica_rr = int(all_stats["vb_replica_perc_mem_resident"])

                tbl.add_row([kv_node.ip, str(bucket_mem),
                             str(wm_threshold),
                             all_stats["vb_replica_itm_memory"],
                             all_stats["vb_replica_meta_data_memory"],
                             str(evictable_mem),
                             str(active_rr), str(replica_rr)])

                if active_rr != 100 \
                        and evictable_mem > (bucket_mem / wm_threshold):
                    tbl.display("Node memory stats")
                    self.log_failure("%s - Active keys evicted before "
                                     "meeting the threshold: %s"
                                     % (kv_node.ip, all_stats))
                if replica_rr > active_rr:
                    tbl.display("Node memory stats")
                    self.log_failure(
                        "%s: (active_rr) %s < %s (replica_rr)"
                        % (kv_node.ip, active_rr, replica_rr))

    bucket = self.bucket_util.buckets[0]
    node_data = dict()
    kv_nodes = self.cluster_util.get_kv_nodes()
    for node in kv_nodes:
        cbstat = Cbstats(RemoteMachineShellConnection(node))
        node_data[node] = dict()
        node_data[node]["cbstat"] = cbstat
        node_data[node]["active"] = cbstat.vbucket_list(
            bucket.name, "active")
        node_data[node]["replica"] = cbstat.vbucket_list(
            bucket.name, "replica")

    target_dgm = 30
    run_eviction_check = True
    bucket_helper = BucketHelper(self.cluster.master)

    eviction_check_thread = Thread(target=check_replica_eviction)
    eviction_check_thread.start()

    op_index = 0
    op_batch_size = 8000
    create_batch_size = 10000

    # Perform ADD/SET/READ until the targeted DGM value is reached
    curr_dgm = bucket_helper.fetch_bucket_stats(
        bucket.name)["op"]["samples"]["vb_active_resident_items_ratio"][-1]
    self.log.info("Wait for DGM to reach %s%%. Current DGM: %s%%"
                  % (target_dgm, curr_dgm))
    while int(curr_dgm) > target_dgm and self.test_failure is None:
        create_gen = doc_generator(self.key, self.num_items,
                                   self.num_items + create_batch_size,
                                   key_size=self.key_size,
                                   doc_size=self.doc_size,
                                   mutation_type="ADD")
        update_gen = doc_generator(self.key, op_index,
                                   op_index + op_batch_size,
                                   key_size=self.key_size,
                                   doc_size=self.doc_size,
                                   mutation_type="ADD")
        read_gen = doc_generator(self.key, op_index,
                                 op_index + op_batch_size,
                                 key_size=self.key_size,
                                 doc_size=0)

        create_task = self.task.async_load_gen_docs(
            self.cluster, bucket, create_gen, "create", 0,
            persist_to=self.persist_to,
            replicate_to=self.replicate_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            print_ops_rate=False,
            batch_size=200,
            process_concurrency=1)
        update_task = self.task.async_load_gen_docs(
            self.cluster, bucket, update_gen, "update", 0,
            persist_to=self.persist_to,
            replicate_to=self.replicate_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            print_ops_rate=False,
            batch_size=200,
            process_concurrency=1)
        read_task = self.task.async_load_gen_docs(
            self.cluster, bucket, read_gen, "read",
            timeout_secs=self.sdk_timeout,
            print_ops_rate=False,
            batch_size=200,
            process_concurrency=1)

        self.task_manager.get_task_result(create_task)
        self.task_manager.get_task_result(update_task)
        self.task_manager.get_task_result(read_task)

        # Update indexes for next iteration
        op_index += op_batch_size
        self.num_items += create_batch_size

        curr_dgm = bucket_helper.fetch_bucket_stats(
            bucket.name
        )["op"]["samples"]["vb_active_resident_items_ratio"][-1]
        self.log.info("Current DGM: %s%%" % curr_dgm)

    # Stop eviction check thread
    run_eviction_check = False
    eviction_check_thread.join()

    # Close shell connections
    for node in kv_nodes:
        node_data[node]["cbstat"].shellConn.disconnect()

    self.validate_test_failure()
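# Worked example of the eviction check above (hypothetical numbers):
# with ep_max_size = 1 GiB and high/low watermark fractions of
# 0.85/0.75, wm_threshold = (0.85 - 0.75) * 100 = 10, so the test
# flags a failure only if active items get evicted while the replica
# evictable memory (vb_replica_itm_memory - vb_replica_meta_data_memory)
# is still above bucket_mem / wm_threshold = 1 GiB / 10 ~= 102 MiB.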
def test_ttl_less_than_durability_timeout(self):
    """
    MB-43238
    1. Regular write with a short TTL for some key
    2. Disable expiry pager (to prevent raciness)
    3. Wait TTL period
    4. Disable persistence on the node with the replica vBucket
       for that key
    5. SyncWrite PersistMajority to active vBucket for that key
       (should hang)
    6. Access key on other thread to trigger expiry
    7. Observe DCP connection being torn down without fix
    """
    def perform_sync_write():
        client.crud(DocLoading.Bucket.DocOps.CREATE, key, {},
                    durability=Bucket.DurabilityLevel.PERSIST_TO_MAJORITY,
                    timeout=60)

    doc_ttl = 5
    target_node = None
    key = "test_ttl_doc"
    vb_for_key = self.bucket_util.get_vbucket_num_for_key(key)
    bucket = self.cluster.buckets[0]

    # Find the node hosting the replica VB for the key
    for target_node in self.cluster.nodes_in_cluster:
        cb_stats = Cbstats(target_node)
        if vb_for_key in cb_stats.vbucket_list(bucket.name, "replica"):
            break

    self.log.info("Target node: %s, Key: %s" % (target_node.ip, key))
    self.log.info("Disabling expiry_pager")
    shell = RemoteMachineShellConnection(target_node)
    cb_ep_ctl = Cbepctl(shell)
    cb_ep_ctl.set(bucket.name, "flush_param", "exp_pager_stime", 0)

    # Create SDK client
    client = SDKClient([self.cluster.master], bucket)

    self.log.info("Non-sync write with TTL=%s" % doc_ttl)
    client.crud(DocLoading.Bucket.DocOps.CREATE, key, {}, exp=doc_ttl)
    self.sleep(doc_ttl, "Wait for document to expire")

    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)

    self.log.info("Stopping persistence on replica VB node using cbepctl")
    cb_ep_ctl.persistence(bucket.name, "stop")

    # Start doc_load with lesser ttl
    doc_create_thread = Thread(target=perform_sync_write)
    doc_create_thread.start()
    self.sleep(2, "Wait for sync_write thread to start")

    self.log.info("Read key from another thread to trigger expiry")
    failure = None
    result = client.crud(DocLoading.Bucket.DocOps.READ, key)
    if SDKException.DocumentNotFoundException not in str(result["error"]):
        failure = "Invalid exception: %s" % result["error"]

    self.log.info("Resuming persistence on target node")
    cb_ep_ctl.persistence(bucket.name, "start")

    # Wait for doc_create_thread to complete
    doc_create_thread.join()

    # Close SDK client and shell connections
    client.close()
    shell.disconnect()

    if failure:
        self.fail(failure)

    for node in self.cluster.nodes_in_cluster:
        cb_stats = Cbstats(node).all_stats(bucket.name)
        self.log.info("Node: %s, ep_expired_access: %s"
                      % (node.ip, cb_stats["ep_expired_access"]))
        self.assertEqual(int(cb_stats["ep_expired_access"]), 0,
                         "%s: ep_expired_access != 0" % node.ip)
def test_maxttl_with_timeout(self):
    """
    1. Stop Memcached on target_nodes based on replicas configured.
    2. Initiate doc_ops with higher sdk_timeout
    3. Sleep for time within the configured sdk_timeout
    4. Resume Memcached on target_nodes to make sure doc_ops go through
    5. Make sure maxTTL is calculated as soon as the active vbucket
       receives the mutation
    :return:
    """
    shell_conn = dict()
    target_vbuckets = list()
    target_nodes = self.getTargetNodes()
    def_bucket = self.cluster.buckets[0]
    self.maxttl = self.input.param("doc_ttl", self.maxttl)

    # Open required SDK connections before error_simulation
    gen_create = doc_generator(self.key, 0, self.num_items,
                               doc_size=self.doc_size,
                               doc_type=self.doc_type,
                               target_vbucket=target_vbuckets,
                               vbuckets=self.cluster.vbuckets)
    doc_op_task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, gen_create, "create", self.maxttl,
        batch_size=10, process_concurrency=8,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout,
        compression=self.sdk_compression,
        start_task=False,
        sdk_client_pool=self.sdk_client_pool)

    # Open shell_conn and create Memcached error for testing MaxTTL
    self.log.info("1. Stopping Memcached on target_nodes")
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstats = Cbstats(shell_conn[node.ip])
        target_vbuckets += cbstats.vbucket_list(def_bucket.name,
                                                "replica")
        cb_error = CouchbaseError(self.log, shell_conn[node.ip])
        cb_error.create(CouchbaseError.STOP_MEMCACHED, def_bucket.name)

    self.log.info("2. Initiating the doc_ops with doc TTL")
    self.task_manager.add_new_task(doc_op_task)

    self.sleep(self.maxttl, "3. Sleep for max_ttl time")

    # Revert Memcached error and close the shell_conn
    self.log.info("4. Resuming Memcached on target_nodes")
    for node in target_nodes:
        cb_error = CouchbaseError(self.log, shell_conn[node.ip])
        cb_error.revert(CouchbaseError.STOP_MEMCACHED, def_bucket.name)
        shell_conn[node.ip].disconnect()

    self.log.info("5. Waiting for doc_ops to complete")
    self.task.jython_task_manager.get_task_result(doc_op_task)

    self.bucket_util._expiry_pager(self.cluster, val=1)
    self.sleep(10, "6. Waiting for items to be purged")

    # Read all expired docs to validate all keys present
    doc_op_task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, gen_create, "read",
        batch_size=10, process_concurrency=8,
        timeout_secs=self.sdk_timeout,
        sdk_client_pool=self.sdk_client_pool)
    self.task.jython_task_manager.get_task_result(doc_op_task)

    self.log.info("7. Validating docs expired after TTL, "
                  "even before sync_write succeeds")
    if len(doc_op_task.success.keys()) == self.num_items:
        self.fail("No docs deleted after MaxTTL time: %s"
                  % doc_op_task.success.keys())

    self.sleep(10, "8. Waiting for all docs to be purged")

    # Read all expired docs to validate all keys were purged
    doc_op_task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, gen_create, "read",
        batch_size=10, process_concurrency=8,
        timeout_secs=self.sdk_timeout,
        sdk_client_pool=self.sdk_client_pool)
    self.task.jython_task_manager.get_task_result(doc_op_task)

    self.log.info("9. Validating docs expired after TTL")
    if len(doc_op_task.fail.keys()) != self.num_items:
        self.fail("Items not deleted after MaxTTL time: %s"
                  % doc_op_task.success.keys())

    # Validate cas for purged items
    keys_with_cas = list()
    for key, result in doc_op_task.fail.items():
        if result['cas'] != 0:
            keys_with_cas.append(key)
    if len(keys_with_cas) != 0:
        self.fail("Following failed keys have CAS: %s" % keys_with_cas)

    # Recreate all docs without any node issues
    doc_op_task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, gen_create, "create", 0,
        batch_size=10, process_concurrency=8,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout,
        compression=self.sdk_compression,
        sdk_client_pool=self.sdk_client_pool)
    self.task.jython_task_manager.get_task_result(doc_op_task)

    self.log.info("10. Validating docs exist after creation")
    if len(doc_op_task.fail.keys()) != 0:
        self.fail("Doc recreate failed for keys: %s"
                  % doc_op_task.fail.keys())

    # Final doc_count validation
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster,
                                              self.num_items)
def getVbucketNumbers(shell_conn, bucket_name, replica_type):
    cb_stats = Cbstats(shell_conn)
    return cb_stats.vbucket_list(bucket_name, replica_type)
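# Example call for the helper above (hypothetical node/bucket, not part
# of the suite): fetch the active vbucket numbers owned by one node.
#   shell = RemoteMachineShellConnection(node)
#   active_vbs = getVbucketNumbers(shell, "default", "active")
#   shell.disconnect()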
def test_index_with_aborts(self):
    """
    1. Create index (2i/view) on default bucket
    2. Load multiple docs such that all sync_writes will be aborted
    3. Verify nothing went into indexing
    4. Load sync_write docs such that they are successful
    5. Validate the mutated docs are taken into indexing
    :return:
    """
    crud_batch_size = 50
    def_bucket = self.cluster.buckets[0]
    kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)
    replica_vbs = dict()
    verification_dict = dict()
    index_item_count = dict()
    expected_num_indexed = dict()
    load_gen = dict()
    load_gen["ADD"] = dict()
    load_gen["SET"] = dict()
    partial_aborts = ["initial_aborts", "aborts_at_end"]

    durability_helper = DurabilityHelper(
        self.log, len(self.cluster.nodes_in_cluster),
        durability=self.durability_level,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to)

    if self.create_index_during == "before_doc_ops":
        self.create_gsi_indexes(def_bucket)

    curr_items = self.bucket_util.get_bucket_current_item_count(
        self.cluster, def_bucket)
    if self.sync_write_abort_pattern in ["all_aborts", "initial_aborts"]:
        self.bucket_util.flush_bucket(self.cluster, def_bucket)
        self.num_items = 0
    else:
        self.num_items = curr_items

    self.log.info("Disabling auto_failover to avoid node failures")
    status = RestConnection(self.cluster.master) \
        .update_autofailover_settings(False, 120)
    self.assertTrue(status, msg="Failure during disabling auto-failover")

    # Validate vbucket stats
    verification_dict["ops_create"] = self.num_items
    verification_dict["ops_update"] = 0
    # verification_dict["ops_delete"] = 0
    verification_dict["rollback_item_count"] = 0
    verification_dict["sync_write_aborted_count"] = 0
    verification_dict["sync_write_committed_count"] = 0

    index_item_count["#primary"] = self.num_items
    index_item_count["durable_add_aborts"] = 0
    index_item_count["durable_set_aborts"] = 0
    expected_num_indexed["#primary"] = curr_items
    expected_num_indexed["durable_add_aborts"] = 0
    expected_num_indexed["durable_set_aborts"] = 0

    if self.create_index_during == "before_doc_ops":
        self.validate_indexed_doc_count(def_bucket, index_item_count)

    self.log.info("Loading docs such that all sync_writes will be aborted")
    for server in kv_nodes:
        ssh_shell = RemoteMachineShellConnection(server)
        cbstats = Cbstats(server)
        replica_vbs[server] = cbstats.vbucket_list(def_bucket.name,
                                                   "replica")
        load_gen["ADD"][server] = list()
        load_gen["ADD"][server].append(
            doc_generator(self.key, 0, crud_batch_size,
                          target_vbucket=replica_vbs[server],
                          mutation_type="ADD"))
        if self.sync_write_abort_pattern in partial_aborts:
            load_gen["ADD"][server].append(
                doc_generator(self.key, 10000, crud_batch_size,
                              target_vbucket=replica_vbs[server],
                              mutation_type="ADD"))
            verification_dict["ops_create"] += crud_batch_size
            verification_dict["sync_write_committed_count"] += \
                crud_batch_size
            index_item_count["#primary"] += crud_batch_size
            index_item_count["durable_add_aborts"] += crud_batch_size
            expected_num_indexed["#primary"] += crud_batch_size
            expected_num_indexed["durable_add_aborts"] += crud_batch_size

        task_success = self.bucket_util.load_durable_aborts(
            ssh_shell, load_gen["ADD"][server], self.cluster, def_bucket,
            self.durability_level, DocLoading.Bucket.DocOps.CREATE,
            self.sync_write_abort_pattern)
        if not task_success:
            self.log_failure("Failure during load_abort task")

        verification_dict["sync_write_aborted_count"] += \
            crud_batch_size
        if self.create_index_during == "before_doc_ops":
            self.validate_indexed_doc_count(def_bucket, index_item_count)

        load_gen["SET"][server] = list()
        load_gen["SET"][server].append(
            doc_generator(self.key, 0, crud_batch_size,
                          target_vbucket=replica_vbs[server],
                          mutation_type="SET"))
        if self.sync_write_abort_pattern in partial_aborts:
            load_gen["SET"][server].append(
                doc_generator(self.key, 10000, crud_batch_size,
                              target_vbucket=replica_vbs[server],
                              mutation_type="SET"))
            verification_dict["ops_update"] += crud_batch_size
            verification_dict["sync_write_committed_count"] += \
                crud_batch_size
            index_item_count["durable_add_aborts"] -= crud_batch_size
            index_item_count["durable_set_aborts"] += crud_batch_size
            expected_num_indexed["#primary"] += crud_batch_size
            expected_num_indexed["durable_add_aborts"] += crud_batch_size
            expected_num_indexed["durable_set_aborts"] += crud_batch_size

        verification_dict["sync_write_aborted_count"] += \
            crud_batch_size
        task_success = self.bucket_util.load_durable_aborts(
            ssh_shell, load_gen["SET"][server], self.cluster, def_bucket,
            self.durability_level, DocLoading.Bucket.DocOps.UPDATE,
            self.sync_write_abort_pattern)
        if not task_success:
            self.log_failure("Failure during load_abort task")

        ssh_shell.disconnect()

    if self.create_index_during == "before_doc_ops":
        self.validate_indexed_doc_count(def_bucket, index_item_count)

    failed = durability_helper.verify_vbucket_details_stats(
        def_bucket, kv_nodes,
        vbuckets=self.cluster.vbuckets,
        expected_val=verification_dict)
    if failed:
        self.log_failure("Cbstat vbucket-details verification failed")
    self.validate_test_failure()

    if self.create_index_during == "after_doc_ops":
        self.create_gsi_indexes(def_bucket)
        self.validate_indexed_doc_count(def_bucket, index_item_count)

    self.log.info("Verify aborts are not indexed")
    self.validate_indexed_count_from_stats(def_bucket,
                                           expected_num_indexed,
                                           index_item_count)

    if not self.use_gsi_for_primary:
        self.log.info("Wait for any indexing activity to complete")
        index_monitor_task = self.cluster_util.async_monitor_active_task(
            self.cluster.master,
            "indexer",
            "_design/ddl_#primary",
            num_iteration=20,
            wait_task=True)[0]
        self.task_manager.get_task_result(index_monitor_task)
        self.assertTrue(index_monitor_task.result,
                        "Indexer task still running on server")

    for server in kv_nodes:
        if self.sync_write_abort_pattern == "initial_aborts":
            load_gen["ADD"][server] = load_gen["ADD"][server][:1]
            load_gen["SET"][server] = load_gen["SET"][server][:1]
        elif self.sync_write_abort_pattern == "aborts_at_end":
            load_gen["ADD"][server] = load_gen["ADD"][server][-1:]
            load_gen["SET"][server] = load_gen["SET"][server][-1:]

    self.log.info("Load sync_write docs such that they are successful")
    for server in kv_nodes:
        for gen_load in load_gen["ADD"][server]:
            task = self.task.async_load_gen_docs(
                self.cluster, def_bucket, gen_load, "create", 0,
                batch_size=50, process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)
            if len(task.fail.keys()) != 0:
                self.log_failure("Some failures seen during doc_ops")
            index_item_count["#primary"] += crud_batch_size
            index_item_count["durable_add_aborts"] += crud_batch_size
            expected_num_indexed["#primary"] += crud_batch_size
            expected_num_indexed["durable_add_aborts"] += crud_batch_size
            self.validate_indexed_doc_count(def_bucket, index_item_count)

        for gen_load in load_gen["SET"][server]:
            task = self.task.async_load_gen_docs(
                self.cluster, def_bucket, gen_load, "update", 0,
                batch_size=50, process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)
            if len(task.fail.keys()) != 0:
                self.log_failure("Some failures seen during doc_ops")
            index_item_count["durable_add_aborts"] -= crud_batch_size
            index_item_count["durable_set_aborts"] += crud_batch_size
            expected_num_indexed["#primary"] += crud_batch_size
            expected_num_indexed["durable_add_aborts"] += crud_batch_size
            expected_num_indexed["durable_set_aborts"] += crud_batch_size
            self.validate_indexed_doc_count(def_bucket, index_item_count)

    self.log.info("Validate the mutated docs are taken into indexing")
    self.validate_indexed_count_from_stats(def_bucket,
                                           expected_num_indexed,
                                           index_item_count)
    self.validate_test_failure()
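# Hedged sketch (not framework code): how the target_vbucket filtering used
# above can be reproduced by hand. Couchbase maps a key to a vbucket by
# taking CRC32 of the key, shifting right 16 bits, masking to 15 bits and
# taking it modulo the vbucket count; the generator keeps only keys landing
# on the wanted (here: replica) vbuckets. The helper name and key format are
# illustrative assumptions.
import zlib


def keys_for_vbuckets(prefix, wanted_vbs, how_many, num_vbuckets=1024):
    """Generate keys that hash onto the given vbucket ids."""
    wanted = set(wanted_vbs)
    assert wanted, "need at least one target vbucket"
    keys, suffix = [], 0
    while len(keys) < how_many:
        key = "%s-%d" % (prefix, suffix)
        digest = (zlib.crc32(key.encode("utf-8")) & 0xffffffff) >> 16
        if (digest & 0x7fff) % num_vbuckets in wanted:
            keys.append(key)
        suffix += 1
    return keys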
def test_flush_bucket_during_rollback(self):
    '''
    Test focus: Stop persistence on the nodes one at a time and trigger
    rollback on the other nodes; flush the bucket while the rollback is
    in progress. The above steps are repeated num_rollbacks
    (variable defined in the test) times.

    STEPS:
     -- Ensure creation of at least a single state file
     -- The steps below are repeated on all nodes, stopping persistence
        on one node at a time
     -- Stop persistence on node x
     -- Start load on node x for a given duration
        (self.duration * 60 seconds)
     -- The above step ensures creation of new state files
        (# equal to self.duration)
     -- Kill memcached on node x
     -- Trigger rollback on the other/replica nodes
     -- Flush the bucket during the rollback
     -- Restart persistence on node x
     -- Repeat all the above steps num_rollbacks times
    '''
    self.assertTrue(self.rest.update_autofailover_settings(False, 600),
                    "AutoFailover disabling failed")
    items = copy.deepcopy(self.init_items_per_collection)
    mem_only_items = self.input.param("rollback_items", 10000)
    ops_len = len(self.doc_ops.split(":"))

    if self.nodes_init < 2 or self.num_replicas < 1:
        self.fail("Not enough nodes/replicas in the cluster/bucket "
                  "to test rollback")

    self.duration = self.input.param("duration", 2)
    self.num_rollbacks = self.input.param("num_rollbacks", 3)

    #######################################################################
    '''
    STEP - 1, Ensure creation of at least one snapshot

    To ensure at least one snapshot gets created before the rollback
    starts, sleep for 60 seconds, since magma (by design) creates a
    state file every 60 seconds.
    '''
    self.sleep(60, "Ensures creation of at least one snapshot")

    #######################################################################
    '''
    STEP - 2, Stop persistence on node - x
    '''
    for i in range(1, self.num_rollbacks+1):
        self.log.info("Roll back Iteration == {}".format(i))
        start = items
        for x, node in enumerate(self.cluster.nodes_in_cluster):
            shell = RemoteMachineShellConnection(node)
            cbstats = Cbstats(shell)
            self.target_vbucket = cbstats.vbucket_list(
                self.cluster.buckets[0].name)
            mem_item_count = 0

            # Stopping persistence on Node-x
            self.log.debug(
                "Iteration == {}, stopping persistence on Node-{}, ip={}"
                .format(i, x+1, node))
            Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                       "stop")

            ###############################################################
            '''
            STEP - 3
              -- Load documents on node x for self.duration * 60 seconds
              -- This step ensures creation of new state files
                 (number equal to self.duration)
            '''
            self.compute_docs(start, mem_only_items)
            self.gen_create = None
            self.gen_update = None
            self.gen_delete = None
            self.gen_expiry = None
            time_end = time.time() + 60 * self.duration
            itr = 0
            while time.time() < time_end:
                itr += 1
                time_start = time.time()
                mem_item_count += mem_only_items * ops_len
                self.generate_docs(doc_ops=self.doc_ops,
                                   target_vbucket=self.target_vbucket)
                self.loadgen_docs(_sync=True,
                                  retry_exceptions=self.retry_exceptions)
                if self.gen_create is not None:
                    self.create_start = self.gen_create.key_counter
                if self.gen_update is not None:
                    self.update_start = self.gen_update.key_counter
                if self.gen_delete is not None:
                    self.delete_start = self.gen_delete.key_counter
                if self.gen_expiry is not None:
                    self.expiry_start = self.gen_expiry.key_counter

                if time.time() < time_start + 60:
                    self.log.info(
                        "Rollback Iteration == {}, itr == {}, "
                        "Active-Node == {}, Node == {}"
                        .format(i, itr, x+1, node))
                    self.sleep(time_start + 60 - time.time(),
                               "Sleep to ensure creation of state files "
                               "for rollback")
                    self.log.info("state files == {}".format(
                        self.get_state_files(self.buckets[0])))

            ep_queue_size_map = {node: mem_item_count}
            if self.durability_level:
                self.log.info("Doubling the num_items-on-disk check "
                              "due to durability")
                ep_queue_size_map = {node: mem_item_count * 2}
            vb_replica_queue_size_map = {node: 0}

            for nod in self.cluster.nodes_in_cluster:
                if nod != node:
                    ep_queue_size_map.update({nod: 0})
                    vb_replica_queue_size_map.update({nod: 0})

            for bucket in self.cluster.buckets:
                self.bucket_util._wait_for_stat(bucket,
                                                ep_queue_size_map,
                                                timeout=1200)
                self.bucket_util._wait_for_stat(
                    bucket, vb_replica_queue_size_map,
                    cbstat_cmd="all",
                    stat_name="vb_replica_queue_size",
                    timeout=1200)
            # replica vBuckets
            for bucket in self.cluster.buckets:
                self.log.debug(cbstats.failover_stats(bucket.name))

            ###############################################################
            '''
            STEP - 4
              -- Kill memcached on node x and trigger rollback on the
                 other nodes
              -- After 20 seconds, flush the bucket
            '''
            shell.kill_memcached()
            self.sleep(20, "Sleep after killing memcached")
            self.bucket_util.flush_bucket(self.cluster,
                                          self.cluster.buckets[0])

            ###############################################################
            '''
            STEP - 5
              -- Restart persistence on node x
            '''
            self.assertTrue(self.bucket_util._wait_warmup_completed(
                [self.cluster.master], self.cluster.buckets[0],
                wait_time=self.wait_timeout * 10))
            self.log.debug("Iteration == {}, restarting persistence "
                           "on node -- {}".format(i, node))
            Cbepctl(shell).persistence(self.cluster.buckets[0].name,
                                       "start")
            self.sleep(5, "Sleep after restarting persistence, "
                          "Iteration {}".format(i))
            shell.disconnect()

        ###################################################################
        '''
        STEP - 6
          -- Load docs on all the nodes
          -- Load docs for 60 seconds
          -- Ensures creation of a new state file
        '''
        self.create_start = 0
        self.create_end = self.init_items_per_collection
        self.generate_docs(doc_ops="create", target_vbucket=None)
        self.loadgen_docs(self.retry_exceptions, self.ignore_exceptions,
                          _sync=True, doc_ops="create")
        self.bucket_util._wait_for_stats_all_buckets(
            self.cluster, self.cluster.buckets, timeout=1200)
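# Hedged sketch of the rollback trigger the test above relies on, factored
# into a single helper. It reuses RemoteMachineShellConnection and Cbepctl
# exactly as they are used in the test; load_fn is an illustrative callback
# supplied by the caller for the memory-only load, not framework API.
def trigger_rollback_on_replicas(node, bucket_name, load_fn):
    """Stop persistence, load memory-only items, then kill memcached.

    The unpersisted items are lost on the kill; replicas that already
    received them via DCP must roll back to the last persisted snapshot.
    Restarting persistence (after warmup) is left to the caller, as in
    the test above.
    """
    shell = RemoteMachineShellConnection(node)
    try:
        # 1. From now on, mutations stay in memory only on this node
        Cbepctl(shell).persistence(bucket_name, "stop")
        # 2. Caller loads docs targeting this node's vbuckets
        load_fn()
        # 3. Kill memcached: memory-only items vanish, replicas roll back
        shell.kill_memcached()
    finally:
        shell.disconnect()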
def rebalance_out_with_queries(self):
    num_views = self.input.param("num_views", 5)
    is_dev_ddoc = self.input.param("is_dev_ddoc", False)
    ddoc_name = "ddoc1"
    prefix = ("", "dev_")[is_dev_ddoc]
    query = dict()
    query["connectionTimeout"] = 60000
    query["full_set"] = "true"
    views = list()
    tasks = list()
    if self.test_abort_snapshot:
        self.log.info("Creating sync_write abort scenario for replica vbs")
        for server in self.cluster_util.get_kv_nodes(self.cluster):
            ssh_shell = RemoteMachineShellConnection(server)
            cbstats = Cbstats(ssh_shell)
            replica_vbs = cbstats.vbucket_list(
                self.cluster.buckets[0].name, "replica")
            load_gen = doc_generator(self.key, 0, 5000,
                                     target_vbucket=replica_vbs)
            success = self.bucket_util.load_durable_aborts(
                ssh_shell, [load_gen], self.cluster.buckets[0],
                self.durability_level, "update", "all_aborts")
            if not success:
                self.log_failure("Simulating aborts failed")
            ssh_shell.disconnect()
        self.validate_test_failure()
    for bucket in self.cluster.buckets:
        temp = self.bucket_util.make_default_views(
            self.default_view, self.default_view_name, num_views,
            is_dev_ddoc)
        temp_tasks = self.bucket_util.async_create_views(
            self.cluster.master, ddoc_name, temp, bucket)
        views += temp
        tasks += temp_tasks

    timeout = None
    if self.active_resident_threshold == 0:
        timeout = max(self.wait_timeout * 4,
                      len(self.cluster.buckets) * self.wait_timeout
                      * self.num_items / 50000)
    for task in tasks:
        self.task.jython_task_manager.get_task_result(task)

    for bucket in self.cluster.buckets:
        for view in views:
            # Run queries to create indexes
            self.bucket_util.query_view(self.cluster.master,
                                        prefix + ddoc_name,
                                        view.name, query)

    active_tasks = self.cluster_util.async_monitor_active_task(
        self.cluster.servers, "indexer",
        "_design/" + prefix + ddoc_name, wait_task=False)
    for active_task in active_tasks:
        self.task_manager.get_task_result(active_task)
        self.assertTrue(active_task.result)

    expected_rows = self.num_items
    if self.max_verify:
        expected_rows = self.max_verify
        query["limit"] = expected_rows
    query["stale"] = "false"

    for bucket in self.cluster.buckets:
        self.bucket_util.perform_verify_queries(
            self.cluster.master, num_views, prefix, ddoc_name,
            self.default_view_name, query, expected_rows,
            bucket=bucket, wait_time=timeout)

    servs_out = self.cluster.servers[-self.nodes_out:]
    rebalance = self.task.async_rebalance([self.cluster.master], [],
                                          servs_out)
    self.sleep(self.wait_timeout / 5)

    # Verify that view query results match expectations
    # while the rebalance is in progress
    for bucket in self.cluster.buckets:
        self.bucket_util.perform_verify_queries(
            self.cluster.master, num_views, prefix, ddoc_name,
            self.default_view_name, query, expected_rows,
            bucket=bucket, wait_time=timeout)

    # Verify view query results after the rebalance completes
    self.task.jython_task_manager.get_task_result(rebalance)
    self.assertTrue(rebalance.result, "Rebalance Failed")
    self.cluster.nodes_in_cluster = list(
        set(self.cluster.nodes_in_cluster) - set(servs_out))
    for bucket in self.cluster.buckets:
        self.bucket_util.perform_verify_queries(
            self.cluster.master, num_views, prefix, ddoc_name,
            self.default_view_name, query, expected_rows,
            bucket=bucket, wait_time=timeout)

    if not self.atomicity:
        self.bucket_util.verify_cluster_stats(
            self.cluster, self.num_items, timeout=self.wait_timeout)
        self.bucket_util.verify_unacked_bytes_all_buckets(self.cluster)
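# Note on the view-query parameters built above, gathered into one example
# dict for reference. "stale=false" makes the view engine index all pending
# mutations before answering, so the row counts checked during and after the
# rebalance reflect every persisted document; the values below mirror the
# ones the test sets.
example_view_query = {
    "connectionTimeout": 60000,  # ms; per-request timeout
    "full_set": "true",          # query the full data set even on dev ddocs
    "stale": "false",            # update the index before returning rows
    "limit": 100,                # cap rows, as done when max_verify is set
}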
def test_rollback_during_compaction(self):
    '''
    Test focus: Trigger rollback (by killing memcached after stopping
    persistence) while bucket compaction runs either before or after
    the kill (per the compact_before/compact_after params), and
    validate the cluster recovers.
    '''
    self.assertTrue(self.rest.update_autofailover_settings(False, 600),
                    "AutoFailover disabling failed")
    items = copy.deepcopy(self.num_items)
    mem_only_items = self.input.param("rollback_items", 10000)
    ops_len = len(self.doc_ops.split(":"))

    if self.nodes_init < 2 or self.num_replicas < 1:
        self.fail("Not enough nodes/replicas in the cluster/bucket "
                  "to test rollback")

    self.num_rollbacks = self.input.param("num_rollbacks", 1)

    #######################################################################
    '''
    STEP - 1, Stop persistence on the master node
    '''
    for i in range(1, self.num_rollbacks+1):
        self.log.info("Roll back Iteration == {}".format(i))
        start = items
        shell = RemoteMachineShellConnection(self.cluster.master)
        cbstats = Cbstats(self.cluster.master)
        self.target_vbucket = cbstats.vbucket_list(
            self.cluster.buckets[0].name)
        mem_item_count = 0

        # Stopping persistence on the master node
        self.log.debug("Stopping persistence on Node-{}"
                       .format(self.cluster.master))
        Cbepctl(shell).persistence(self.cluster.buckets[0].name, "stop")

        ###############################################################
        '''
        STEP - 2
          -- Load memory-only documents targeting the vbuckets
             on the master node
        '''
        self.compute_docs(start, mem_only_items)
        self.gen_create = None
        self.gen_update = None
        self.gen_delete = None
        self.gen_expiry = None
        mem_item_count += mem_only_items * ops_len
        self.generate_docs(doc_ops=self.doc_ops,
                           target_vbucket=self.target_vbucket)
        self.loadgen_docs(_sync=True,
                          retry_exceptions=self.retry_exceptions,
                          ignore_exceptions=self.ignore_exceptions)
        if self.gen_create is not None:
            self.create_start = self.gen_create.key_counter
        if self.gen_update is not None:
            self.update_start = self.gen_update.key_counter
        if self.gen_delete is not None:
            self.delete_start = self.gen_delete.key_counter
        if self.gen_expiry is not None:
            self.expiry_start = self.gen_expiry.key_counter

        ep_queue_size_map = {
            self.cluster.nodes_in_cluster[0]: mem_item_count}
        if self.durability_level:
            self.log.info("Doubling the num_items-on-disk check "
                          "due to durability")
            ep_queue_size_map = {
                self.cluster.nodes_in_cluster[0]: mem_item_count * 2}
        vb_replica_queue_size_map = {
            self.cluster.nodes_in_cluster[0]: 0}

        for node in self.cluster.nodes_in_cluster[1:]:
            ep_queue_size_map.update({node: 0})
            vb_replica_queue_size_map.update({node: 0})

        # for bucket in self.cluster.buckets:
        #     self.bucket_util._wait_for_stat(bucket, ep_queue_size_map,
        #                                     timeout=1200)
        #     self.bucket_util._wait_for_stat(
        #         bucket, vb_replica_queue_size_map,
        #         cbstat_cmd="all",
        #         stat_name="vb_replica_queue_size",
        #         timeout=1200)
        # # replica vBuckets
        # for bucket in self.cluster.buckets:
        #     self.log.debug(cbstats.failover_stats(bucket.name))

        ###############################################################
        '''
        STEP - 3
          -- Kill memcached on the master node and trigger rollback
             on the other nodes
          -- Run compaction before and/or after the kill, as per
             the compact_before/compact_after test params
        '''
        if self.compact_before:
            compaction_tasks = []
            for bucket in self.cluster.buckets:
                compaction_tasks.append(
                    self.task.async_compact_bucket(self.cluster.master,
                                                   bucket))
        shell.kill_memcached()

        if self.compact_after:
            self.bucket_util._run_compaction(self.cluster,
                                             number_of_times=1)
        if self.compact_before:
            for task in compaction_tasks:
                self.task_manager.get_task_result(task)

        self.assertTrue(self.bucket_util._wait_warmup_completed(
            [self.cluster.master], self.cluster.buckets[0],
            wait_time=self.wait_timeout * 10))

        ###############################################################
        '''
        STEP - 4
          -- Restart persistence on the master node
        '''
        self.log.debug("RollBack Iteration == {}, restarting persistence "
                       "on node -- {}".format(i, self.cluster.master))
        Cbepctl(shell).persistence(self.cluster.buckets[0].name, "start")
        self.sleep(5, "Sleep after restarting persistence, "
                      "Iteration {}".format(i))

        for nod in self.cluster.nodes_in_cluster:
            ep_queue_size_map.update({nod: 0})
            vb_replica_queue_size_map.update({nod: 0})

        for bucket in self.cluster.buckets:
            self.bucket_util._wait_for_stat(bucket, ep_queue_size_map,
                                            timeout=600)
            self.bucket_util._wait_for_stat(
                bucket, vb_replica_queue_size_map,
                cbstat_cmd="all",
                stat_name="vb_replica_queue_size",
                timeout=600)
        shell.disconnect()

        ###################################################################
        '''
        STEP - 5
          -- Load 100k new docs across all the nodes
          -- Ensures creation of a new state file
        '''
        self.create_start = items
        self.create_end = items + 100000
        self.generate_docs(doc_ops="create", target_vbucket=None)
        self.loadgen_docs(self.retry_exceptions, self.ignore_exceptions,
                          _sync=True, doc_ops="create")
        self.bucket_util._wait_for_stats_all_buckets(
            self.cluster, self.cluster.buckets, timeout=1200)
        items = items + 100000
        self.log.debug("Iteration == {}, Total num_items {}"
                       .format(i, items))
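# Hedged helper sketch: the expected disk-queue size used in the stat waits
# above. With a durability level set, each SyncWrite persists both a prepare
# and a commit, which is why the test doubles the expected ep_queue_size.
# The function name is illustrative, not framework API.
def expected_ep_queue_size(mem_item_count, durability_level):
    """Items expected in the disk queue while persistence is stopped."""
    if durability_level:
        return mem_item_count * 2  # prepare + commit per mutation
    return mem_item_count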
def setup_for_test(self, skip_data_loading=False):
    if not skip_data_loading:
        # Load Couchbase bucket first
        self.perform_doc_ops_in_all_cb_buckets(
            "create", 0, self.num_items,
            durability=self.durability_level)
        self.bucket_util.verify_stats_all_buckets(self.num_items)

    if self.test_abort_snapshot:
        self.log.info("Creating sync_write aborts before dataset creation")
        for server in self.cluster_util.get_kv_nodes():
            ssh_shell = RemoteMachineShellConnection(server)
            cbstats = Cbstats(ssh_shell)
            replica_vbs = cbstats.vbucket_list(
                self.bucket_util.buckets[0].name, "replica")
            load_gen = doc_generator("test_abort_key", 0, self.num_items,
                                     target_vbucket=replica_vbs)
            success = self.bucket_util.load_durable_aborts(
                ssh_shell, [load_gen], self.bucket_util.buckets[0],
                self.durability_level, "update", "all_aborts")
            if not success:
                self.log_failure("Simulating aborts failed")
            ssh_shell.disconnect()
        self.validate_test_failure()

    # Create dataset on the CBAS bucket
    self.cbas_util.create_dataset_on_bucket(
        cbas_bucket_name=self.cb_bucket_name,
        cbas_dataset_name=self.cbas_dataset_name)

    if self.test_abort_snapshot:
        self.log.info("Creating sync_write aborts after dataset creation")
        for server in self.cluster_util.get_kv_nodes():
            ssh_shell = RemoteMachineShellConnection(server)
            cbstats = Cbstats(ssh_shell)
            replica_vbs = cbstats.vbucket_list(
                self.bucket_util.buckets[0].name, "replica")
            load_gen = doc_generator("test_abort_key", 0, self.num_items,
                                     target_vbucket=replica_vbs)
            success = self.bucket_util.load_durable_aborts(
                ssh_shell, [load_gen], self.bucket_util.buckets[0],
                self.durability_level, "update", "all_aborts")
            if not success:
                self.log_failure("Simulating aborts failed")
            ssh_shell.disconnect()
        self.validate_test_failure()

    # Create indexes on the CBAS bucket
    self.create_secondary_indexes = \
        self.input.param("create_secondary_indexes", True)
    if self.create_secondary_indexes:
        self.index_fields = "profession:string,number:bigint"
        create_idx_statement = "create index {0} on {1}({2});".format(
            self.index_name, self.cbas_dataset_name, self.index_fields)
        status, metrics, errors, results, _ = \
            self.cbas_util.execute_statement_on_cbas_util(
                create_idx_statement)
        self.assertTrue(status == "success", "Create Index query failed")
        self.assertTrue(
            self.cbas_util.verify_index_created(
                self.index_name, self.index_fields.split(","),
                self.cbas_dataset_name)[0])

    # Connect to Bucket
    self.cbas_util.connect_to_bucket(
        cbas_bucket_name=self.cbas_bucket_name,
        cb_bucket_password=self.cb_bucket_password)

    if self.test_abort_snapshot:
        self.log.info("Creating sync_write aborts after dataset connect")
        for server in self.cluster_util.get_kv_nodes():
            ssh_shell = RemoteMachineShellConnection(server)
            cbstats = Cbstats(ssh_shell)
            replica_vbs = cbstats.vbucket_list(
                self.bucket_util.buckets[0].name, "replica")
            load_gen = doc_generator("test_abort_key", 0, self.num_items,
                                     target_vbucket=replica_vbs)
            success = self.bucket_util.load_durable_aborts(
                ssh_shell, [load_gen], self.bucket_util.buckets[0],
                self.durability_level, "update", "all_aborts")
            if not success:
                self.log_failure("Simulating aborts failed")
            ssh_shell.disconnect()
        self.validate_test_failure()

    if not skip_data_loading:
        # Validate no. of items in the CBAS dataset
        if not self.cbas_util.validate_cbas_dataset_items_count(
                self.cbas_dataset_name, self.num_items):
            self.fail("No. of items in CBAS dataset does not match "
                      "that in the CB bucket")
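# For reference, a hedged illustration of what the create-index statement
# built in setup_for_test expands to. The index/dataset names are made up
# for the example; only the statement shape and the field list come from
# the code above.
example_idx_statement = "create index {0} on {1}({2});".format(
    "idx1", "ds1", "profession:string,number:bigint")
# -> "create index idx1 on ds1(profession:string,number:bigint);"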
def common_test_body(self, failover_reason, rebalance_type=None):
    """
    Main test body which contains the flow of the basic failover steps
    1. Start operations if programmed into the test case (before/after)
    2. Start view and index building operations
    3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
    4.1 Rebalance the cluster after the failover of the K nodes
    4.2 Run add-back operation with recoveryType = (full/delta)
        with rebalance
    5. Verify all expected operations completed by checking stats,
       replication, views and data correctness
    """
    # Pick the reference node for communication
    # We pick a node in the cluster which will NOT be failed over
    self.filter_list = []
    if self.failoverMaster:
        self.master = self.cluster.servers[1]
    else:
        self.master = self.cluster.master
    self.log.info("Picking node {0} as reference node for test case"
                  .format(self.master.ip))
    self.print_test_params(failover_reason)
    self.rest = RestConnection(self.master)
    self.nodes = self.rest.node_statuses()
    # Set the data path for the cluster
    self.data_path = self.rest.get_data_path()

    # Variable to decide the durability outcome
    durability_will_fail = False
    # Variable to track the number of nodes failed
    num_nodes_failed = 1

    # Check if the test case has to be run for 3.0.0
    versions = self.rest.get_nodes_versions()
    self.version_greater_than_2_5 = True
    for version in versions:
        # Lexical compare: any "2.x" version string sorts before "3"
        if "3" > version:
            self.version_greater_than_2_5 = False

    # Do not run this test if the graceful category is being used
    if not self.version_greater_than_2_5 \
            and (self.graceful or self.recoveryType is not None):
        self.log.error(
            "Can't apply graceful failover to nodes with version < 3.*")
        self.log.error("Please check configuration params: SKIPPING TEST")
        return

    # Find nodes that will undergo failover
    if self.failoverMaster:
        self.chosen = RebalanceHelper.pick_nodes(
            self.master, howmany=1, target_node=self.servers[0])
    else:
        self.chosen = RebalanceHelper.pick_nodes(
            self.master, howmany=self.num_failed_nodes)

    # Perform operations - Create/Update/Delete
    # self.withMutationOps = True => Run operations in parallel to failover
    # self.withMutationOps = False => Run operations before failover
    self.load_initial_data()
    if not self.withMutationOps:
        self.run_mutation_operations()

    # Perform view creation tasks and
    # check for completion if required before failover
    if self.withViewsOps:
        self.run_view_creation_operations(self.servers)
        if not self.createIndexesDuringFailover:
            self.query_and_monitor_view_tasks(self.servers)

    # Take a snapshot of the data set used for validation
    record_static_data_set = {}
    prev_vbucket_stats = {}
    prev_failover_stats = {}
    if not self.withMutationOps:
        record_static_data_set = self.bucket_util.get_data_set_all(
            self.cluster.servers, self.buckets, path=None)

    # Capture vbucket and failover stats if test version >= 2.5.*
    if self.version_greater_than_2_5 and self.upr_check:
        prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
            self.servers, self.buckets)
        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.servers, self.buckets)

    # Perform operations related to failover
    if self.withMutationOps or self.withViewsOps or self.compact:
        self.run_failover_operations_with_ops(self.chosen,
                                              failover_reason)
    else:
        self.run_failover_operations(self.chosen, failover_reason)

    target_bucket = self.bucket_util.buckets[0]

    # Update new_replica value, if provided in the conf
    if self.new_replica:
        self.num_replicas = self.new_replica
        bucket_helper = BucketHelper(self.master)
        bucket_helper.change_bucket_props(
            target_bucket.name, replicaNumber=self.num_replicas)

    # Decide whether the durability is going to fail or not
    if self.num_failed_nodes >= 1 and self.num_replicas > 1:
        durability_will_fail = True

    # Construct target vbucket list from the nodes
    # which are going to be failed over
    vbucket_list = list()
    for target_node in self.chosen:
        shell_conn = RemoteMachineShellConnection(target_node)
        cb_stats = Cbstats(shell_conn)
        vbuckets = cb_stats.vbucket_list(target_bucket.name,
                                         self.target_vbucket_type)
        shell_conn.disconnect()
        vbucket_list += vbuckets

    # Code to generate doc_loaders that will work on vbucket_type
    # based on targeted nodes. This will perform CRUD only on
    # vbuckets which will be affected by the failover
    self.gen_create = doc_generator(
        self.key, self.num_items, self.num_items * 1.5,
        target_vbucket=vbucket_list)
    self.gen_update = doc_generator(
        self.key, self.num_items / 2, self.num_items,
        target_vbucket=vbucket_list)
    self.gen_delete = doc_generator(
        self.key, self.num_items / 4, self.num_items / 2 - 1,
        target_vbucket=vbucket_list)
    self.afterfailover_gen_create = doc_generator(
        self.key, self.num_items * 1.6, self.num_items * 2,
        target_vbucket=vbucket_list)
    self.afterfailover_gen_update = doc_generator(
        self.key, 1, self.num_items / 4,
        target_vbucket=vbucket_list)
    self.afterfailover_gen_delete = doc_generator(
        self.key, self.num_items * 0.5, self.num_items * 0.75,
        target_vbucket=vbucket_list)

    # Perform add-back operation with rebalance,
    # or only rebalance with verifications
    if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
        if self.failover_onebyone:
            # Reset it back to False
            durability_will_fail = False
            for node_chosen in self.chosen:
                if num_nodes_failed > 1:
                    durability_will_fail = True
                if self.add_back_flag:
                    # In add-back case, durability should never fail, since
                    # the num_nodes in the cluster will remain the same
                    self.run_add_back_operation_and_verify(
                        [node_chosen], prev_vbucket_stats,
                        record_static_data_set, prev_failover_stats,
                        rebalance_type=rebalance_type)
                else:
                    self.run_rebalance_after_failover_and_verify(
                        [node_chosen], prev_vbucket_stats,
                        record_static_data_set, prev_failover_stats,
                        durability_will_fail=durability_will_fail)
                num_nodes_failed += 1
        else:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(
                    self.chosen, prev_vbucket_stats,
                    record_static_data_set, prev_failover_stats,
                    durability_will_fail=durability_will_fail,
                    rebalance_type=rebalance_type)
            else:
                self.run_rebalance_after_failover_and_verify(
                    self.chosen, prev_vbucket_stats,
                    record_static_data_set, prev_failover_stats,
                    durability_will_fail=durability_will_fail)
    else:
        return

    # Verify unacked_bytes only if the durability is not going to fail
    if self.during_ops is None and not durability_will_fail:
        self.bucket_util.verify_unacked_bytes_all_buckets(
            filter_list=self.filter_list)
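# Hedged sketch: the key ranges handed to the six doc_generators above,
# gathered in one place for readability. The ranges are chosen so that
# creates land on fresh keys while updates/deletes touch separate slices of
# the initial load, letting later verification attribute mutations to the
# right phase. The function name is illustrative, not framework code.
def failover_crud_ranges(num_items):
    """Return {generator: (start, end)} mirroring common_test_body."""
    return {
        "create": (num_items, num_items * 1.5),
        "update": (num_items / 2, num_items),
        "delete": (num_items / 4, num_items / 2 - 1),
        "after_failover_create": (num_items * 1.6, num_items * 2),
        "after_failover_update": (1, num_items / 4),
        "after_failover_delete": (num_items * 0.5, num_items * 0.75),
    }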
def test_create_remove_collection_with_node_crash(self):
    """
    1. Select an error scenario to simulate at random
    2. Create the error scenario either before or after the
       collection action
    3. Initiate collection creation/deletion under the bucket
    4. Validate the outcome of the collection creation/deletion
    """
    def create_collection(client_type, bucket_obj, scope, collection):
        if client_type == "sdk":
            client.create_collection(collection, scope)
            self.bucket_util.create_collection_object(
                bucket_obj, scope, {"name": collection})
        elif client_type == "rest":
            self.bucket_util.create_collection(self.cluster.master,
                                               bucket_obj,
                                               scope,
                                               {"name": collection})
        else:
            self.log_failure("Invalid client_type provided")

    def remove_collection(client_type, bucket_obj, scope, collection):
        if client_type == "sdk":
            client.drop_collection(scope, collection)
            self.bucket_util.mark_collection_as_dropped(
                bucket_obj, scope, collection)
        elif client_type == "rest":
            self.bucket_util.drop_collection(self.cluster.master,
                                             bucket_obj, scope,
                                             collection)
        else:
            self.log_failure("Invalid client_type provided")

    kv_nodes = self.cluster_util.get_kv_nodes()
    if len(kv_nodes) == 1:
        self.fail("Need at least two KV nodes to run this test")

    client = None
    task = None
    action = self.input.param("action", "create")
    crash_during = self.input.param("crash_during", "pre_action")
    data_load_option = self.input.param("data_load_option", None)
    crash_type = self.input.param("simulate_error",
                                  CouchbaseError.KILL_MEMCACHED)

    if self.scope_name != CbServer.default_scope:
        self.scope_name = \
            BucketUtils.get_random_name(
                max_length=CbServer.max_scope_name_len)
        self.bucket_util.create_scope(self.cluster.master, self.bucket,
                                      {"name": self.scope_name})
    if self.collection_name != CbServer.default_collection:
        self.collection_name = \
            BucketUtils.get_random_name(
                max_length=CbServer.max_collection_name_len)

    # Select a KV node other than master node from the cluster
    node_to_crash = kv_nodes[sample(range(1, len(kv_nodes)), 1)[0]]

    client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
    use_client = sample(["sdk", "rest"], 1)[0]

    if action == "remove" \
            and self.collection_name != CbServer.default_collection:
        # Create a collection to be removed
        create_collection(use_client, self.bucket,
                          self.scope_name, self.collection_name)

    # Create an error scenario
    self.log.info("Selected scenario for test '%s'" % crash_type)
    shell = RemoteMachineShellConnection(node_to_crash)
    cb_error = CouchbaseError(self.log, shell)
    cbstat_obj = Cbstats(shell)
    active_vbs = cbstat_obj.vbucket_list(self.bucket.name,
                                         vbucket_type="active")
    target_vbuckets = list(
        set(range(0, 1024)).difference(set(active_vbs)))
    doc_gen = doc_generator(self.key, 0, 1000,
                            target_vbucket=target_vbuckets)

    if crash_during == "pre_action":
        cb_error.create(crash_type)

    if data_load_option == "mutate_default_collection":
        task = self.task.async_load_gen_docs(
            self.cluster, self.bucket, doc_gen,
            DocLoading.Bucket.DocOps.UPDATE,
            exp=self.maxttl,
            batch_size=200, process_concurrency=8,
            compression=self.sdk_compression,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)

    if action == "create":
        create_collection(self.client_type, self.bucket,
                          self.scope_name, self.collection_name)
    elif action == "remove":
        remove_collection(self.client_type, self.bucket,
                          self.scope_name, self.collection_name)

    if crash_during == "post_action":
        cb_error.create(crash_type)

    if data_load_option == "mutate_default_collection":
        self.task_manager.get_task_result(task)

    self.sleep(60, "Wait before reverting the error scenario")
    cb_error.revert(crash_type)

    # Close SSH and SDK connections
    shell.disconnect()
    if self.atomicity is False:
        self.bucket_util.validate_docs_per_collections_all_buckets(
            self.cluster)
    self.validate_test_failure()
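# Hedged helper sketch of the vbucket targeting used above: load only the
# vbuckets that are NOT active on the node being crashed, so the data load
# can proceed while that node is down. 1024 is the default vbucket count;
# the function name is illustrative, not framework API.
def non_active_vbuckets(active_vbs, num_vbuckets=1024):
    """VBucket ids whose active copy lives on some other node."""
    return list(set(range(num_vbuckets)) - set(active_vbs))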
def online_swap(self, node_to_upgrade, version):
    vb_details = dict()
    vb_verification = dict()
    vb_types = ["active", "replica"]

    # Fetch active services on node_to_upgrade
    rest = self.__get_rest_node(node_to_upgrade)
    services = rest.get_nodes_services()
    services_on_target_node = services[(node_to_upgrade.ip
                                        + ":"
                                        + node_to_upgrade.port)]

    # Record vbuckets in swap_node
    if "kv" in services_on_target_node:
        shell = RemoteMachineShellConnection(node_to_upgrade)
        cbstats = Cbstats(shell)
        for vb_type in vb_types:
            vb_details[vb_type] = \
                cbstats.vbucket_list(self.bucket.name, vb_type)
        shell.disconnect()

    # Install target version on spare node
    self.install_version_on_node([self.spare_node], version)

    # Add the spare node and swap-rebalance out the node to upgrade
    rest.add_node(self.creds.rest_username,
                  self.creds.rest_password,
                  self.spare_node.ip,
                  self.spare_node.port,
                  services=services_on_target_node)
    eject_otp_node = self.__get_otp_node(rest, node_to_upgrade)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=[eject_otp_node.id])
    self.sleep(5, "Wait for rebalance to start")
    rebalance_passed = rest.monitorRebalance()
    if not rebalance_passed:
        self.log_failure(
            "Swap rebalance failed during upgrade of {0}".format(
                node_to_upgrade))
        return

    # VBuckets shuffling verification
    if "kv" in services_on_target_node:
        # Fetch vbucket stats after swap rebalance for verification
        shell = RemoteMachineShellConnection(self.spare_node)
        cbstats = Cbstats(shell)
        for vb_type in vb_types:
            vb_verification[vb_type] = \
                cbstats.vbucket_list(self.bucket.name, vb_type)
        shell.disconnect()

        # Check whether vbuckets got shuffled. Note: sorted() is used
        # (not list.sort(), which sorts in place and returns None) so
        # the comparison is order-insensitive and meaningful
        for vb_type in vb_types:
            if sorted(vb_details[vb_type]) \
                    != sorted(vb_verification[vb_type]):
                self.log_failure(
                    "%s vbuckets shuffled post swap_rebalance" % vb_type)
                self.log.error("%s vbuckets before vs after: %s != %s"
                               % (vb_type,
                                  vb_details[vb_type],
                                  vb_verification[vb_type]))

    # Update spare_node to rebalanced-out node
    self.spare_node = node_to_upgrade
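# Note on the comparison fixed above: the original code compared
# "a.sort() != b.sort()", but list.sort() sorts in place and returns None,
# so that expression always compares None with None and can never fail.
# A minimal standalone illustration of the corrected, order-insensitive
# check (helper name is ours, for illustration only):
def vb_lists_match(before, after):
    """True when both vbucket id lists hold the same ids, order ignored."""
    return sorted(before) == sorted(after)


# Example: vb_lists_match([3, 1, 2], [1, 2, 3]) -> True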