def perform_operation_during_bucket_warmup(self, during_warmup="default"):
    """
    Stop memcached on the master node, run one scope/collection operation
    (or a random load) while it is stopped, then resume memcached and
    validate the per-collection doc counts.

    :param during_warmup: Operation to run while memcached is stopped.
        One of "create_scope", "drop_scope", "create_collection" or
        "drop_collection". Any other value (including "default") runs
        self.random_load(), which is expected to raise.
    """
    # stop memcached in master node
    shell_conn = RemoteMachineShellConnection(self.cluster.master)
    self.error_sim = CouchbaseError(self.log, shell_conn)
    self.error_sim.create(CouchbaseError.STOP_MEMCACHED)
    self.log.info("memcached stopped on master node")

    if during_warmup == "create_scope":
        self.scope_name = self.bucket_util.get_random_name()
        self.create_scope()
        # Message previously read "drop scope succeeded" (copy-paste slip)
        self.log.info("create scope succeeded")
    elif during_warmup == "drop_scope":
        # Pick a random scope; retry up to 5 times to avoid "_default".
        # NOTE(review): if every retry returns "_default", drop_scope()
        # is still attempted on it - confirm this is intended.
        retry = 5
        while retry > 0:
            scope_dict = self.bucket_util.get_random_scopes(
                self.bucket_util.buckets, 1, 1)
            self.scope_name = scope_dict[
                self.bucket.name]["scopes"].keys()[0]
            if self.scope_name != "_default":
                break
            retry -= 1
        self.drop_scope()
        self.log.info("drop scope succeeded")
    elif during_warmup == "create_collection":
        self.collection_name = self.bucket_util.get_random_name()
        self.create_collection()
        self.log.info("create collection succeeded")
    elif during_warmup == "drop_collection":
        collections = self.bucket_util.get_random_collections(
            self.bucket_util.buckets, 1, 1, 1)
        scope_dict = collections[self.bucket.name]["scopes"]
        self.scope_name = scope_dict.keys()[0]
        self.collection_name = scope_dict[
            self.scope_name]["collections"].keys()[0]
        self.drop_collection()
        self.log.info("drop collection succeeded")
    else:
        try:
            # The load is expected to fail while memcached is stopped
            self.random_load()
            self.log_failure("random operation succeeded")
        except Exception as e:
            self.log.info(e)
            # Resume memcached and retry the load on a healthy node
            self.error_sim.revert(CouchbaseError.STOP_MEMCACHED)
            self.random_load()

    # Resume memcached (repeated revert on the path that already resumed)
    self.error_sim.revert(CouchbaseError.STOP_MEMCACHED)
    self.bucket_util.validate_docs_per_collections_all_buckets()
    self.validate_test_failure()
def test_with_server_stopped(self):
    """
    1. Disable auto-failover in the cluster
    2. Stop few servers on the cluster
    3. Run cb_collect_info on all nodes
    4. Make sure cb_collect works for stopped nodes as well

    Test params read:
      affect_nodes_with_service - ';' separated services (default "kv")
      num_nodes_to_affect       - number of nodes to stop (default 1)
    """
    collect_timeout = 300

    service_to_stop = self.input.param("affect_nodes_with_service",
                                       "kv").split(";")
    num_nodes_to_affect = self.input.param("num_nodes_to_affect", 1)
    nodes_in_cluster = self.__get_server_nodes()
    nodes_to_stop = sample(self.__get_server_nodes(service_to_stop),
                           num_nodes_to_affect)

    # Disable auto-failover to avoid failover of nodes
    status = RestConnection(self.cluster.master) \
        .update_autofailover_settings(False, 120, False)
    self.assertTrue(status, msg="Failure during disabling auto-failover")

    self.log.info("Nodes to stop - %s" % nodes_to_stop)
    for node in nodes_to_stop:
        cb_error = CouchbaseError(self.log, self.node_data[node]["shell"])
        self.node_data[node]["cb_error"] = cb_error
        self.node_data[node]["cb_error"].create(CouchbaseError.STOP_SERVER)

    # Start one cbcollect thread per node so they all run in parallel
    for node in nodes_in_cluster:
        self.node_data[node]["cb_collect_task"] = Thread(
            target=self.cluster_util.run_cb_collect,
            args=[node, self.node_data[node]["cb_collect_file"]],
            kwargs={
                "options": "",
                "result": self.node_data[node]["cb_collect_result"]
            })
        self.node_data[node]["cb_collect_task"].start()

    for node in nodes_in_cluster:
        try:
            t_node = self.node_data[node]
            t_node["cb_collect_task"].join(collect_timeout)
            # Thread.join() returns silently on timeout; the only way to
            # detect a timed-out collection is to check thread liveness
            if t_node["cb_collect_task"].is_alive():
                self.log_failure(
                    "%s cbcollect_info timed-out: %s"
                    % (node.ip,
                       "not completed in %s seconds" % collect_timeout))
                continue
            if str(t_node["cb_collect_result"]["file_size"]) == "0":
                self.log_failure("%s - cbcollect file size is zero"
                                 % node.ip)
        except RuntimeError as e:
            # Raised by join() for an unstarted/current thread
            self.log_failure("%s cbcollect_info timed-out: %s"
                             % (node.ip, e))

    # Restarting stopped nodes
    for node in nodes_to_stop:
        self.node_data[node]["cb_error"].revert(CouchbaseError.STOP_SERVER)

    self.bucket_util.is_warmup_complete(self.bucket_util.buckets)
    self.validate_test_failure()
def induce_and_revert_failure(self, action):
    """
    Simulate the given error on the last server, wait 20 seconds and
    revert it when the error is a 'stop' type.

    :param action: CouchbaseError action to simulate on the node
    """
    target_node = self.servers[-1]  # select last node
    remote = RemoteMachineShellConnection(target_node)
    try:
        error_sim = CouchbaseError(self.log, remote)
        error_sim.create(action)
        self.sleep(20, "Wait before reverting the error condition")

        if action in [CouchbaseError.STOP_MEMCACHED,
                      CouchbaseError.STOP_PROMETHEUS]:
            # Revert the simulated error condition explicitly.
            # For the kill variants (memcached, prometheus) the babysitter
            # brings the process back automatically
            error_sim.revert(action)
    finally:
        # Always release the ssh session, even if create/revert raised
        remote.disconnect()
def stop_process(self):
    """
    Stop memcached on self.servers[2] for 20 seconds and resume it.

    The revert and the ssh disconnect are both guaranteed to run, so a
    failure in between does not leave the node stopped or the session
    open.
    """
    target_node = self.servers[2]
    remote = RemoteMachineShellConnection(target_node)
    try:
        error_sim = CouchbaseError(self.log, remote)
        error_to_simulate = "stop_memcached"
        try:
            # Induce the error condition
            error_sim.create(error_to_simulate)
            self.sleep(20, "Wait before reverting the error condition")
        finally:
            # Revert the simulated error condition
            error_sim.revert(error_to_simulate)
    finally:
        # Close the ssh session
        remote.disconnect()
def test_with_sync_write(self):
    """
    Run a durable write and a non-durable write from parallel threads
    while an error is simulated on a randomly chosen KV node.

    self.doc_ops[0] is executed with self.durability_level and
    self.doc_ops[1] without durability. Documents are pre-created for
    any op that is not itself a create. The error is reverted once the
    non-durable op finishes, after which the durable op is joined.
    """
    create_op = DocLoading.Bucket.DocOps.CREATE

    cluster_node = choice(self.kv_nodes)
    target_vb_type, simulate_error = \
        DurabilityHelper.get_vb_and_error_type(self.durability_level)
    node_info = self.node_data[cluster_node]

    doc_gen = doc_generator(
        self.key, 0, 2,
        target_vbucket=node_info["%s_vbs" % target_vb_type])
    client = self.sdk_client_pool.get_client_for_bucket(
        self.bucket, self.scope_name, self.collection_name)

    sync_key, sync_val = doc_gen.next()
    async_key, async_val = doc_gen.next()

    # Docs must already exist for any op other than create
    for index, (doc_key, doc_val) in enumerate([(sync_key, sync_val),
                                                (async_key, async_val)]):
        if self.doc_ops[index] != create_op:
            client.crud(create_op, doc_key, doc_val)

    sync_kwargs = {
        "value": sync_val,
        "durability": self.durability_level,
        "expected_thread_val": 1
    }
    async_kwargs = {
        "value": async_val,
        "expected_thread_val": 0
    }
    sync_op = Thread(target=self.crud,
                     args=[client, self.doc_ops[0], sync_key],
                     kwargs=sync_kwargs)
    async_op = Thread(target=self.crud,
                      args=[client, self.doc_ops[1], async_key],
                      kwargs=async_kwargs)

    cb_err = CouchbaseError(self.log, node_info["shell"])
    cb_err.create(simulate_error, self.bucket.name)

    # Start doc_ops
    sync_op.start()
    self.sleep(1, "Wait before async operation")
    async_op.start()

    # Wait for the non-durable op first, then lift the error so that
    # the durable op can complete
    async_op.join()
    cb_err.revert(simulate_error, self.bucket.name)
    sync_op.join()

    self.validate_test_failure()
def test_prometheus_and_ns_server_stats_after_crash_scenarios(self):
    """
    Collect all metrics (self.get_all_metrics) before and after killing
    self.process_name on the first server.

    The process is killed with the signal selected by self.sig_type and
    given 20 seconds to come back before metrics are collected again.
    """
    self.bucket_util.load_sample_bucket(self.cluster, TravelSample())
    target_node = self.servers[0]

    self.log.info("Before failure")
    self.get_all_metrics(self.components, self.parse, self.metric_name)

    # Open the ssh session right before the try block that closes it, so
    # a failure while collecting metrics cannot leak the connection
    remote = RemoteMachineShellConnection(target_node)
    try:
        self.log.info("Killing {0} on node {1}".format(
            self.process_name, target_node.ip))
        remote.kill_process(self.process_name, self.service_name,
                            signum=signum[self.sig_type])
        self.sleep(20, "Wait for the process to come backup")
    finally:
        remote.disconnect()

    self.log.info("After failure")
    self.get_all_metrics(self.components, self.parse, self.metric_name)
def MB36948(self):
    """
    1. Rebalance-in self.servers[2] with n1ql,index services
    2. Stop memcached on self.servers[0]
    3. Create a doc without durability - expected to succeed
    4. Update the same doc with self.durability_level and a 3 second
       timeout - expected to fail with DurabilityAmbiguousException
    5. Resume memcached and verify the bucket holds exactly one item
    """
    node_to_stop = self.servers[0]

    self.log.info("Adding index/query node")
    self.task.rebalance([self.cluster.master], [self.servers[2]], [],
                        services=["n1ql,index"])

    self.log.info("Creating SDK client connection")
    client = SDKClient([self.cluster.master],
                       self.bucket_util.buckets[0],
                       compression_settings=self.sdk_compression)
    try:
        self.log.info("Stopping memcached on: %s" % node_to_stop)
        ssh_conn = RemoteMachineShellConnection(node_to_stop)
        try:
            err_sim = CouchbaseError(self.log, ssh_conn)
            err_sim.create(CouchbaseError.STOP_MEMCACHED)
            try:
                result = client.crud("create", "abort1", "abort1_val")
                if not result["status"]:
                    self.log_failure("Async SET failed")

                result = client.crud("update", "abort1", "abort1_val",
                                     durability=self.durability_level,
                                     timeout=3, time_unit="seconds")
                if result["status"]:
                    self.log_failure("Sync write succeeded")
                # Inspect the error only for a failed write; a successful
                # result is not expected to carry an exception to match
                elif SDKException.DurabilityAmbiguousException \
                        not in result["error"]:
                    self.log_failure("Invalid exception for sync_write: %s"
                                     % result)
            finally:
                # Never leave memcached stopped, even if a CRUD raised
                self.log.info("Resuming memcached on: %s" % node_to_stop)
                err_sim.revert(CouchbaseError.STOP_MEMCACHED)

            self.bucket_util._wait_for_stats_all_buckets()
            self.bucket_util.verify_stats_all_buckets(1)
        finally:
            self.log.info("Closing ssh & SDK connections")
            ssh_conn.disconnect()
    finally:
        client.close()

    self.validate_test_failure()
def test_prometheus_and_ns_server_stats_after_failure_scenarios(self):
    """
    Collect all metrics (self.get_all_metrics) before and after
    simulating self.simulate_error on the first server for 20 seconds.
    """
    metric_args = (self.components, self.parse, self.metric_name)

    self.bucket_util.load_sample_bucket(self.cluster, TravelSample())
    node = self.servers[0]
    shell = RemoteMachineShellConnection(node)
    cb_error = CouchbaseError(self.log, shell)

    self.log.info("Before failure")
    self.get_all_metrics(*metric_args)

    try:
        # Bring the node into the error state and hold it there
        cb_error.create(self.simulate_error)
        self.sleep(20, "Wait before reverting the error condition")
    finally:
        # Restore the node and release the ssh session
        cb_error.revert(self.simulate_error)
        shell.disconnect()

    self.log.info("After failure")
    self.get_all_metrics(*metric_args)
def test_create_remove_collection_with_node_crash(self):
    """
    1. Read the error scenario to simulate ("simulate_error" param,
       defaults to CouchbaseError.KILL_MEMCACHED)
    2. Create error scenario either before or after collection action
    3. Initiate collection creation/deletion under the bucket
    4. Validate the outcome of collection creation/deletion

    Test params read:
      action           - "create" / "remove" (default "create")
      crash_during     - "pre_action" / "post_action"
                         (default "pre_action")
      data_load_option - "mutate_default_collection" to run doc updates
                         in parallel (default None)
    """
    def create_collection(client_type, bucket_obj, scope, collection):
        # Create the collection through the SDK client or the REST API
        if client_type == "sdk":
            client.create_collection(collection, scope)
            self.bucket_util.create_collection_object(bucket_obj, scope,
                                                      {"name": collection})
        elif client_type == "rest":
            self.bucket_util.create_collection(self.cluster.master,
                                               bucket_obj,
                                               scope,
                                               {"name": collection})
        else:
            self.log_failure("Invalid client_type provided")

    def remove_collection(client_type, bucket_obj, scope, collection):
        # Drop the collection through the SDK client or the REST API
        if client_type == "sdk":
            client.drop_collection(scope, collection)
            self.bucket_util.mark_collection_as_dropped(bucket_obj, scope,
                                                        collection)
        elif client_type == "rest":
            self.bucket_util.drop_collection(self.cluster.master,
                                             bucket_obj, scope, collection)
        else:
            self.log_failure("Invalid client_type provided")

    kv_nodes = self.cluster_util.get_kv_nodes()
    # '< 2' also rejects an empty node list, which would otherwise fail
    # later while sampling a non-master node
    if len(kv_nodes) < 2:
        self.fail("Need atleast two KV nodes to run this test")

    client = None
    task = None
    action = self.input.param("action", "create")
    crash_during = self.input.param("crash_during", "pre_action")
    data_load_option = self.input.param("data_load_option", None)
    crash_type = self.input.param("simulate_error",
                                  CouchbaseError.KILL_MEMCACHED)

    if self.scope_name != CbServer.default_scope:
        self.scope_name = \
            BucketUtils.get_random_name(
                max_length=CbServer.max_scope_name_len)
        self.bucket_util.create_scope(self.cluster.master, self.bucket,
                                      {"name": self.scope_name})
    if self.collection_name != CbServer.default_collection:
        self.collection_name = \
            BucketUtils.get_random_name(
                max_length=CbServer.max_collection_name_len)

    # Select a KV node other than master node from the cluster
    node_to_crash = kv_nodes[sample(range(1, len(kv_nodes)), 1)[0]]

    client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
    use_client = sample(["sdk", "rest"], 1)[0]

    if action == "remove" \
            and self.collection_name != CbServer.default_collection:
        # Create a collection to be removed
        create_collection(use_client, self.bucket,
                          self.scope_name, self.collection_name)

    # Create a error scenario
    self.log.info("Selected scenario for test '%s'" % crash_type)
    shell = RemoteMachineShellConnection(node_to_crash)
    try:
        cb_error = CouchbaseError(self.log, shell)
        cbstat_obj = Cbstats(shell)
        active_vbs = cbstat_obj.vbucket_list(self.bucket.name,
                                             vbucket_type="active")
        # Load only into vbuckets that are not active on the crashed node
        target_vbuckets = list(
            set(range(0, 1024)).difference(set(active_vbs)))
        doc_gen = doc_generator(self.key, 0, 1000,
                                target_vbucket=target_vbuckets)

        try:
            if crash_during == "pre_action":
                cb_error.create(crash_type)

            if data_load_option == "mutate_default_collection":
                task = self.task.async_load_gen_docs(
                    self.cluster, self.bucket, doc_gen,
                    DocLoading.Bucket.DocOps.UPDATE,
                    exp=self.maxttl,
                    batch_size=200, process_concurrency=8,
                    compression=self.sdk_compression,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout)

            if action == "create":
                create_collection(self.client_type, self.bucket,
                                  self.scope_name, self.collection_name)
            elif action == "remove":
                remove_collection(self.client_type, self.bucket,
                                  self.scope_name, self.collection_name)

            if crash_during == "post_action":
                cb_error.create(crash_type)

            if data_load_option == "mutate_default_collection":
                self.task_manager.get_task_result(task)

            self.sleep(60, "Wait before reverting the error scenario")
        finally:
            # Never leave the node in the simulated error state
            cb_error.revert(crash_type)
    finally:
        # Close the SSH connection
        shell.disconnect()

    if self.atomicity is False:
        self.bucket_util.validate_docs_per_collections_all_buckets(
            self.cluster)
    self.validate_test_failure()
def test_stop_process(self):
    """
    1. Start loading docs into the first bucket, targeting only the
       vbuckets selected for the target node
    2. Simulate the error given by the "simulate_error" param for
       20 seconds and revert it
    3. Wait for the load task to complete
    4. For non-atomicity runs, validate the failures are durability
       ambiguous exceptions, retry the failed docs and verify the
       final doc count
    """
    error_to_simulate = self.input.param("simulate_error", None)
    bucket = self.bucket_util.buckets[0]
    node = self.getTargetNode()
    shell = RemoteMachineShellConnection(node)
    cb_error = CouchbaseError(self.log, shell)
    target_vbuckets = self.getVbucketNumbers(shell, bucket.name,
                                             self.target_node)

    # Nothing to load without target vbuckets
    if len(target_vbuckets) == 0:
        self.log.error("No target vbucket list generated to load data")
        shell.disconnect()
        return

    # Doc generator restricted to the active/replica vbuckets present
    # on the target node
    gen_load = doc_generator(self.key, self.num_items,
                             self.new_docs_to_add,
                             key_size=self.key_size,
                             doc_size=self.doc_size,
                             doc_type=self.doc_type,
                             target_vbucket=target_vbuckets,
                             vbuckets=self.cluster_util.vbuckets)

    # Arguments shared by both the loader variants
    common_kwargs = {
        "exp": 0,
        "replicate_to": self.replicate_to,
        "persist_to": self.persist_to,
        "durability": self.durability_level,
        "timeout_secs": self.sdk_timeout
    }
    if self.atomicity:
        load_task = self.task.async_load_gen_docs_atomicity(
            self.cluster, self.bucket_util.buckets, gen_load, "create",
            batch_size=10,
            process_concurrency=self.process_concurrency,
            update_count=self.update_count,
            transaction_timeout=self.transaction_timeout,
            commit=True, sync=self.sync,
            **common_kwargs)
    else:
        load_task = self.task.async_load_gen_docs(
            self.cluster, bucket, gen_load, "create",
            batch_size=1, process_concurrency=8,
            skip_read_on_error=True,
            **common_kwargs)

    # Hold the error condition for a while, then lift it and release
    # the ssh session
    cb_error.create(error_to_simulate)
    self.sleep(20, "Wait before reverting the error condition")
    cb_error.revert(error_to_simulate)
    shell.disconnect()

    # Wait for doc loading task to complete
    self.task.jython_task_manager.get_task_result(load_task)

    if not self.atomicity:
        failures_unexpected = self.target_node == "active" \
            or self.num_replicas in [2, 3]
        if len(load_task.fail.keys()) != 0 and failures_unexpected:
            self.log_failure("Unwanted failures for keys: %s"
                             % load_task.fail.keys())

        if not self.durability_helper.validate_durability_exception(
                load_task.fail,
                SDKException.DurabilityAmbiguousException):
            self.log_failure("Unwanted exception seen during validation")

        # Retry every failed doc through a dedicated SDK connection
        retry_client = SDKClient([self.cluster.master], bucket)
        for doc_key, crud_result in load_task.fail.items():
            retry_result = retry_client.crud(
                "create", doc_key, crud_result["value"],
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout=self.sdk_timeout)
            if retry_result["status"] is False:
                self.log_failure("Retry of doc_key %s failed: %s"
                                 % (doc_key, retry_result["error"]))
        retry_client.close()

    # Account for the newly added docs
    self.num_items += self.new_docs_to_add

    if not self.atomicity:
        # Validate doc count
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.verify_stats_all_buckets(self.num_items)
    self.validate_test_failure()
def test_timeout_with_successful_crud(self):
    """
    Run sub-doc CRUDs with durability while an error is briefly
    simulated on the target nodes, and expect every mutation to succeed
    once the error is reverted within the SDK timeout.

    1. Open shell connections and record vbucket_seqno on target nodes
    2. For each of insert/read/upsert/remove:
       start the sub-doc load, simulate self.simulate_error on all
       target nodes for 5 seconds, revert it and wait for the load
    3. Log a failure for any failed mutation (failed reads only warn)
       and for any node whose vbucket_seqno did not change
    4. Read back the 'mutated' field of all docs and validate it
    5. Verify the doc count

    Note: self.sdk_timeout values is considered as 'seconds'
    """
    shell_conn = {}
    cbstat_obj = {}
    error_sim = {}
    doc_gen = {}
    vb_info = {"init": {}, "afterCrud": {}, "withinTimeout": {}}

    target_nodes = self.getTargetNodes()
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(node)
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])

    # Key ranges: [0, 1/4) remove, [1/4, 1/2) read + upsert,
    # [1/2, num_items) insert
    half = self.num_items / 2
    quarter = self.num_items / 4
    doc_gen["insert"] = sub_doc_generator(
        self.key, half, self.num_items,
        key_size=self.key_size,
        doc_size=self.sub_doc_size)
    doc_gen["read"] = sub_doc_generator(
        self.key, quarter, half,
        key_size=self.key_size)
    doc_gen["upsert"] = sub_doc_generator_for_edit(
        self.key, quarter, half,
        key_size=self.key_size,
        template_index=2)
    doc_gen["remove"] = sub_doc_generator_for_edit(
        self.key, 0, quarter,
        key_size=self.key_size,
        template_index=2)

    for op_type in doc_gen:
        self.log.info("Performing '%s' with timeout=%s"
                      % (op_type, self.sdk_timeout))
        doc_load_task = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, doc_gen[op_type], op_type,
            self.maxttl,
            path_create=True,
            batch_size=500, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)

        # Induce the error on every target node
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

        self.sleep(5, "Wait before reverting the error condition")

        # Lift the error on every target node
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        self.task_manager.get_task_result(doc_load_task)

        failed_keys = doc_load_task.fail.keys()
        if len(failed_keys) != 0:
            if op_type == "read":
                self.log.warning("Read failed for %d keys: %s"
                                 % (len(doc_load_task.fail.keys()),
                                    doc_load_task.fail.keys()))
            else:
                self.log_failure("Failures during %s operation: %s"
                                 % (op_type, doc_load_task.fail))

        # Reads do not move the seqno, so only mutations are checked
        if op_type != "read":
            for node in target_nodes:
                vb_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
                if vb_info["init"][node.ip] \
                        == vb_info["afterCrud"][node.ip]:
                    self.log_failure(
                        "vbucket_seqno not updated. {0} == {1}".format(
                            vb_info["init"][node.ip],
                            vb_info["afterCrud"][node.ip]))

        # Failed docs (if any) are not retried here

    # Release the shell connections
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    # Read mutation field from all docs for validation
    gen_read = sub_doc_generator_for_edit(self.key, 0, self.num_items,
                                          key_size=self.key_size)
    gen_read.template = '{{ "mutated": "" }}'
    reader_task = self.task.async_load_gen_sub_docs(
        self.cluster, self.bucket, gen_read, "read",
        batch_size=50, process_concurrency=8,
        timeout_secs=self.sdk_timeout)
    self.task_manager.get_task_result(reader_task)

    len_failed_keys = len(reader_task.fail.keys())
    if len_failed_keys != 0:
        self.log_failure("Failures in read_task (%d): %s"
                         % (len_failed_keys, reader_task.fail.keys()))

    # Docs in the upper half of the key range are expected to read 1,
    # the rest 2
    for doc_key, crud_result in reader_task.success.items():
        in_upper_half = \
            int(doc_key.split('-')[1]) >= self.num_items / 2
        expected_val = 1 if in_upper_half else 2
        if crud_result["value"][0] != expected_val:
            self.log_failure("Value mismatch for %s: %s"
                             % (doc_key, crud_result))

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster,
                                              self.num_items)
    self.validate_test_failure()
def test_rollback_n_times(self):
    """
    Repeat the following self.num_rollbacks times on a randomly chosen
    KV node:
      1. Stop persistence on the node
      2. Load docs (plus an extra "update" load when
         self.rollback_with_multiple_mutation is set)
      3. Wait for ep_queue_size / vb_replica_queue_size to reach the
         expected values on every node
      4. Rewind the doc indexes of the load tasks
      5. Kill memcached on the node and wait for warmup to complete
      6. Verify the item count matches the pre-test expectation

    Finally compare the pre/post rollback vbucket stats and validate
    the per-collection doc counts.
    """
    second_load_task = None
    ep_queue_size_map = {}
    vb_replica_queue_size_map = {}
    expected_num_items = \
        self.bucket_util.get_expected_total_num_items(self.bucket)
    keys_to_verify = ["max_visible_seqno",
                      "num_items",
                      "high_completed_seqno",
                      "purge_seqno"]

    if self.nodes_init < 2 or self.num_replicas < 1:
        self.fail("Not enough nodes/replicas to test rollback")

    # Snapshot of vbucket stats to compare against after the rollbacks
    self.get_vb_details_cbstats_for_all_nodes("pre_rollback")

    target_node = choice(self.kv_nodes)
    node_shell = self.node_shells[target_node]
    error_sim = CouchbaseError(self.log, node_shell["shell"])
    cb_stats = node_shell["cbstat"]
    self.target_vbuckets = cb_stats.vbucket_list(self.bucket.name)

    for _ in xrange(self.num_rollbacks):
        self.total_rollback_items = 0
        error_sim.create(CouchbaseError.STOP_PERSISTENCE,
                         self.bucket.name)
        first_load_task = self.load_docs(self.doc_ops)
        if self.rollback_with_multiple_mutation:
            second_load_task = self.load_docs("update")

        # Only the node with persistence stopped is expected to have
        # items pending in its disk queue
        for node in self.cluster.nodes_in_cluster:
            pending_items = 0
            if node.ip == target_node.ip:
                pending_items = self.total_rollback_items
                if self.sync_write_enabled:
                    # Includes prepare+commit mutation
                    pending_items *= 2
            ep_queue_size_map[node] = pending_items
            vb_replica_queue_size_map[node] = 0

        self.log.info("Validating stats")
        for bucket in self.bucket_util.buckets:
            self.bucket_util._wait_for_stat(
                bucket, ep_queue_size_map,
                timeout=self.wait_timeout)
            self.bucket_util._wait_for_stat(
                bucket, vb_replica_queue_size_map,
                stat_name="vb_replica_queue_size",
                timeout=self.wait_timeout)

        if self.rollback_with_multiple_mutation:
            self.__rewind_doc_index(second_load_task)
        self.__rewind_doc_index(first_load_task)

        # Killing memcached drops the non-persisted mutations
        error_sim.create(CouchbaseError.KILL_MEMCACHED)
        warmed_up = self.bucket_util._wait_warmup_completed(
            [target_node], self.bucket, wait_time=300)
        self.assertTrue(warmed_up)
        self.bucket_util.verify_stats_all_buckets(expected_num_items,
                                                  timeout=120)

    self.get_vb_details_cbstats_for_all_nodes("post_rollback")
    self.validate_seq_no_post_rollback("pre_rollback", "post_rollback",
                                       keys_to_verify)

    self.bucket_util.validate_docs_per_collections_all_buckets()
    self.validate_test_failure()
def test_bulk_sync_write_in_progress(self):
    """
    Validate that a second operation on documents which have a
    sync-write in progress fails with AmbiguousTimeoutException
    (retry reason KV_SYNC_WRITE_IN_PROGRESS).

    1. Select target nodes and the vbuckets to load into
    2. Simulate self.simulate_error on the target nodes
    3. Start the durable doc load (doc_ops[0]) from a load spec
    4. Run doc_ops[1] over the same docs, with durability or without
       it when self.with_non_sync_writes is set, and validate the
       exceptions seen
    5. Revert the error, wait for the first load and validate results

    Test params read:
      doc_ops - ';' separated pair of operations, e.g. "create;update"
    """
    doc_ops = self.input.param("doc_ops").split(';')
    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    vb_info = dict()
    # Initialised once here; creating it inside the node loop would
    # discard the seqno info of all but the last node
    vb_info["init"] = dict()
    active_vbs = dict()
    replica_vbs = dict()
    sync_write_in_progress = \
        SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS

    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(node)
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
        # Fetch the active and replica vb numbers of the affected node
        active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="active")
        replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="replica")

    target_vbs = replica_vbs
    if self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
        # Union of the active vbuckets of all target nodes
        target_vbs = active_vbs
        target_vbuckets = list()
        for target_node in target_nodes:
            target_vbuckets += target_vbs[target_node.ip]
    else:
        # Replica vbuckets common to all the target nodes
        target_vbuckets = target_vbs[target_nodes[0].ip]
        for index in range(1, len(target_nodes)):
            target_vbuckets = list(
                set(target_vbuckets).intersection(
                    set(target_vbs[target_nodes[index].ip])))

    doc_load_spec = dict()
    doc_load_spec["doc_crud"] = dict()
    doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
    doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
    doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
    doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
    doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60
    doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
        = "test_collections"

    if doc_ops[0] == "create":
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == "update":
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == "replace":
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.REPLACE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == "delete":
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1

    try:
        # Induce error condition for testing
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.cluster.buckets,
                doc_load_spec,
                async_load=True)

        self.sleep(5, "Wait for doc ops to reach server")

        tem_durability = self.durability_level
        if self.with_non_sync_writes:
            tem_durability = "NONE"

        for bucket, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    for op_type in c_meta:
                        # This will support both sync-write and
                        # non-sync-writes
                        doc_loader_task_2 = self.task.async_load_gen_docs(
                            self.cluster, self.bucket,
                            c_meta[op_type]["doc_gen"], doc_ops[1], 0,
                            scope=s_name, collection=c_name,
                            sdk_client_pool=self.sdk_client_pool,
                            batch_size=self.crud_batch_size,
                            process_concurrency=1,
                            replicate_to=self.replicate_to,
                            persist_to=self.persist_to,
                            durability=tem_durability, timeout_secs=3,
                            print_ops_rate=False,
                            skip_read_on_error=True,
                            task_identifier="parallel_task2")
                        self.task.jython_task_manager.get_task_result(
                            doc_loader_task_2)

                        # Validation to verify the sync_in_write_errors
                        # in doc_loader_task_2
                        failed_docs = doc_loader_task_2.fail
                        if len(failed_docs.keys()) != 1:
                            self.log_failure(
                                "Exception not seen for docs: %s"
                                % failed_docs)

                        valid_exception = self.durability_helper\
                            .validate_durability_exception(
                                failed_docs,
                                SDKException.AmbiguousTimeoutException,
                                retry_reason=sync_write_in_progress)

                        if not valid_exception:
                            self.log_failure("Got invalid exception")
    finally:
        # Revert the introduced error condition and release the shell
        # connections, even if the validation above raised
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)
            shell_conn[node.ip].disconnect()

    # Wait for doc_loading to complete
    self.task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed")

    # Validate docs for update success or not
    if doc_ops[0] == "update":
        for bucket, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    for op_type in c_meta:
                        read_task = self.task.async_load_gen_docs(
                            self.cluster, self.bucket,
                            c_meta[op_type]["doc_gen"], "read",
                            batch_size=self.crud_batch_size,
                            process_concurrency=1,
                            timeout_secs=self.sdk_timeout)
                        self.task_manager.get_task_result(read_task)
                        for key, doc_info in read_task.success.items():
                            if doc_info["cas"] != 0 \
                                    and json.loads(str(doc_info["value"])
                                                   )["mutated"] != 1:
                                self.log_failure(
                                    "Update failed for key %s: %s"
                                    % (key, doc_info))

    # Validate doc_count per collection
    self.validate_test_failure()
    self.bucket_util.validate_docs_per_collections_all_buckets(
        self.cluster)
def test_durability_abort(self):
    """
    Run durable CRUDs against every KV node while an error is simulated
    on that node, then verify the cbstats vbucket-details counters.

    For each KV node: simulate self.simulate_error, run a
    create/update/delete load spec targeting the node's vbuckets
    ("active" for MAJORITY_AND_PERSIST_TO_ACTIVE, "replica" otherwise)
    with DurabilityAmbiguousException as retry exception, then revert
    the error. Afterwards validate each load task and bump the expected
    ops_* / sync_write_committed_count counters before verifying them.
    :return:
    """
    load_task = {}

    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()

    kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)
    for server in kv_nodes:
        ssh_shell = RemoteMachineShellConnection(server)
        cbstats = Cbstats(server)
        cb_err = CouchbaseError(self.log, ssh_shell)

        persist_to_active = self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE
        target_vb_type = "active" if persist_to_active else "replica"
        target_vbs = cbstats.vbucket_list(self.bucket.name,
                                          target_vb_type)

        crud_spec = {
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION: 2,
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION: 2,
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION: 2,
            MetaCrudParams.DocCrud.COMMON_DOC_KEY: "test_collections"
        }
        doc_load_spec = {
            "doc_crud": crud_spec,
            MetaCrudParams.TARGET_VBUCKETS: target_vbs,
            MetaCrudParams.DURABILITY_LEVEL: self.durability_level,
            MetaCrudParams.RETRY_EXCEPTIONS: [
                SDKException.DurabilityAmbiguousException
            ],
            MetaCrudParams.SDK_TIMEOUT: 2,
            MetaCrudParams.SKIP_READ_ON_ERROR: True,
            MetaCrudParams.SUPPRESS_ERROR_TABLE: True
        }

        # Load while the error is active on this node
        cb_err.create(self.simulate_error,
                      self.cluster.buckets[0].name)
        load_task[server] = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.cluster.buckets,
                doc_load_spec,
                batch_size=1,
                validate_task=False)
        cb_err.revert(self.simulate_error,
                      self.cluster.buckets[0].name)
        ssh_shell.disconnect()
    self.validate_test_failure()

    stats_mismatch = self.durability_helper.verify_vbucket_details_stats(
        self.bucket, kv_nodes,
        vbuckets=self.cluster.vbuckets,
        expected_val=self.verification_dict)
    if stats_mismatch:
        self.log_failure("Cbstat vbucket-details verification failed "
                         "after aborts")
    self.validate_test_failure()

    # Retry aborted keys with healthy cluster
    self.log.info("Performing CRUDs on healthy cluster")
    for server in kv_nodes:
        node_task = load_task[server]
        self.bucket_util.validate_doc_loading_results(node_task)
        if node_task.result is False:
            self.log_failure("Doc retry task failed on %s" % server.ip)

        # Update cbstat vb-details verification counters
        for bucket, s_dict in node_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_crud_data in c_dict["collections"].items():
                    for op_type, op_meta in c_crud_data.items():
                        total_mutation = op_meta["doc_gen"].end \
                            - op_meta["doc_gen"].start
                        if op_type not in DocLoading.Bucket.DOC_OPS:
                            continue
                        self.verification_dict["ops_%s" % op_type] \
                            += total_mutation
                        self.verification_dict[
                            "sync_write_committed_count"] \
                            += total_mutation

        stats_mismatch = \
            self.durability_helper.verify_vbucket_details_stats(
                self.bucket, self.cluster_util.get_kv_nodes(self.cluster),
                vbuckets=self.cluster.vbuckets,
                expected_val=self.verification_dict)
        if stats_mismatch:
            self.log_failure("Cbstat vbucket-details verification "
                             "failed after ops on server: %s" % server.ip)
    self.validate_test_failure()
def test_maxttl_with_timeout(self):
    """
    1. Stop Memcached on target_nodes based on replicas configured.
    2. Initiate doc_ops with higher sdk_timeout
    3. Sleep for time within the configured sdk_timeout
    4. Resume Memcached on target_nodes to make sure doc_ops go through
    5. Make sure maxTTL is calculated as soon as the active vbucket
       receives the mutation

    After the TTL-bound creates finish, the docs are read back twice
    (before and after a further wait) to validate expiry, and are
    finally recreated without TTL for the doc-count validation.

    :return:
    """
    def read_all_docs():
        # Read back every key of gen_create and wait for the task
        read_task = self.task.async_load_gen_docs(
            self.cluster, def_bucket, gen_create, "read",
            batch_size=10, process_concurrency=8,
            timeout_secs=self.sdk_timeout,
            sdk_client_pool=self.sdk_client_pool)
        self.task.jython_task_manager.get_task_result(read_task)
        return read_task

    shell_conn = dict()
    target_vbuckets = list()
    target_nodes = self.getTargetNodes()
    def_bucket = self.cluster.buckets[0]
    self.maxttl = self.input.param("doc_ttl", self.maxttl)

    # Open required SDK connections before error_simulation
    # (task is created with start_task=False and scheduled later)
    gen_create = doc_generator(self.key, 0, self.num_items,
                               doc_size=self.doc_size,
                               doc_type=self.doc_type,
                               target_vbucket=target_vbuckets,
                               vbuckets=self.cluster.vbuckets)
    doc_op_task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, gen_create, "create", self.maxttl,
        batch_size=10, process_concurrency=8,
        replicate_to=self.replicate_to, persist_to=self.persist_to,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout,
        compression=self.sdk_compression,
        start_task=False,
        sdk_client_pool=self.sdk_client_pool)

    # Open shell_conn and create Memcached error for testing MaxTTL
    self.log.info("1. Stopping Memcached on target_nodes")
    for node in target_nodes:
        node_shell = RemoteMachineShellConnection(node)
        shell_conn[node.ip] = node_shell
        # The same list object was handed to doc_generator above,
        # so it is extended in place
        target_vbuckets.extend(
            Cbstats(node_shell).vbucket_list(def_bucket.name, "replica"))
        CouchbaseError(self.log, node_shell).create(
            CouchbaseError.STOP_MEMCACHED, def_bucket.name)

    self.log.info("2. Initiating the doc_ops with doc TTL")
    self.task_manager.add_new_task(doc_op_task)

    self.sleep(self.maxttl, "3. Sleep for max_ttl time")

    # Revert Memcached error and close the shell_conn
    self.log.info("4. Resuming Memcached on target_nodes")
    for node in target_nodes:
        node_shell = shell_conn[node.ip]
        CouchbaseError(self.log, node_shell).revert(
            CouchbaseError.STOP_MEMCACHED, def_bucket.name)
        node_shell.disconnect()

    self.log.info("5. Waiting for doc_ops to complete")
    self.task.jython_task_manager.get_task_result(doc_op_task)

    self.bucket_util._expiry_pager(self.cluster, val=1)
    self.sleep(10, "6. Waiting for items to be purged")

    # Read all expired docs to validate all keys present
    doc_op_task = read_all_docs()

    self.log.info("7. Validating docs expired after TTL, "
                  "even before sync_write succeeds")
    readable_keys = doc_op_task.success.keys()
    if len(readable_keys) == self.num_items:
        self.fail("No docs deleted after MaxTTL time: %s"
                  % doc_op_task.success.keys())

    self.sleep(10, "8. Waiting for all docs to be purged")
    # Read all expired docs to validate all keys present
    doc_op_task = read_all_docs()

    self.log.info("9. Validating docs expired after TTL")
    if len(doc_op_task.fail.keys()) != self.num_items:
        self.fail("Items not deleted after MaxTTL time: %s"
                  % doc_op_task.success.keys())

    # Validate cas for purged items
    keys_with_cas = [key for key, result in doc_op_task.fail.items()
                     if result['cas'] != 0]
    if keys_with_cas:
        self.fail("Following failed keys has CAS: %s" % keys_with_cas)

    # Recreate all docs without any node issues
    doc_op_task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, gen_create, "create", 0,
        batch_size=10, process_concurrency=8,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout,
        compression=self.sdk_compression,
        sdk_client_pool=self.sdk_client_pool)
    self.task.jython_task_manager.get_task_result(doc_op_task)

    self.log.info("10. Validating docs exists after creation")
    failed_keys = doc_op_task.fail.keys()
    if len(failed_keys) != 0:
        self.fail("Doc recreate failed for keys: %s"
                  % doc_op_task.fail.keys())

    # Final doc_count validation
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster,
                                              self.num_items)
def test_sub_doc_with_process_crash(self):
    """
    Test to make sure durability will succeed even if a node goes down
    due to crash and has enough nodes to satisfy the durability

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations are succeeded

    Note: self.sdk_timeout values is considered as 'seconds'
    """
    if self.num_replicas < 2:
        self.assertTrue(False, msg="Required: num_replicas > 1")

    # Override num_of_nodes affected to 1
    self.num_nodes_affected = 1

    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = {"init": dict(), "afterCrud": dict()}
    vb_info_info = {"init": dict(), "afterCrud": dict()}
    # Collected per node below; not consumed by the load spec
    active_vbs_in_target_nodes = list()

    def_bucket = self.bucket_util.buckets[0]

    self.load_data_for_sub_doc_ops()

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(
        self.cluster, self.nodes_init, self.num_nodes_affected)

    self.log.info("Will simulate error condition on %s" % target_nodes)
    for node in target_nodes:
        ip = node.ip
        # Create shell_connections and snapshot the initial cbstats
        shell_conn[ip] = RemoteMachineShellConnection(node)
        stats = Cbstats(shell_conn[ip])
        cbstat_obj[ip] = stats
        active_vbs_in_target_nodes += stats.vbucket_list(
            def_bucket.name, "active")
        vb_info_info["init"][ip] = stats.vbucket_seqno(def_bucket.name)
        failover_info["init"][ip] = stats.failover_stats(def_bucket.name)

    # 10% reads plus insert/upsert/remove sub-doc mutations
    load_spec = {
        "doc_crud": {
            MetaCrudParams.DocCrud.READ_PERCENTAGE_PER_COLLECTION: 10,
        },
        "subdoc_crud": {
            MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION: 50,
            MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION: 25,
            MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION: 25,
        },
    }

    self.log.info("Perform 'create', 'update', 'delete' mutations")
    doc_loading_task = self.bucket_util.run_scenario_from_spec(
        self.task,
        self.cluster,
        self.bucket_util.buckets,
        load_spec,
        mutation_num=1,
        async_load=True)

    self.sleep(5, "Wait for doc loaders to start loading data")

    for node in target_nodes:
        # Perform specified action
        sim = CouchbaseError(self.log, shell_conn[node.ip])
        error_sim[node.ip] = sim
        sim.create(self.simulate_error, bucket_name=def_bucket.name)

    # Perform new scope/collection creation during doc ops in parallel
    self.__perform_collection_crud(mutation_num=2)

    # Wait for document_loader tasks to complete
    self.task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Sub_doc CRUDs failed with process crash")

    # Revert the induced error condition
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=def_bucket.name)

    # Fetch latest failover stats and validate the values are updated
    self.log.info("Validating failover and seqno cbstats")
    for node in target_nodes:
        ip = node.ip
        stats = cbstat_obj[ip]
        vb_info_info["afterCrud"][ip] = \
            stats.vbucket_seqno(def_bucket.name)
        failover_info["afterCrud"][ip] = \
            stats.failover_stats(def_bucket.name)

        # Failover validation: stats are asserted equal before/after
        failover_same = \
            failover_info["init"][ip] == failover_info["afterCrud"][ip]
        self.assertTrue(
            failover_same,
            msg="Failover stats not updated after error condition")

        # Seq_no validation (High level)
        seqno_changed = \
            vb_info_info["init"][ip] != vb_info_info["afterCrud"][ip]
        self.assertTrue(seqno_changed,
                        msg="vbucket seq_no not updated after CRUDs")

    # Disconnect the shell connection
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    self.validate_test_failure()
    # Doc count validation
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()
def validate_durability_with_crud(
        self, bucket, bucket_durability,
        verification_dict,
        doc_start_index=0,
        num_items_to_load=1,
        op_type="create",
        doc_durability=Bucket.DurabilityLevel.NONE):
    """
    Common API to validate durability settings of the bucket is set
    correctly or not.

    The level under test is whichever of bucket/doc durability appears
    later in self.d_level_order. With more than one node in
    self.vbs_in_node, the CRUD is first run with a simulated error (and
    is expected to fail with a durability-ambiguous exception), then
    retried without any error simulation.

    :param bucket: Bucket object to validate
    :param bucket_durability: Durability set for the bucket
                              Note: Need this because the string within
                                    the bucket object is different
                                    than this.
    :param verification_dict: To hold the values for req cbstats to
                              verify (updated in place)
    :param doc_start_index: Starting index to be considered for doc_load
    :param num_items_to_load: Number of items to be loaded to test.
                              Default is '1'
    :param op_type: Type of CRUD to perform. Default is 'create'
    :param doc_durability: Document durability level to use during CRUD.
                           Default level is 'None'
    :return:
    """
    level_order = self.d_level_order
    if level_order.index(bucket_durability) \
            < level_order.index(doc_durability):
        d_level_to_test = doc_durability
    else:
        d_level_to_test = bucket_durability

    # Nothing to test for durability_level=None (async_write case)
    if d_level_to_test == Bucket.DurabilityLevel.NONE:
        return

    self.log.info("Performing %s operation to validate d_level %s"
                  % (op_type, d_level_to_test))

    # Can't simulate error conditions for all durability_levels.
    # So only perform CRUD without error_sim
    if len(self.vbs_in_node.keys()) <= 1:
        doc_gen = doc_generator(self.key, doc_start_index,
                                doc_start_index+num_items_to_load)
    else:
        # Pick a random node to perform error sim and load
        random_node = choice(self.vbs_in_node.keys())
        node_info = self.vbs_in_node[random_node]

        target_vb_type, simulate_error = \
            self.durability_helper.get_vb_and_error_type(d_level_to_test)

        doc_gen = doc_generator(
            self.key, doc_start_index, num_items_to_load,
            target_vbucket=node_info[target_vb_type])
        error_sim = CouchbaseError(self.log, node_info["shell"])

        # Task is only created here; it is scheduled after the error
        # condition is in place
        failing_task = self.task.async_load_gen_docs(
            self.cluster, bucket, doc_gen, op_type,
            exp=self.maxttl,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=doc_durability,
            timeout_secs=32,
            batch_size=1,
            skip_read_on_error=True,
            suppress_error_table=True,
            start_task=False,
            sdk_client_pool=self.sdk_client_pool)

        self.sleep(5, "Wait for sdk_client to get warmed_up")
        # Simulate target error condition
        error_sim.create(simulate_error)
        self.sleep(5, "Wait for error_sim to take effect")

        # Start doc_loading task and wait for it to complete
        self.task_manager.add_new_task(failing_task)
        self.task_manager.get_task_result(failing_task)

        # Revert the induced error condition
        self.sleep(5, "Wait before reverting error_simulation")
        error_sim.revert(simulate_error)

        # Validate failed doc count and exception type from SDK
        if not failing_task.fail.keys():
            self.log_failure("Docs inserted without honoring the "
                             "bucket durability level")
        expected_err = SDKException.DurabilityAmbiguousException
        for key, result in failing_task.fail.items():
            if expected_err not in str(result["error"]):
                self.log_failure("Invalid exception for key %s "
                                 "during %s operation: %s"
                                 % (key, op_type, result["error"]))

        verification_dict["sync_write_aborted_count"] += num_items_to_load

    # Retry the same CRUDs without any error simulation in place
    doc_load_task = self.task.async_load_gen_docs(
        self.cluster, bucket, doc_gen, op_type,
        exp=self.maxttl,
        durability=doc_durability,
        timeout_secs=2, batch_size=1,
        sdk_client_pool=self.sdk_client_pool)
    self.task_manager.get_task_result(doc_load_task)
    if doc_load_task.fail:
        self.log_failure("Failures seen during CRUD without "
                         "error simulation. Keys failed: %s"
                         % doc_load_task.fail.keys())
        return

    verification_dict["ops_%s" % op_type] += num_items_to_load
    verification_dict["sync_write_committed_count"] += num_items_to_load
def test_update_durability_between_doc_op(self):
    """
    1. Create Bucket with durability level set.
    2. Bring down a node such that durability CRUD will wait
    3. Perform doc_op and update bucket_level_durability
    4. Revert scenario induced in step#2, such that doc_op will complete
    5. Make sure doc_ops in step#3 went through using prev. d-level
    """
    # Starting from max_durability levels because to iterate
    # all lower levels for doc_ops with level update
    supported_d_levels = deepcopy(self.d_level_order)
    if self.bucket_type == Bucket.Type.EPHEMERAL:
        supported_d_levels = supported_d_levels[0:2]

    supported_d_levels.reverse()
    # Finish by switching back to the level the bucket started with
    supported_d_levels += [supported_d_levels[0]]
    initial_d_level = supported_d_levels[0]

    create_desc = "Creating %s bucket with level '%s'" \
                  % (self.bucket_type, initial_d_level)

    self.log.info(create_desc)
    bucket_dict = self.get_bucket_dict(self.bucket_type, initial_d_level)
    # Object to support performing CRUDs and create Bucket
    bucket_obj = Bucket(bucket_dict)
    self.bucket_util.create_bucket(self.cluster, bucket_obj,
                                   wait_for_warmup=True)
    self.get_vbucket_type_mapping(bucket_obj.name)
    self.summary.add_step(create_desc)

    self.bucket_util.print_bucket_stats(self.cluster)

    # Loop to update all other durability levels
    prev_d_level = initial_d_level
    for bucket_durability in supported_d_levels[1:]:
        target_vb_type, simulate_error = \
            self.durability_helper.get_vb_and_error_type(
                bucket_durability)

        # Pick a random node to perform error sim and load
        random_node = choice(self.vbs_in_node.keys())
        node_info = self.vbs_in_node[random_node]
        error_sim = CouchbaseError(self.log, node_info["shell"])

        target_vbs = node_info[target_vb_type]
        doc_gen = doc_generator(self.key, 0, 1,
                                target_vbucket=target_vbs)

        # Created un-started; scheduled once the error is in place
        doc_load_task = self.task.async_load_gen_docs(
            self.cluster, bucket_obj, doc_gen, "update",
            durability=Bucket.DurabilityLevel.NONE,
            timeout_secs=60,
            start_task=False,
            sdk_client_pool=self.sdk_client_pool)

        # Simulate target error condition
        error_sim.create(simulate_error)
        self.sleep(5, "Wait before starting doc_op")
        self.task_manager.add_new_task(doc_load_task)

        new_d_level = BucketDurability[bucket_durability]
        self.sleep(5, "Wait before updating bucket level "
                      "durability=%s" % new_d_level)

        self.bucket_util.update_bucket_property(
            self.cluster.master,
            bucket_obj,
            bucket_durability=new_d_level)
        self.bucket_util.print_bucket_stats(self.cluster)

        buckets = self.bucket_util.get_all_buckets(self.cluster)
        if buckets[0].durability_level != new_d_level:
            self.log_failure("Failed to update bucket_d_level to %s"
                             % new_d_level)
        self.summary.add_step("Set bucket-durability=%s" % new_d_level)

        # With previous level NONE the doc_op must already be done;
        # otherwise it must still be pending under the error condition
        prev_was_none = prev_d_level == Bucket.DurabilityLevel.NONE
        if prev_was_none and not doc_load_task.completed:
            self.log_failure("Doc-op still pending for d_level 'NONE'")
        elif not prev_was_none and doc_load_task.completed:
            self.log_failure("Doc-op completed before reverting the "
                             "error condition: %s" % simulate_error)

        # Revert the induced error condition
        error_sim.revert(simulate_error)

        self.task_manager.get_task_result(doc_load_task)
        if doc_load_task.fail:
            self.log_failure("Doc_op failed")
        self.summary.add_step("Doc_op with previous d_level %s"
                              % prev_d_level)
        prev_d_level = bucket_durability

    # Delete the bucket on server
    self.bucket_util.delete_bucket(self.cluster, bucket_obj)
    self.summary.add_step("Delete %s bucket" % self.bucket_type)
def test_sync_write_in_progress(self):
    """
    Validate the SDK errors returned for a CRUD on a key whose SyncWrite
    is still in progress.

    1. Induce self.simulate_error on the selected target nodes
    2. Start an async doc-load of type doc_ops[0] on the target vbuckets
    3. For one key per collection/op_type run doc_ops[1] and validate the
       returned exception and retry reason, then try reading the key
    4. Revert the error condition and validate the doc-loading results

    Test param 'doc_ops' is a ';' separated pair (default "create;create")
    """
    doc_ops = self.input.param("doc_ops", "create;create").split(';')
    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    # Created once, outside the node loop, so that the seqno snapshot
    # of every target node is retained
    vb_info = {"init": dict()}
    active_vbs = dict()
    replica_vbs = dict()

    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()

    # Dedicated SDK client for performing doc_ops locally
    client = SDKClient([self.cluster.master], self.bucket)

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(node)
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
        # Fetch affected nodes' active and replica vb numbers
        active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="active")
        replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="replica")

    if self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
        # Union of the active vbuckets of all target nodes
        target_vbuckets = list()
        for target_node in target_nodes:
            target_vbuckets += active_vbs[target_node.ip]
    else:
        # Replica vbuckets common to all the target nodes
        target_vbuckets = replica_vbs[target_nodes[0].ip]
        for target_node in target_nodes[1:]:
            target_vbuckets = list(
                set(target_vbuckets).intersection(
                    set(replica_vbs[target_node.ip])))

    doc_load_spec = dict()
    doc_load_spec["doc_crud"] = dict()
    doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
        = "test_collections"
    doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
    doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
    doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
    doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
    doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60

    # Load 1% per collection of the first requested doc_op
    if doc_ops[0] == DocLoading.Bucket.DocOps.CREATE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == DocLoading.Bucket.DocOps.UPDATE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == DocLoading.Bucket.DocOps.REPLACE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.REPLACE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == DocLoading.Bucket.DocOps.DELETE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1

    # Induce error condition for testing
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)
        self.sleep(3, "Wait for error simulation to take effect")

    doc_loading_task = \
        self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.cluster.buckets,
            doc_load_spec,
            async_load=True)

    self.sleep(5, "Wait for doc ops to reach server")

    for bucket, s_dict in doc_loading_task.loader_spec.items():
        for s_name, c_dict in s_dict["scopes"].items():
            for c_name, c_meta in c_dict["collections"].items():
                client.select_collection(s_name, c_name)
                for op_type in c_meta:
                    key, value = c_meta[op_type]["doc_gen"].next()
                    if self.with_non_sync_writes:
                        fail = client.crud(doc_ops[1], key, value,
                                           exp=0, timeout=2,
                                           time_unit="seconds")
                    else:
                        fail = client.crud(
                            doc_ops[1], key, value, exp=0,
                            durability=self.durability_level,
                            timeout=2, time_unit="seconds")

                    expected_exception = \
                        SDKException.AmbiguousTimeoutException
                    retry_reason = \
                        SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
                    # Delete/replace of a doc whose create is still
                    # pending is expected to see 'not found' instead
                    if doc_ops[0] == DocLoading.Bucket.DocOps.CREATE \
                            and doc_ops[1] in \
                            [DocLoading.Bucket.DocOps.DELETE,
                             DocLoading.Bucket.DocOps.REPLACE]:
                        expected_exception = \
                            SDKException.DocumentNotFoundException
                        retry_reason = None

                    # Validate the returned error from the SDK
                    if expected_exception not in str(fail["error"]):
                        self.log_failure("Invalid exception for %s: %s"
                                         % (key, fail["error"]))
                    if retry_reason \
                            and retry_reason not in str(fail["error"]):
                        self.log_failure(
                            "Invalid retry reason for %s: %s"
                            % (key, fail["error"]))

                    # Try reading the value in SyncWrite state
                    fail = client.crud("read", key)
                    if doc_ops[0] == "create":
                        # Expected KeyNotFound in case of CREATE op
                        if fail["status"] is True:
                            self.log_failure(
                                "%s returned value during SyncWrite %s"
                                % (key, fail))
                    else:
                        # Expects prev val in case of other operations
                        if fail["status"] is False:
                            self.log_failure(
                                "Key %s read failed for prev value: %s"
                                % (key, fail))

    # Revert the introduced error condition and release the shell
    # connections, which are not used past this point
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)
        shell_conn[node.ip].disconnect()

    # Wait for doc_loading to complete
    self.task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed")

    # Close the SDK client created for this test
    client.close()
    self.validate_test_failure()
def test_scenario(bucket, doc_ops, with_sync_write_val=None):
    """
    Run one sync-write-in-progress scenario against `bucket`.

    A batch of doc_ops[0] mutations is started while memcached is
    stopped on the target nodes; doc_ops[1] is then attempted on the
    same kind of keys with both SDK retry strategies and the returned
    errors are validated.

    :param bucket: Bucket object to run the CRUDs on
    :param doc_ops: Sequence whose [0] / [1] entries are the first and
                    second doc operation types
    :param with_sync_write_val: Durability level for the second
                                operation. Falsy value runs it without
                                durability
    """
    # NOTE(review): this closure reads `self`, `target_nodes`,
    # `verification_dict` and `doc_op` from the enclosing scope.
    # `doc_op` (not the `doc_ops` parameter) is used for the
    # "load_initial_docs" check and the summary/verification keys -
    # confirm that this is intended.

    # Set crud_batch_size
    crud_batch_size = 4
    simulate_error = CouchbaseError.STOP_MEMCACHED

    # Fetch target_vbs for CRUDs: replica vbuckets common to all nodes
    node_vb_info = self.vbs_in_node
    target_vbuckets = node_vb_info[target_nodes[0]]["replica"]
    for other_node in target_nodes[1:]:
        target_vbuckets = list(
            set(target_vbuckets).intersection(
                set(node_vb_info[other_node]["replica"])))

    # Initialize doc_generators to use for testing
    self.log.info("Creating doc_generators")
    gen_create = doc_generator(
        self.key, self.num_items, crud_batch_size,
        vbuckets=self.cluster.vbuckets,
        target_vbucket=target_vbuckets)
    gen_update = doc_generator(
        self.key, 0, crud_batch_size,
        vbuckets=self.cluster.vbuckets,
        target_vbucket=target_vbuckets, mutate=1)
    gen_delete = doc_generator(
        self.key, 0, crud_batch_size,
        vbuckets=self.cluster.vbuckets,
        target_vbucket=target_vbuckets)
    self.log.info("Done creating doc_generators")

    # Generator to use for each supported operation type;
    # unknown types map to None
    gen_for_op = {
        "create": gen_create,
        "update": gen_update,
        "replace": gen_update,
        "touch": gen_update,
        "delete": gen_delete,
    }

    # Start CRUD operation based on the given 'doc_op' type
    first_op = doc_ops[0]
    gen_loader_1 = gen_for_op.get(first_op)
    if first_op == "create":
        self.num_items += crud_batch_size
    elif first_op == "delete":
        self.num_items -= crud_batch_size
    gen_loader_2 = gen_for_op.get(doc_ops[1])

    # Load required docs for doc_op_1 in case of type != create
    if doc_op[2] == "load_initial_docs":
        initial_load = self.task.async_load_gen_docs(
            self.cluster, bucket, gen_loader_1, "create", 0,
            batch_size=crud_batch_size, process_concurrency=1,
            timeout_secs=10,
            print_ops_rate=False,
            sdk_client_pool=self.sdk_client_pool)
        self.task_manager.get_task_result(initial_load)
        if initial_load.fail:
            self.log_failure("Failure while loading initial docs")
        self.summary.add_step("Create docs for %s" % doc_op[0])
        verification_dict["ops_create"] += crud_batch_size
        verification_dict["sync_write_committed_count"] \
            += crud_batch_size

    # Initialize tasks and store the task objects
    doc_loader_task = self.task.async_load_gen_docs(
        self.cluster, bucket, gen_loader_1, doc_ops[0], 0,
        batch_size=crud_batch_size, process_concurrency=8,
        timeout_secs=60,
        print_ops_rate=False,
        start_task=False,
        sdk_client_pool=self.sdk_client_pool)

    # SDK client for performing individual ops
    client = SDKClient([self.cluster.master], bucket)

    # Perform specified action
    for node in target_nodes:
        CouchbaseError(self.log, self.vbs_in_node[node]["shell"]) \
            .create(simulate_error, bucket_name=bucket.name)
    self.sleep(5, "Wait for error simulation to take effect")

    self.task_manager.add_new_task(doc_loader_task)
    self.sleep(5, "Wait for task_1 CRUDs to reach server")

    # Perform specified CRUD operation on sync_write docs
    fail_fast = SDKConstants.RetryStrategy.FAIL_FAST
    best_effort = SDKConstants.RetryStrategy.BEST_EFFORT
    tem_gen = deepcopy(gen_loader_2)
    while tem_gen.has_next():
        key, value = tem_gen.next()
        for retry_strategy in [fail_fast, best_effort]:
            crud_kwargs = dict(value=value, exp=0,
                               timeout=3, time_unit="seconds",
                               sdk_retry_strategy=retry_strategy)
            if with_sync_write_val:
                crud_kwargs["durability"] = with_sync_write_val
            fail = client.crud(doc_ops[1], key, **crud_kwargs)

            if retry_strategy == fail_fast:
                expected_exception = \
                    SDKException.RequestCanceledException
                retry_reason = \
                    SDKException.RetryReason \
                    .KV_SYNC_WRITE_IN_PROGRESS_NO_MORE_RETRIES
            else:
                expected_exception = \
                    SDKException.AmbiguousTimeoutException
                retry_reason = \
                    SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS

            # Validate the returned error from the SDK
            sdk_error = str(fail["error"])
            if expected_exception not in sdk_error:
                self.log_failure("Invalid exception for {0}: {1}"
                                 .format(key, fail["error"]))
            if retry_reason not in sdk_error:
                self.log_failure("Invalid retry reason for {0}: {1}"
                                 .format(key, fail["error"]))

            # Try reading the value in SyncWrite in-progress state
            fail = client.crud("read", key)
            if doc_ops[0] == "create":
                # Expected KeyNotFound in case of CREATE operation
                if fail["status"] is True:
                    self.log_failure(
                        "%s returned value during SyncWrite state: %s"
                        % (key, fail))
            elif fail["status"] is False:
                # Expects prev value in case of other operations
                self.log_failure(
                    "Key %s read failed for previous value: %s"
                    % (key, fail))

    # Revert the introduced error condition
    for node in target_nodes:
        CouchbaseError(self.log, self.vbs_in_node[node]["shell"]) \
            .revert(simulate_error, bucket_name=bucket.name)

    # Wait for doc_loader_task to complete
    self.task.jython_task_manager.get_task_result(doc_loader_task)

    verification_dict["ops_%s" % doc_op[0]] += crud_batch_size
    verification_dict["sync_write_committed_count"] \
        += crud_batch_size

    # Disconnect the client
    client.close()
def test_sub_doc_sync_write_in_progress(self):
    """
    Test to simulate sync_write_in_progress error and validate the
    behavior. This will validate failure in majority of nodes, where
    durability will surely fail for all CRUDs

    1. Select nodes to simulate the error which will affect the
       durability
    2. Enable the specified error_scenario on the selected nodes
    3. Perform individual CRUDs and verify sync_write_in_progress errors
    4. Validate the end results

    Test param 'doc_ops' selects the single doc / sub-doc operation
    loaded in the background (default "insert")
    """
    doc_ops = self.input.param("doc_ops", "insert")

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    # Created once, outside the node loop, so that the seqno snapshot
    # of every target node is retained
    vb_info = {"init": dict()}
    active_vbs = dict()
    replica_vbs = dict()
    doc_load_spec = dict()

    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(node)
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
        # Fetch affected nodes' active and replica vb numbers
        active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="active")
        replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="replica")

    if self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
        # Union of the active vbuckets of all target nodes
        target_vbuckets = list()
        for target_node in target_nodes:
            target_vbuckets += active_vbs[target_node.ip]
    else:
        # Replica vbuckets common to all the target nodes
        target_vbuckets = replica_vbs[target_nodes[0].ip]
        for target_node in target_nodes[1:]:
            target_vbuckets = list(
                set(target_vbuckets).intersection(
                    set(replica_vbs[target_node.ip])))

    amb_timeout = SDKException.AmbiguousTimeoutException
    kv_sync_write_in_progress = \
        SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
    doc_not_found_exception = SDKException.DocumentNotFoundException

    self.load_data_for_sub_doc_ops()

    doc_load_spec["doc_crud"] = dict()
    doc_load_spec["subdoc_crud"] = dict()
    doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
        = "test_collections"
    doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
    doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
    doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
    doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
    doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60

    # Acquire SDK client from the pool for performing doc_ops locally
    client = self.sdk_client_pool.get_client_for_bucket(self.bucket)

    # Override the crud_batch_size
    self.crud_batch_size = 5

    # Update mutation spec based on the required doc_operation.
    # Every branch is an equality check: a membership test against the
    # UPDATE constant would be a substring match on the string
    if doc_ops == DocLoading.Bucket.DocOps.CREATE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops == DocLoading.Bucket.DocOps.UPDATE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops == DocLoading.Bucket.DocOps.DELETE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops == DocLoading.Bucket.SubDocOps.INSERT:
        doc_load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION] = 1
    elif doc_ops == DocLoading.Bucket.SubDocOps.UPSERT:
        doc_load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION] = 1
    elif doc_ops == DocLoading.Bucket.SubDocOps.REMOVE:
        doc_load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION] = 1

    # This is to support both sync-write and non-sync-writes
    tem_durability = self.durability_level
    if self.with_non_sync_writes:
        tem_durability = Bucket.DurabilityLevel.NONE

    # Perform specified action
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)
    self.sleep(5, "Wait for error simulation to take effect")

    # Start the background doc loading (async)
    doc_loading_task = \
        self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.cluster.buckets,
            doc_load_spec,
            mutation_num=2,
            batch_size=1,
            async_load=True)

    self.sleep(10, "Wait for task_1 CRUDs to reach server")

    sub_doc_ops = [DocLoading.Bucket.SubDocOps.INSERT,
                   DocLoading.Bucket.SubDocOps.UPSERT,
                   DocLoading.Bucket.SubDocOps.REMOVE]
    for bucket, s_dict in doc_loading_task.loader_spec.items():
        for s_name, c_dict in s_dict["scopes"].items():
            for c_name, c_meta in c_dict["collections"].items():
                # NOTE(review): unlike test_sync_write_in_progress, no
                # client.select_collection(s_name, c_name) is done here
                # before the CRUDs - confirm this is intended
                for op_type in c_meta:
                    key, _ = c_meta[op_type]["doc_gen"].next()
                    expected_exception = amb_timeout
                    retry_reason = kv_sync_write_in_progress
                    if doc_ops == "create":
                        expected_exception = doc_not_found_exception
                        retry_reason = None

                    for sub_doc_op in sub_doc_ops:
                        val = ["my_mutation", "val"]
                        if sub_doc_op \
                                == DocLoading.Bucket.SubDocOps.REMOVE:
                            val = "mutated"
                        # result[0] is checked for success; result[1]
                        # holds the per-key failure details
                        result = client.crud(sub_doc_op, key, val,
                                             durability=tem_durability,
                                             timeout=2)

                        if result[0]:
                            self.log_failure("Doc crud succeeded for %s"
                                             % op_type)
                        elif expected_exception \
                                not in str(result[1][key]["error"]):
                            self.log_failure(
                                "Invalid exception for key %s: %s"
                                % (key, result[1][key]["error"]))
                        elif retry_reason is not None and \
                                retry_reason \
                                not in str(result[1][key]["error"]):
                            self.log_failure(
                                "Retry reason missing for key %s: %s"
                                % (key, result[1][key]["error"]))

    # Revert the introduced error condition and release the shell
    # connections, which are not used past this point
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)
        shell_conn[node.ip].disconnect()

    # Wait for doc_loader_task_1 to complete
    self.task.jython_task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed")

    # Validate docs for update success or not
    if doc_ops == DocLoading.Bucket.DocOps.UPDATE:
        for bucket, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    for op_type in c_meta:
                        c_meta[op_type]["doc_gen"].reset()
                        read_task = self.task.async_load_gen_docs(
                            self.cluster, self.bucket,
                            c_meta[op_type]["doc_gen"],
                            DocLoading.Bucket.DocOps.READ,
                            batch_size=self.crud_batch_size,
                            process_concurrency=1,
                            timeout_secs=self.sdk_timeout)
                        self.task_manager.get_task_result(read_task)
                        # The background load ran with mutation_num=2
                        for key, doc_info in read_task.success.items():
                            if doc_info["cas"] != 0 and \
                                    json.loads(str(doc_info["value"])
                                               )["mutated"] != 2:
                                self.log_failure(
                                    "Update failed for key %s: %s"
                                    % (key, doc_info))

    # Release the acquired SDK client
    self.sdk_client_pool.release_client(client)

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.validate_docs_per_collections_all_buckets(
        self.cluster)
    self.validate_test_failure()
def test_timeout_with_successful_crud(self):
    """
    Validate that durable CRUDs succeed when the simulated server-side
    error is reverted before the result of the load is collected.

    For every operation type (doc ops, or sub-doc ops when
    self.subdoc_test is set):
    1. Start an async spec-driven load with 5% of that operation
    2. Simulate self.simulate_error on all the target nodes
    3. Sleep for 10 seconds and revert the error condition
    4. Wait for the load and fail the test if the load has failed
    5. Log a failure if the vbucket_seqno stats of a target node still
       match the values captured before the first load

    Finally the doc count of all the collections is validated.

    Note: self.sdk_timeout value is considered as 'seconds'
    """
    doc_params = MetaCrudParams.DocCrud
    subdoc_params = MetaCrudParams.SubDocCrud

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    init_seqno = dict()

    # Base spec with every CRUD percentage zeroed. Each iteration below
    # enables exactly one operation on a deep copy of this spec
    doc_load_spec = dict()
    doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = self.sdk_timeout
    doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
    doc_load_spec["doc_crud"] = dict()
    doc_load_spec["subdoc_crud"] = dict()
    doc_load_spec["doc_crud"][doc_params.COMMON_DOC_KEY] = \
        "test_collections"
    for pct_key in [doc_params.CREATE_PERCENTAGE_PER_COLLECTION,
                    doc_params.UPDATE_PERCENTAGE_PER_COLLECTION,
                    doc_params.DELETE_PERCENTAGE_PER_COLLECTION]:
        doc_load_spec["doc_crud"][pct_key] = 0
    for pct_key in [subdoc_params.INSERT_PER_COLLECTION,
                    subdoc_params.UPSERT_PER_COLLECTION,
                    subdoc_params.REMOVE_PER_COLLECTION]:
        doc_load_spec["subdoc_crud"][pct_key] = 0

    ops_to_perform = ["create", "update", "read", "replace", "delete"]
    if self.subdoc_test:
        ops_to_perform = ["insert", "upsert", "remove"]

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    try:
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            init_seqno[node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])

        for op_type in ops_to_perform:
            self.log.info("Performing '%s' with timeout=%s"
                          % (op_type, self.sdk_timeout))
            curr_spec = deepcopy(doc_load_spec)
            # "replace" matches none of the branches below, so its spec
            # keeps every percentage at zero
            if op_type == "create":
                curr_spec["doc_crud"][
                    doc_params.CREATE_PERCENTAGE_PER_COLLECTION] = 5
            elif op_type == "update":
                curr_spec["doc_crud"][
                    doc_params.UPDATE_PERCENTAGE_PER_COLLECTION] = 5
            elif op_type == "delete":
                curr_spec["doc_crud"][
                    doc_params.DELETE_PERCENTAGE_PER_COLLECTION] = 5
            elif op_type == "read":
                curr_spec["doc_crud"][
                    doc_params.READ_PERCENTAGE_PER_COLLECTION] = 5
                curr_spec[MetaCrudParams.RETRY_EXCEPTIONS] = [
                    SDKException.TimeoutException]
            elif op_type == "insert":
                curr_spec["subdoc_crud"][
                    subdoc_params.INSERT_PER_COLLECTION] = 5
            elif op_type == "upsert":
                curr_spec["subdoc_crud"][
                    subdoc_params.UPSERT_PER_COLLECTION] = 5
            elif op_type == "remove":
                curr_spec["subdoc_crud"][
                    subdoc_params.REMOVE_PER_COLLECTION] = 5

            doc_loading_task = \
                self.bucket_util.run_scenario_from_spec(
                    self.task,
                    self.cluster,
                    self.bucket_util.buckets,
                    curr_spec,
                    mutation_num=1,
                    async_load=True,
                    validate_task=False)

            # Perform specified action
            for node in target_nodes:
                error_sim[node.ip].create(self.simulate_error,
                                          bucket_name=self.bucket.name)

            self.sleep(10, "Wait before reverting the error condition")

            # Revert the specified error scenario
            for node in target_nodes:
                error_sim[node.ip].revert(self.simulate_error,
                                          bucket_name=self.bucket.name)

            self.task_manager.get_task_result(doc_loading_task)
            self.bucket_util.validate_doc_loading_results(doc_loading_task)
            if doc_loading_task.result is False:
                self.fail("Doc_loading for '%s' failed" % op_type)

            # Fetch latest stats and validate the values are updated
            for node in target_nodes:
                curr_stat = cbstat_obj[node.ip].vbucket_seqno(
                    self.bucket.name)
                if init_seqno[node.ip] == curr_stat:
                    self.log_failure("vbucket_seqno not updated. %s == %s"
                                     % (init_seqno[node.ip], curr_stat))
    finally:
        # Disconnect the shell connections even when self.fail() or any
        # other exception aborted the steps above
        for conn in shell_conn.values():
            conn.disconnect()

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()
    self.validate_test_failure()
def test_timeout_with_crud_failures(self):
    """
    Validate sub-doc CRUD failures while an error simulated on the
    target nodes prevents the durability requirement from being met.

    1. Capture active/replica vbucket lists and vbucket_seqno stats of
       the target nodes
    2. Create (without starting) sub-doc tasks on keys mapped to the
       targeted vbuckets
    3. Simulate self.simulate_error, run the tasks and validate that
       reads do not fail and that every other failure carries a
       DurabilityAmbiguousException
    4. Revert the error, check the elapsed time against
       self.sdk_timeout and validate the seqno stats (up to 3 attempts
       per node)
    5. If nodes_init == num_replicas + 1, read all the docs and report
       the keys with an unexpected 'mutated' value
    6. Re-run every operation, log a failure for keys that failed only
       in the retry and verify the bucket stats

    Note: self.sdk_timeout value is considered as 'seconds'
    """
    active = Bucket.vBucket.ACTIVE
    replica = Bucket.vBucket.REPLICA

    # Local method to validate vb_seqno
    def validate_vb_seqno_stats(target_node):
        """
        Refresh the 'post_timeout' seqno stats of target_node and
        compare them, vbucket by vbucket, with its 'init' stats.

        :param target_node: node whose vbucket_seqno stats are compared
        :return retry_validation: Boolean denoting to retry validation
        """
        retry_validation = False
        node_ip = target_node.ip
        init_stat = vb_info["init"][node_ip]
        post_stat = cbstat_obj[node_ip].vbucket_seqno(self.bucket.name)
        vb_info["post_timeout"][node_ip] = post_stat
        for vb_num in range(self.cluster.vbuckets):
            vb_id = str(vb_num)
            if vb_id not in affected_vbs:
                # No mutation was attempted on this vbucket
                if vb_id in init_stat \
                        and init_stat[vb_id] != post_stat[vb_id]:
                    self.log_failure(
                        "Unaffected vb-%s stat updated: %s != %s"
                        % (vb_id, init_stat[vb_id], post_stat[vb_id]))
            elif vb_num in target_nodes_vbuckets[active]:
                if vb_id in init_stat \
                        and init_stat[vb_id] != post_stat[vb_id]:
                    self.log.warning(
                        err_msg
                        % (node_ip, active, vb_id,
                           init_stat[vb_id], post_stat[vb_id]))
            elif vb_num in target_nodes_vbuckets[replica]:
                # An unchanged stat here requests a retry from the caller
                if vb_id in init_stat \
                        and init_stat[vb_id] == post_stat[vb_id]:
                    retry_validation = True
                    self.log.warning(
                        err_msg
                        % (node_ip, replica, vb_id,
                           init_stat[vb_id], post_stat[vb_id]))
        return retry_validation

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    target_nodes_vbuckets = dict()
    vb_info = dict()
    tasks = dict()
    doc_gen = dict()
    affected_vbs = set()
    err_msg = "%s - mismatch in %s vb-%s seq_no: %s != %s"

    target_nodes_vbuckets[active] = list()
    target_nodes_vbuckets[replica] = list()
    vb_info["init"] = dict()
    vb_info["post_timeout"] = dict()
    vb_info["afterCrud"] = dict()

    # Override crud_batch_size to minimum value for testing
    self.crud_batch_size = 5

    target_nodes = self.getTargetNodes()
    try:
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(node)
            target_nodes_vbuckets[active] += \
                cbstat_obj[node.ip].vbucket_list(
                    self.bucket.name, vbucket_type=active)
            target_nodes_vbuckets[replica] += \
                cbstat_obj[node.ip].vbucket_list(
                    self.bucket.name, vbucket_type=replica)
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])

        curr_time = int(time.time())
        expected_timeout = curr_time + self.sdk_timeout

        target_vbs = target_nodes_vbuckets[active]
        if self.nodes_init == 1:
            pass
        elif self.durability_level \
                == Bucket.DurabilityLevel.PERSIST_TO_MAJORITY:
            target_vbs = target_nodes_vbuckets[replica]

        # Create required doc_generators.
        # '//' keeps the start indexes integral on every interpreter
        doc_gen["insert"] = sub_doc_generator(
            self.key, self.num_items // 2, self.crud_batch_size,
            target_vbucket=target_vbs, key_size=self.key_size)
        doc_gen["remove"] = sub_doc_generator_for_edit(
            self.key, 0, self.crud_batch_size,
            key_size=self.key_size, template_index=2,
            target_vbucket=target_vbs)
        doc_gen["read"] = sub_doc_generator_for_edit(
            self.key, 0, self.crud_batch_size,
            key_size=self.key_size, template_index=0,
            target_vbucket=target_vbs)
        doc_gen["upsert"] = sub_doc_generator_for_edit(
            self.key, self.num_items // 4, self.crud_batch_size,
            key_size=self.key_size, template_index=1,
            target_vbucket=target_vbs)

        # Tasks are only created here, they are started once the error
        # condition is in place
        for op_type in doc_gen:
            tasks[op_type] = self.task.async_load_gen_sub_docs(
                self.cluster, self.bucket, doc_gen[op_type], op_type, 0,
                path_create=True,
                batch_size=1, process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                start_task=False)

        # Perform specified action
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

        for op_type in doc_gen:
            self.task_manager.add_new_task(tasks[op_type])

        # Wait for document_loader tasks to complete
        for op_type in doc_gen:
            self.task.jython_task_manager.get_task_result(tasks[op_type])

            # Validate task failures
            if op_type == DocLoading.Bucket.DocOps.READ:
                # Validation for read task
                if len(tasks[op_type].fail.keys()) != 0:
                    self.log_failure("Read failed for few docs: %s"
                                     % tasks[op_type].fail.keys())
            else:
                # Validation of CRUDs - Update / Create / Delete
                for doc_id, crud_result in tasks[op_type].fail.items():
                    vb_num = self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.cluster.vbuckets)
                    if SDKException.DurabilityAmbiguousException \
                            not in str(crud_result["error"]):
                        self.log_failure(
                            "Invalid exception for doc %s, vb %s: %s"
                            % (doc_id, vb_num, crud_result))

        # Revert the specified error scenario
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # Check whether the timeout triggered properly
        if int(time.time()) < expected_timeout:
            self.log_failure("Timed-out before expected time")

        # Collect the vbuckets of every key targeted by a mutation
        for op_type in doc_gen:
            if op_type == DocLoading.Bucket.DocOps.READ:
                continue
            while doc_gen[op_type].has_next():
                doc_id, _ = doc_gen[op_type].next()
                affected_vbs.add(
                    str(self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.cluster.vbuckets)))

        # Fetch latest stats and validate the seq_nos are not updated
        max_retry = 3
        for node in target_nodes:
            for attempt in range(1, max_retry + 1):
                self.log.info("Trying to validate vbseq_no stats: %d"
                              % attempt)
                if not validate_vb_seqno_stats(node):
                    break
                self.sleep(5, "Sleep for vbseq_no stats to update")
            else:
                # Reached only when none of the attempts hit `break`
                self.log_failure(
                    "validate_vb_seqno_stats verification failed")

        self.validate_test_failure()

        # If replicas+1 == total nodes, verify no mutation should have
        # succeeded with durability
        if self.nodes_init == self.num_replicas + 1:
            read_gen = doc_generator(self.key, 0, self.num_items)
            read_task = self.task.async_load_gen_docs(
                self.cluster, self.bucket, read_gen,
                DocLoading.Bucket.DocOps.READ, 0,
                batch_size=500, process_concurrency=1,
                timeout_secs=self.sdk_timeout)
            self.task_manager.get_task_result(read_task)

            failed_keys = TableView(self.log.error)
            failed_keys.set_headers(["Key", "Error"])
            half_of_num_items = self.num_items // 2
            for doc_key, doc_info in read_task.success.items():
                key_index = int(doc_key.split("-")[1])
                expected_mutated_val = 0
                if key_index < half_of_num_items:
                    expected_mutated_val = 1
                mutated = json.loads(str(doc_info["value"]))["mutated"]
                if mutated != expected_mutated_val:
                    failed_keys.add_row([doc_key, doc_info])

            failed_keys.display("Affected mutations:")
            self.log.error(read_task.fail)

        # Doc error validation
        for op_type in doc_gen:
            task = tasks[op_type]

            retry_task = self.task.async_load_gen_sub_docs(
                self.cluster, self.bucket, doc_gen[op_type], op_type, 0,
                path_create=True,
                batch_size=1, process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task_manager.get_task_result(retry_task)
            retry_failures = set(retry_task.fail.keys())
            initial_failures = set(task.fail.keys())

            # Only keys that failed in the retry alone are a problem
            if len(retry_failures.difference(initial_failures)) != 0:
                self.log_failure(
                    "Docs failed during retry task for %s: %s"
                    % (op_type, retry_task.fail))

        # Verify doc count after expected CRUD failure
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)
        self.bucket_util.verify_stats_all_buckets(self.cluster,
                                                  self.num_items)

        # Fetch latest stats and validate the values are updated
        for node in target_nodes:
            vb_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
                self.log_failure("vBucket seq_no stats not updated")
    finally:
        # Disconnect the shell connections even when one of the
        # validations above raised
        for conn in shell_conn.values():
            conn.disconnect()

    self.validate_test_failure()
def test_timeout_with_crud_failures(self):
    """
    Validate CRUD failures on a collection while an error simulated on
    the target nodes prevents the durability requirement from being met.

    1. Create a scope/collection and load self.num_items docs into it
       (plus sub-doc paths when self.subdoc_test is set)
    2. Simulate self.simulate_error on all the target nodes
    3. Run each doc (or sub-doc) operation on keys mapped to the
       targeted vbuckets. Reads must not fail, every other failure
       must carry a DurabilityAmbiguousException
    4. Revert the error and compare the vbucket_seqno stats with the
       values captured before the error was simulated
    5. Retry the failed doc operations and validate the doc counts

    Note: self.sdk_timeout is overridden to 3 by this test and is
          considered as 'seconds'
    """

    # Local methods to validate vb_seqno

    def compare_vb_stat(stat_1, stat_2, vb, comparison="!="):
        """
        Compare the seqno stats of one vbucket between two snapshots.

        :param stat_1: vbucket_seqno stats taken first
        :param stat_2: vbucket_seqno stats taken later
        :param vb: vbucket number (as present in the stat keys)
        :param comparison: "!=" flags a stat that differs between the
                           snapshots, any other value flags a stat
                           that is equal in both of them
        :return result: False if any stat got flagged, True otherwise
                        (also when vb is not present in stat_1)
        """
        if vb not in stat_1:
            return True

        # UUID is validated once per vbucket, not once per stat key
        if stat_1[vb]["uuid"] != stat_2[vb]["uuid"]:
            self.log_failure(
                "Mismatch in vb-%s UUID. %s != %s"
                % (vb, stat_1[vb]["uuid"], stat_2[vb]["uuid"]))

        result = True
        for key in ["high_seqno", "high_completed_seqno"]:
            if comparison == "!=":
                if stat_1[vb][key] != stat_2[vb][key]:
                    result = False
                    self.log.warning(
                        "Mismatch in vb-%s stat %s. %s != %s"
                        % (vb, key, stat_1[vb][key], stat_2[vb][key]))
            elif stat_1[vb][key] == stat_2[vb][key]:
                result = False
                self.log.warning(
                    "Stat not updated for vb-%s stat %s. "
                    "%s == %s"
                    % (vb, key, stat_1[vb][key], stat_2[vb][key]))
        return result

    def validate_vb_seqno_stats(target_node):
        """
        Refresh the 'post_timeout' seqno stats of target_node and
        compare them, vbucket by vbucket, with its 'init' stats.

        :param target_node: node whose vbucket_seqno stats are compared
        :return retry_validation: Boolean denoting to retry validation
        """
        retry_validation = False
        node_ip = target_node.ip
        init_stat = vb_info["init"][node_ip]
        post_stat = cbstat_obj[node_ip].vbucket_seqno(self.bucket.name)
        vb_info["post_timeout"][node_ip] = post_stat
        for vb_num in range(self.cluster_util.vbuckets):
            tem_vb_num = str(vb_num)
            if tem_vb_num not in affected_vbs:
                # No mutation was attempted on this vbucket
                if compare_vb_stat(init_stat, post_stat,
                                   tem_vb_num) is False:
                    self.log_failure("Unaffected vb-%s stat" % tem_vb_num)
            elif vb_num in target_nodes_vbuckets["active"]:
                if compare_vb_stat(init_stat, post_stat,
                                   tem_vb_num) is False:
                    self.log.warning("%s - mismatch in %s vb-%s seq_no"
                                     % (node_ip, "active", tem_vb_num))
            elif vb_num in target_nodes_vbuckets["replica"]:
                # An unchanged stat here requests a retry from the caller
                if compare_vb_stat(init_stat, post_stat,
                                   tem_vb_num, comparison="==") is False:
                    retry_validation = True
                    self.log.warning("%s - mismatch in %s vb-%s seq_no"
                                     % (node_ip, "replica", tem_vb_num))
        return retry_validation

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    target_nodes_vbuckets = dict()
    vb_info = dict()
    tasks = dict()
    doc_gen = dict()
    affected_vbs = set()

    target_nodes_vbuckets["active"] = []
    target_nodes_vbuckets["replica"] = []
    vb_info["init"] = dict()
    vb_info["post_timeout"] = dict()
    vb_info["afterCrud"] = dict()

    # Override crud_batch_size to minimum value for testing
    self.crud_batch_size = 5
    self.key = "test_collections"
    self.sdk_timeout = 3

    # Select target vbucket type to load_docs
    target_vb_type = "replica"
    if self.simulate_error == CouchbaseError.STOP_PERSISTENCE \
            and self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
        target_vb_type = "active"

    # Create required scope/collection for successful CRUD operation
    if self.scope_name != CbServer.default_scope:
        self.scope_name = self.bucket_util.get_random_name()
    self.collection_name = self.bucket_util.get_random_name()
    self.log.info("Creating scope::collection %s::%s"
                  % (self.scope_name, self.collection_name))
    self.create_scope_collection()

    # Load docs into created collection
    self.log.info("Loading data into created collection")
    load_gen = doc_generator(self.key, 0, self.num_items)
    task = self.task.async_load_gen_docs(
        self.cluster, self.bucket, load_gen, "create", 0,
        scope=self.scope_name,
        collection=self.collection_name,
        sdk_client_pool=self.sdk_client_pool,
        batch_size=200, process_concurrency=8,
        timeout_secs=60)
    self.task_manager.get_task_result(task)
    if self.subdoc_test:
        # '//' keeps the end index integral on every interpreter
        load_gen = sub_doc_generator(self.key, 0, self.num_items // 2)
        task = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, load_gen,
            Bucket_Op.SubDocOps.INSERT,
            timeout_secs=self.sdk_timeout,
            compression=self.sdk_compression,
            path_create=True,
            batch_size=100,
            process_concurrency=8,
            durability=self.durability_level,
            scope=self.scope_name, collection=self.collection_name,
            sdk_client_pool=self.sdk_client_pool)
        self.task_manager.get_task_result(task)

    self.bucket.scopes[self.scope_name].collections[
        self.collection_name].num_items = self.num_items

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    try:
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            target_nodes_vbuckets["active"] += \
                cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                                 vbucket_type="active")
            target_nodes_vbuckets["replica"] += \
                cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                                 vbucket_type="replica")
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])

        curr_time = int(time.time())
        expected_timeout = curr_time + self.sdk_timeout

        # Target only the vbuckets that are exclusively of the selected
        # type across the target nodes
        if target_vb_type == "active":
            other_vb_type = "replica"
        else:
            other_vb_type = "active"
        target_vbs = list(
            set(target_nodes_vbuckets[target_vb_type]).difference(
                set(target_nodes_vbuckets[other_vb_type])))

        # Create required doc_generators
        doc_gen["create"] = doc_generator(self.key, self.num_items,
                                          self.crud_batch_size,
                                          target_vbucket=target_vbs)
        doc_gen["delete"] = doc_generator(self.key, 0,
                                          self.crud_batch_size,
                                          target_vbucket=target_vbs)
        doc_gen["read"] = doc_generator(self.key,
                                        self.num_items // 3,
                                        self.crud_batch_size,
                                        target_vbucket=target_vbs)
        doc_gen["update"] = doc_generator(self.key,
                                          self.num_items // 2,
                                          self.crud_batch_size,
                                          target_vbucket=target_vbs)

        # Create required subdoc generators
        doc_gen["insert"] = sub_doc_generator(self.key,
                                              self.num_items // 2,
                                              self.crud_batch_size,
                                              target_vbucket=target_vbs)
        doc_gen["upsert"] = sub_doc_generator_for_edit(
            self.key, 0, self.crud_batch_size,
            template_index=1,
            target_vbucket=target_vbs)
        doc_gen["remove"] = sub_doc_generator(self.key, 0,
                                              self.crud_batch_size,
                                              target_vbucket=target_vbs)

        # Perform specified action
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)
        self.sleep(5, "Wait for error_simulation to take effect")

        ops_to_perform = [Bucket_Op.DocOps.CREATE,
                          Bucket_Op.DocOps.UPDATE,
                          Bucket_Op.DocOps.READ,
                          Bucket_Op.DocOps.DELETE]
        if self.subdoc_test:
            ops_to_perform = [Bucket_Op.SubDocOps.INSERT,
                              Bucket_Op.SubDocOps.UPSERT,
                              Bucket_Op.SubDocOps.REMOVE]

        for op_type in ops_to_perform:
            self.log.info("Starting doc op %s" % op_type)
            if op_type in Bucket_Op.DOC_OPS:
                tasks[op_type] = self.task.async_load_gen_docs(
                    self.cluster, self.bucket, doc_gen[op_type],
                    op_type, 0,
                    scope=self.scope_name,
                    collection=self.collection_name,
                    sdk_client_pool=self.sdk_client_pool,
                    batch_size=1, process_concurrency=8,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    suppress_error_table=True,
                    print_ops_rate=False,
                    skip_read_on_error=True)
            else:
                tasks[op_type] = self.task.async_load_gen_sub_docs(
                    self.cluster, self.bucket, doc_gen[op_type],
                    op_type, 0,
                    scope=self.scope_name,
                    collection=self.collection_name,
                    sdk_client_pool=self.sdk_client_pool,
                    path_create=True,
                    batch_size=1, process_concurrency=8,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    print_ops_rate=False)

            self.task.jython_task_manager.get_task_result(tasks[op_type])

            # Validate task failures
            if op_type == Bucket_Op.DocOps.READ:
                # Validation for read task
                if len(tasks[op_type].fail.keys()) != 0:
                    self.log_failure("Read failed for few docs: %s"
                                     % tasks[op_type].fail.keys())
            else:
                # Validation of CRUDs - Update / Create / Delete
                for doc_id, crud_result in tasks[op_type].fail.items():
                    vb_num = self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.cluster_util.vbuckets)
                    if SDKException.DurabilityAmbiguousException \
                            not in str(crud_result["error"]):
                        self.log_failure(
                            "Invalid exception for doc %s, vb %s: %s"
                            % (doc_id, vb_num, crud_result))

        # Revert the specified error scenario
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # Check whether the timeout triggered properly
        if int(time.time()) < expected_timeout:
            self.log_failure("Timed-out before expected time")

        # Collect the vbuckets of every key targeted by a mutation
        for op_type in ops_to_perform:
            if op_type == Bucket_Op.DocOps.READ:
                continue
            while doc_gen[op_type].has_next():
                doc_id, _ = doc_gen[op_type].next()
                affected_vbs.add(
                    str(self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.cluster_util.vbuckets)))

        # Fetch latest stats and validate the seq_nos are not updated
        max_retry = 3
        for node in target_nodes:
            for attempt in range(1, max_retry + 1):
                self.log.info("Trying to validate vbseq_no stats: %d"
                              % attempt)
                if not validate_vb_seqno_stats(node):
                    break
                self.sleep(5, "Sleep for vbseq_no stats to update")
            else:
                # Reached only when none of the attempts hit `break`
                self.log_failure(
                    "validate_vb_seqno_stats verification failed")

        self.validate_test_failure()

        # Get SDK Client from client_pool
        sdk_client = self.sdk_client_pool.get_client_for_bucket(
            self.bucket,
            self.scope_name,
            self.collection_name)
        try:
            # Doc error validation
            for op_type in ops_to_perform:
                task = tasks[op_type]

                if self.nodes_init == 1 \
                        and op_type != Bucket_Op.DocOps.READ:
                    expected_failures = \
                        doc_gen[op_type].end - doc_gen[op_type].start
                    if len(task.fail.keys()) != expected_failures:
                        self.log_failure(
                            "Failed keys %d are less than expected %d"
                            % (len(task.fail.keys()), expected_failures))

                # Create table objects for display
                table_view = TableView(self.log.error)
                ambiguous_table_view = TableView(self.log.info)
                table_view.set_headers(["Key", "vBucket", "Exception"])
                ambiguous_table_view.set_headers(["Key", "vBucket"])

                # Iterate failed keys for validation
                for doc_key, doc_info in task.fail.items():
                    vb_for_key = \
                        self.bucket_util.get_vbucket_num_for_key(doc_key)

                    if SDKException.DurabilityAmbiguousException \
                            not in str(doc_info["error"]):
                        table_view.add_row([doc_key, vb_for_key,
                                            doc_info["error"]])

                    ambiguous_table_view.add_row([doc_key,
                                                  str(vb_for_key)])
                    if op_type not in Bucket_Op.SUB_DOC_OPS:
                        retry_success = self.durability_helper \
                            .retry_for_ambiguous_exception(
                                sdk_client, op_type, doc_key, doc_info)
                        if not retry_success:
                            self.log_failure("%s failed in retry for %s"
                                             % (op_type, doc_key))

                # Display the tables (if any errors)
                table_view.display("Unexpected exception during %s"
                                   % op_type)
                ambiguous_table_view.display(
                    "D_Ambiguous exception during %s" % op_type)
        finally:
            # Release the acquired client back to the pool even when
            # the validation above raised
            self.sdk_client_pool.release_client(sdk_client)

        # Verify doc count after expected CRUD failure
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.validate_docs_per_collections_all_buckets()

        # Fetch latest stats and validate the values are updated
        for node in target_nodes:
            vb_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
                self.log_failure("vBucket seq_no stats not updated")
    finally:
        # Disconnect the shell connections even when one of the
        # validations above raised
        for conn in shell_conn.values():
            conn.disconnect()

    self.validate_test_failure()
def test_sync_write_in_progress(self):
    """
    Test to simulate sync_write_in_progress error and validate the
    behavior.

    1. Select the target nodes and pick the vbuckets which are replica
       on all of them
    2. Create (without starting) two loader tasks working on the same
       keys, using self.doc_ops[0] and self.doc_ops[1]
    3. Simulate self.simulate_error on the target nodes, start task_1,
       wait for 10 seconds and then run task_2 to completion
    4. Revert the error condition and wait for task_1
    5. Validate that task_2 failed for all of its docs with the
       expected exception, and validate the final doc count
    """
    doc_ops = DocLoading.Bucket.DocOps
    sub_doc_ops = DocLoading.Bucket.SubDocOps
    doc_crud_types = [doc_ops.CREATE, doc_ops.UPDATE, doc_ops.DELETE]
    sub_doc_crud_types = [sub_doc_ops.INSERT, sub_doc_ops.UPSERT,
                          sub_doc_ops.REMOVE]
    first_op = self.doc_ops[0]
    second_op = self.doc_ops[1]

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    vb_info = dict()
    replica_vbs = dict()
    vb_info["init"] = dict()

    # Variable to hold one of the doc_generator objects
    gen_loader = [None, None]
    doc_loader_task_1 = None
    doc_loader_task_2 = None

    # Override the crud_batch_size
    self.crud_batch_size = 5
    expected_failed_doc_num = self.crud_batch_size

    # Select nodes to affect and open required shell_connections
    target_nodes = self.getTargetNodes()
    try:
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(node)
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])
            # Fetch affected nodes' vb_num which are of type=replica
            replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="replica")

        # Keep only the vbuckets that are replica on every target node
        target_vbuckets = replica_vbs[target_nodes[0].ip]
        for other_node in target_nodes[1:]:
            target_vbuckets = list(
                set(target_vbuckets).intersection(
                    set(replica_vbs[other_node.ip])))

        # Initialize doc_generators to use for testing
        self.log.info("Creating doc_generators")
        gen_create = doc_generator(self.key, self.num_items,
                                   self.crud_batch_size,
                                   key_size=self.key_size,
                                   vbuckets=self.cluster.vbuckets,
                                   target_vbucket=target_vbuckets)
        gen_update_delete = doc_generator(self.key, 0,
                                          self.crud_batch_size,
                                          key_size=self.key_size,
                                          vbuckets=self.cluster.vbuckets,
                                          target_vbucket=target_vbuckets,
                                          mutate=1)
        gen_subdoc = sub_doc_generator(self.key, 0,
                                       self.crud_batch_size,
                                       key_size=self.key_size,
                                       vbuckets=self.cluster.vbuckets,
                                       target_vbucket=target_vbuckets)
        self.log.info("Done creating doc_generators")

        initial_num_items = self.num_items
        # Start CRUD operation based on the given 'doc_op' type
        if first_op == doc_ops.CREATE:
            self.num_items += self.crud_batch_size
            gen_loader[0] = gen_create
        elif first_op == doc_ops.UPDATE:
            gen_loader[0] = gen_update_delete
        elif first_op == doc_ops.DELETE:
            gen_loader[0] = gen_update_delete
            self.num_items -= self.crud_batch_size
        elif first_op in sub_doc_crud_types:
            gen_loader[0] = gen_subdoc

        if second_op == doc_ops.CREATE:
            gen_loader[1] = gen_create
        elif second_op in [doc_ops.UPDATE, doc_ops.DELETE]:
            gen_loader[1] = gen_update_delete
        elif second_op in sub_doc_crud_types:
            if second_op == sub_doc_ops.INSERT \
                    and first_op == doc_ops.CREATE:
                # Insert should work on the docs created by task_1
                gen_subdoc = sub_doc_generator(
                    self.key, initial_num_items, self.crud_batch_size,
                    key_size=self.key_size,
                    vbuckets=self.cluster.vbuckets,
                    target_vbucket=target_vbuckets)
            gen_loader[1] = gen_subdoc

        # Load task for further upsert / remove operations
        if first_op in [sub_doc_ops.UPSERT, sub_doc_ops.REMOVE] \
                or second_op in [sub_doc_ops.UPSERT, sub_doc_ops.REMOVE]:
            subdoc_load_task = self.task.async_load_gen_sub_docs(
                self.cluster, self.bucket, gen_subdoc,
                sub_doc_ops.INSERT,
                path_create=True,
                batch_size=self.crud_batch_size, process_concurrency=8,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task_manager.get_task_result(subdoc_load_task)

        tem_durability = self.durability_level
        if self.with_non_sync_writes:
            tem_durability = "NONE"

        # Initialize tasks and store the task objects
        if first_op in doc_crud_types:
            doc_loader_task_1 = self.task.async_load_gen_docs(
                self.cluster, self.bucket, gen_loader[0], first_op, 0,
                batch_size=1, process_concurrency=self.crud_batch_size,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                print_ops_rate=False,
                start_task=False)
        elif first_op in sub_doc_crud_types:
            doc_loader_task_1 = self.task.async_load_gen_sub_docs(
                self.cluster, self.bucket, gen_loader[0], first_op, 0,
                path_create=True,
                batch_size=1, process_concurrency=self.crud_batch_size,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                print_ops_rate=False,
                start_task=False)

        # This will support both sync-write and non-sync-writes
        if second_op in doc_crud_types:
            doc_loader_task_2 = self.task.async_load_gen_docs(
                self.cluster, self.bucket, gen_loader[1], second_op, 0,
                batch_size=self.crud_batch_size, process_concurrency=1,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=tem_durability, timeout_secs=5,
                task_identifier="parallel_task2",
                print_ops_rate=False,
                start_task=False)
        elif second_op in sub_doc_crud_types:
            doc_loader_task_2 = self.task.async_load_gen_sub_docs(
                self.cluster, self.bucket, gen_loader[1], second_op, 0,
                path_create=True,
                batch_size=self.crud_batch_size, process_concurrency=1,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=tem_durability, timeout_secs=5,
                task_identifier="parallel_task2",
                print_ops_rate=False,
                start_task=False)

        # Perform specified action
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)
        self.sleep(5, "Wait for error simulation to take effect")

        # Start the loader_task_1
        self.task_manager.add_new_task(doc_loader_task_1)
        self.sleep(10, "Wait for task_1 CRUDs to reach server")

        # Start the loader_task_2
        self.task_manager.add_new_task(doc_loader_task_2)

        # This task should be done with all sync_write_in_progress errors
        self.task.jython_task_manager.get_task_result(doc_loader_task_2)

        # Revert the introduced error condition
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # Wait for doc_loader_task_1 to complete
        self.task.jython_task_manager.get_task_result(doc_loader_task_1)
    finally:
        # The shell connections are required only for the error
        # simulation, so close them also when a step above raised
        for conn in shell_conn.values():
            conn.disconnect()

    # Validation to verify the sync_in_write_errors in doc_loader_task_2
    failed_docs = doc_loader_task_2.fail
    if len(failed_docs.keys()) != expected_failed_doc_num:
        self.log_failure(
            "Exception not seen for few docs: {0}".format(failed_docs))

    expected_exception = SDKException.AmbiguousTimeoutException
    retry_reason = SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
    if first_op == doc_ops.CREATE:
        expected_exception = SDKException.DocumentNotFoundException
        retry_reason = None
    valid_exception = self.durability_helper.validate_durability_exception(
        failed_docs,
        expected_exception,
        retry_reason=retry_reason)

    if not valid_exception:
        self.log_failure("Got invalid exception")

    # Validate docs for update success or not
    if first_op == doc_ops.UPDATE:
        read_task = self.task.async_load_gen_docs(
            self.cluster, self.bucket, gen_loader[0],
            doc_ops.READ,
            batch_size=self.crud_batch_size, process_concurrency=1,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(read_task)
        for key, doc_info in read_task.success.items():
            if doc_info["cas"] != 0 \
                    and json.loads(str(doc_info["value"]))["mutated"] != 1:
                self.log_failure("Update failed for key %s: %s"
                                 % (key, doc_info))

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster,
                                              self.num_items)
    self.validate_test_failure()
def test_with_persistence_issues(self):
    """
    Test to make sure timeout is handled in durability calls
    and document CRUDs are successful even with disk related failures

    1. Select nodes from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations are succeeded

    Returns early, without running anything, when self.durability_level
    is one of the persistence based durability levels.

    Every shell connection opened for the target nodes is closed in a
    'finally' block, so a failing assertion cannot leak SSH sessions.

    Note: self.sdk_timeout value is considered as 'seconds'
    """
    if self.durability_level in [
            Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
            Bucket.DurabilityLevel.PERSIST_TO_MAJORITY]:
        self.log.critical("Test not valid for persistence durability")
        return

    is_disk_error = self.simulate_error \
        in [DiskError.DISK_FULL, DiskError.DISK_FAILURE]

    # Disk errors are driven by a single DiskError object covering all
    # target nodes, any other error by one CouchbaseError object per node
    disk_error_sim = None
    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = {"init": dict(), "afterCrud": dict()}
    vb_info_info = {"init": dict(), "afterCrud": dict()}

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    self.log.info("Simulate error condition on %s" % target_nodes)
    try:
        # Record the seqno / failover stats before inducing the error
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            vb_info_info["init"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            failover_info["init"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(self.bucket.name)

        if is_disk_error:
            disk_error_sim = DiskError(self.log, self.task_manager,
                                       self.cluster.master, target_nodes,
                                       60, 0, False, 120,
                                       disk_location="/data")
            disk_error_sim.create(action=self.simulate_error)
        else:
            for node in target_nodes:
                # Reuse the connection opened above instead of opening
                # (and leaking) a second one for the same node
                error_sim[node.ip] = CouchbaseError(self.log,
                                                    shell_conn[node.ip])
                error_sim[node.ip].create(self.simulate_error,
                                          bucket_name=self.bucket.name)

        # Perform CRUDs with induced error scenario is active
        load_spec = {
            "doc_crud": {
                MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION:
                    100,
                MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION:
                    25,
                MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION:
                    25,
                MetaCrudParams.DocCrud.COMMON_DOC_KEY: "test_collections",
            }
        }

        self.log.info("Perform 'create', 'update', 'delete' mutations")
        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.bucket_util.buckets,
                load_spec,
                mutation_num=1,
                async_load=True)

        # Perform new scope/collection creation during doc ops in parallel
        self.__perform_collection_crud(mutation_num=2)

        # Wait for doc_loading to complete and validate the doc ops
        self.task_manager.get_task_result(doc_loading_task)
        self.bucket_util.validate_doc_loading_results(doc_loading_task)
        if doc_loading_task.result is False:
            self.log_failure("Doc CRUDs failed with persistence issue")

        # Revert the induced error condition
        if is_disk_error:
            disk_error_sim.revert(self.simulate_error)
        else:
            for node in target_nodes:
                error_sim[node.ip].revert(self.simulate_error,
                                          bucket_name=self.bucket.name)
        # Wait irrespective of the error type before validating
        self.sleep(10, "Wait for node recovery to complete")

        # Doc count validation
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.validate_docs_per_collections_all_buckets()

        # Fetch latest failover stats and validate the values are updated
        self.log.info("Validating failover and seqno cbstats")
        for node in target_nodes:
            vb_info_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            failover_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(self.bucket.name)

            # Failover validation: stats must be unchanged
            val = \
                failover_info["init"][node.ip] \
                == failover_info["afterCrud"][node.ip]
            error_msg = "Failover stats got updated"
            self.assertTrue(val, msg=error_msg)

            # Seq_no validation (High level): seqnos must have moved
            val = \
                vb_info_info["init"][node.ip] \
                != vb_info_info["afterCrud"][node.ip]
            self.assertTrue(val,
                            msg="vbucket seq_no not updated after CRUDs")
    finally:
        # Disconnect only after the last cbstats call, since the Cbstats
        # objects share these connections
        for conn in shell_conn.values():
            conn.disconnect()

    self.validate_test_failure()

    # Doc count validation
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()
def test_process_error_on_nodes(self):
    """
    Test to validate OoO returns feature
    1. Start parallel CRUDs using single client
    2. Perform process crash / stop with doc_ops in parallel
    3. Make sure no crash or ep_eng issue is seen with the err_simulation

    Test params read through self.input.param():
    crash_on        - "single_node" (default) picks one random KV node,
                      any other value affects all KV nodes
    simulate_error  - error to induce
                      (default CouchbaseError.KILL_MEMCACHED)
    times_to_affect - number of create (and revert) iterations
                      (default 20)

    The shell connections opened for the affected nodes are closed in a
    'finally' block once the doc-ops have completed or the test failed.
    """
    tasks = list()
    node_data = dict()
    bucket = self.bucket_util.buckets[0]
    # Errors which stay in effect until explicitly reverted
    revert_errors = [CouchbaseError.STOP_MEMCACHED,
                     CouchbaseError.STOP_SERVER,
                     CouchbaseError.STOP_BEAMSMP,
                     CouchbaseError.STOP_PERSISTENCE]
    # Overriding sdk_timeout to max
    self.sdk_timeout = 60

    # Disable auto-failover to avoid failover of nodes
    status = RestConnection(self.cluster.master) \
        .update_autofailover_settings(False, 120, False)
    self.assertTrue(status, msg="Failure during disabling auto-failover")

    # Can take 'all_nodes' / 'single node'
    crash_on = self.input.param("crash_on", "single_node")
    error_to_simulate = self.input.param("simulate_error",
                                         CouchbaseError.KILL_MEMCACHED)
    num_times_to_affect = self.input.param("times_to_affect", 20)
    nodes_to_affect = self.cluster_util.get_kv_nodes()
    if crash_on == "single_node":
        nodes_to_affect = [choice(nodes_to_affect)]

    # '//' keeps the range boundaries integral on Python 2 and 3 alike
    half_of_items = self.num_items // 2
    create_gen = doc_generator(self.key, self.num_items,
                               self.num_items * 2)
    update_gen = doc_generator(self.key, 0, half_of_items)
    delete_gen = doc_generator(self.key, half_of_items, self.num_items)

    try:
        for node in nodes_to_affect:
            shell = RemoteMachineShellConnection(node)
            # Track the shell so that it can be disconnected at the end
            node_data[node] = {"shell": shell}
            node_data[node]["cb_err"] = CouchbaseError(self.log, shell)

        self.log.info("Starting doc-ops")
        for doc_op in self.doc_ops:
            if doc_op == DocLoading.Bucket.DocOps.CREATE:
                load_gen = create_gen
            elif doc_op == DocLoading.Bucket.DocOps.DELETE:
                load_gen = delete_gen
            else:
                load_gen = update_gen
            task = self.task.async_load_gen_docs(
                self.cluster, bucket, load_gen, doc_op,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                sdk_client_pool=self.sdk_client_pool,
                batch_size=10,
                process_concurrency=1,
                skip_read_on_error=True,
                print_ops_rate=False)
            tasks.append(task)

        self.log.info("Starting error_simulation on %s" % nodes_to_affect)
        for itr in range(1, num_times_to_affect + 1):
            self.log.info("Iteration :: %d" % itr)
            for node in nodes_to_affect:
                node_data[node]["cb_err"].create(error_to_simulate,
                                                 bucket.name)
            if error_to_simulate in revert_errors:
                self.sleep(30, "Sleep before reverting the error")
                for node in nodes_to_affect:
                    node_data[node]["cb_err"].revert(error_to_simulate,
                                                     bucket.name)
            else:
                self.sleep(10, "Wait for process to come back online")

        # Wait for doc_ops to complete
        for task in tasks:
            self.task_manager.get_task_result(task)
    finally:
        # Close the ssh sessions opened for the error simulation
        for data in node_data.values():
            data["shell"].disconnect()
def test_with_process_crash(self):
    """
    Test to make sure durability will succeed even if a node goes down
    due to crash and has enough nodes to satisfy the durability

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations are succeeded

    Fails immediately when self.num_replicas is less than 2.

    Every shell connection opened for the target nodes is closed in a
    'finally' block, so a failing assertion cannot leak SSH sessions.

    Note: self.sdk_timeout values is considered as 'seconds'
    """
    if self.num_replicas < 2:
        self.fail("Required: num_replicas > 1")

    # Override num_of_nodes affected to 1 (Positive case)
    self.num_nodes_affected = 1

    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = {"init": dict(), "afterCrud": dict()}
    vb_info_info = {"init": dict(), "afterCrud": dict()}
    active_vbs_in_target_nodes = list()

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    self.log.info("Will simulate error condition on %s" % target_nodes)
    try:
        # Record active vbuckets and seqno / failover stats before the
        # error is induced
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            active_vbs_in_target_nodes += cbstat_obj[node.ip].vbucket_list(
                self.bucket.name,
                "active")
            vb_info_info["init"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            failover_info["init"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(self.bucket.name)

        load_spec = {
            "doc_crud": {
                MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION:
                    100,
                MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION:
                    25,
                MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION:
                    25,
                MetaCrudParams.DocCrud.COMMON_DOC_KEY: "test_collections",
            }
        }
        # Remove active vbuckets from doc_loading to avoid errors.
        # Plain set difference: an id outside 0-1023 must never be added
        # to the targets (a symmetric difference would add it)
        load_spec["target_vbuckets"] = list(
            set(range(0, 1024)) - set(active_vbs_in_target_nodes))

        self.log.info("Perform 'create', 'update', 'delete' mutations")
        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.bucket_util.buckets,
                load_spec,
                mutation_num=1,
                async_load=True)

        self.sleep(5, "Wait for doc loaders to start loading data")

        for node in target_nodes:
            # Reuse the connection opened above instead of opening
            # (and leaking) a second one for the same node
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # Perform new scope/collection creation during doc ops in parallel
        self.__perform_collection_crud()

        # Wait for document_loader tasks to complete
        self.task_manager.get_task_result(doc_loading_task)
        self.bucket_util.validate_doc_loading_results(doc_loading_task)
        if doc_loading_task.result is False:
            self.log_failure("Doc CRUDs failed with process crash")

        if self.simulate_error \
                not in [DiskError.DISK_FULL, DiskError.DISK_FAILURE]:
            # Revert the induced error condition
            for node in target_nodes:
                error_sim[node.ip].revert(self.simulate_error,
                                          bucket_name=self.bucket.name)
        self.sleep(10, "Wait for node recovery to complete")

        # In case of error with Ephemeral bucket, need to rebalance
        # to make sure data is redistributed properly
        if self.bucket_type == Bucket.Type.EPHEMERAL:
            result = None
            # At most two attempts, sleeping after every failed one
            for _ in range(2):
                result = self.task.rebalance(
                    self.servers[0:self.nodes_init],
                    [], [])
                if result:
                    break
                self.sleep(10, "Wait before retrying rebalance")

            self.assertTrue(result, "Rebalance failed")

        # Fetch latest failover stats and validate the values are updated
        self.log.info("Validating failover and seqno cbstats")
        for node in target_nodes:
            vb_info_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            failover_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(self.bucket.name)

            # Failover stat validation.
            # A change is expected for kill_memcached, and for ephemeral
            # buckets with any error other than stop_memcached
            if self.simulate_error == CouchbaseError.KILL_MEMCACHED:
                expect_change = True
            else:
                expect_change = \
                    self.simulate_error != CouchbaseError.STOP_MEMCACHED \
                    and self.bucket_type == Bucket.Type.EPHEMERAL
            stats_changed = failover_info["init"][node.ip] \
                != failover_info["afterCrud"][node.ip]
            val = stats_changed == expect_change
            error_msg = "Failover stats mismatch after error condition:" \
                        " %s != %s" \
                        % (failover_info["init"][node.ip],
                           failover_info["afterCrud"][node.ip])
            self.assertTrue(val, msg=error_msg)

            # Seq_no validation (High level)
            val = \
                vb_info_info["init"][node.ip] \
                != vb_info_info["afterCrud"][node.ip]
            self.assertTrue(val,
                            msg="vbucket seq_no not updated after CRUDs")
    finally:
        # Disconnect only after the last cbstats call, since the Cbstats
        # objects share these connections
        for conn in shell_conn.values():
            conn.disconnect()

    # Doc count validation
    self.validate_test_failure()
    self.bucket_util.validate_docs_per_collections_all_buckets()
def test_stop_process(self):
    """
    1. Starting loading docs into the default bucket
    2. Stop the requested process, which will impact the
       memcached operations
    3. Wait for load bucket task to complete
    4. Validate the docs for durability

    The error to induce is read from the 'simulate_error' test param
    (default None). Returns early, after logging an error, when no
    target vbuckets are found for the selected node.

    The ssh session and the pooled SDK client are released in 'finally'
    blocks so a failure in between cannot leak them.
    """
    error_to_simulate = self.input.param("simulate_error", None)
    target_node = self.getTargetNode()
    remote = RemoteMachineShellConnection(target_node)
    try:
        error_sim = CouchbaseError(self.log, remote)
        target_vbuckets = CrashTest.getVbucketNumbers(
            remote, self.bucket.name, self.target_node)

        # Pick one random collection as the doc loading target
        bucket_dict = BucketUtils.get_random_collections(
            self.cluster.buckets,
            req_num=1,
            consider_scopes="all",
            consider_buckets="all")

        # list(...)[0] works for both list and view style keys()
        bucket = BucketUtils.get_bucket_obj(self.cluster.buckets,
                                            list(bucket_dict.keys())[0])
        scopes = bucket_dict[bucket.name]["scopes"]
        scope_name = list(scopes.keys())[0]
        collection_name = \
            list(scopes[scope_name]["collections"].keys())[0]
        scope = BucketUtils.get_scope_obj(bucket, scope_name)
        collection = BucketUtils.get_collection_obj(scope,
                                                    collection_name)

        if not target_vbuckets:
            self.log.error("No target vbucket list generated to load data")
            return

        self.start_doc_loading_tasks(target_vbuckets, scope_name,
                                     collection)

        # Induce the error condition
        error_sim.create(error_to_simulate)

        self.sleep(20, "Wait before reverting the error condition")
        # Revert the simulated error condition
        error_sim.revert(error_to_simulate)
    finally:
        # Close the ssh session on every exit path
        remote.disconnect()

    # Wait for doc loading task to complete
    # NOTE(review): the load tasks read below are presumably set by
    # start_doc_loading_tasks() - confirm
    self.task.jython_task_manager.get_task_result(self.doc_loading_task)
    if self.atomicity:
        self.task.jython_task_manager.get_task_result(
            self.transaction_load_task)
    elif self.N1qltxn:
        self.task.jython_task_manager.get_task_result(
            self.N1ql_load_task)

    # Failures are reported when the target is "active" or with 2 / 3
    # replicas
    if self.doc_loading_task.fail:
        if self.target_node == "active" or self.num_replicas in [2, 3]:
            self.log_failure("Unwanted failures for keys: %s"
                             % self.doc_loading_task.fail.keys())

    validate_passed = \
        self.durability_helper.validate_durability_exception(
            self.doc_loading_task.fail,
            SDKException.DurabilityAmbiguousException)
    if not validate_passed:
        self.log_failure("Unwanted exception seen during validation")

    # Get SDK client for CRUD retries
    sdk_client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
    try:
        for doc_key, crud_result in self.doc_loading_task.fail.items():
            result = sdk_client.crud(DocLoading.Bucket.DocOps.CREATE,
                                     doc_key,
                                     crud_result["value"],
                                     replicate_to=self.replicate_to,
                                     persist_to=self.persist_to,
                                     durability=self.durability_level,
                                     timeout=self.sdk_timeout)
            if result["status"] is False:
                self.log_failure("Retry of doc_key %s failed: %s"
                                 % (doc_key, result["error"]))
    finally:
        # Return the SDK client to the pool even if a retry raised
        self.sdk_client_pool.release_client(sdk_client)

    self.validate_test_failure()

    self.bucket_util._wait_for_stats_all_buckets(self.cluster.buckets)
    # Update self.num_items and validate docs per collection
    if not self.N1qltxn and self.atomicity is False:
        self.bucket_util.validate_docs_per_collections_all_buckets(
            self.cluster)
def test_sub_doc_with_persistence_issues(self):
    """
    1. Select nodes from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations met the durability condition

    Returns early, without running anything, when the upper-cased
    self.durability_level is one of the persistence based levels.

    The shell connections opened for the target nodes are closed in a
    'finally' block, so a failing assertion cannot leak SSH sessions.
    """
    if self.durability_level.upper() in [
            Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
            Bucket.DurabilityLevel.PERSIST_TO_MAJORITY]:
        self.log.critical("Test not valid for persistence durability")
        return

    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = {"init": dict(), "afterCrud": dict()}
    vb_info_info = {"init": dict(), "afterCrud": dict()}
    def_bucket = self.bucket_util.buckets[0]

    load_spec = {
        "doc_crud": {
            MetaCrudParams.DocCrud.COMMON_DOC_KEY: "test_collections",
            MetaCrudParams.DocCrud.READ_PERCENTAGE_PER_COLLECTION: 50,
        },
        "subdoc_crud": {
            MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION: 20,
            MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION: 10,
            MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION: 10,
        },
    }

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    # Create new docs for sub-doc operations to run
    self.load_data_for_sub_doc_ops()

    self.log.info("Will simulate error condition on %s" % target_nodes)
    try:
        # Record the seqno / failover stats before inducing the error
        for node in target_nodes:
            # Create shell_connections
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            vb_info_info["init"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
            failover_info["init"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(def_bucket.name)

        for node in target_nodes:
            # Perform specified action
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=def_bucket.name)

        # Perform CRUDs with induced error scenario is active
        self.log.info("Perform 'insert', 'upsert', 'remove' mutations")
        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.bucket_util.buckets,
                load_spec,
                mutation_num=0,
                async_load=True)

        # Perform new scope/collection creation during doc ops in parallel
        self.__perform_collection_crud(mutation_num=1)

        # Wait for doc_loading to complete and validate the doc ops
        self.task_manager.get_task_result(doc_loading_task)
        if doc_loading_task.result is False:
            self.log_failure("Doc CRUDs failed with persistence issue")

        # Revert the induced error condition
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=def_bucket.name)

        # Fetch latest failover stats and validate the values are updated
        self.log.info("Validating failover and seqno cbstats")
        for node in target_nodes:
            vb_info_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
            failover_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(def_bucket.name)

            # Failover validation: stats must be unchanged, so the
            # assertion fires only when they got updated
            val = \
                failover_info["init"][node.ip] \
                == failover_info["afterCrud"][node.ip]
            self.assertTrue(val, msg="Failover stats got updated")

            # Seq_no validation (High level): seqnos must have moved
            val = \
                vb_info_info["init"][node.ip] \
                != vb_info_info["afterCrud"][node.ip]
            self.assertTrue(val,
                            msg="vbucket seq_no not updated after CRUDs")
    finally:
        # Disconnect the shell connection
        for conn in shell_conn.values():
            conn.disconnect()

    self.validate_test_failure()
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()