def test_book_flight(self):
    """
    Book one flight per user through the 'confirmBooking' endpoint and
    display the resulting bookings in a table.

    Users are addressed as `self.username_format % index` with a 1-based
    index. Each booking picks a random American Airlines flight and
    requests 1 or 2 economy seats. The test fails on the first non-200
    response.
    """
    num_tickets_to_book = 3
    flight_ids = self.__get_flight_ids("American Airlines")
    endpoint = "http://%s/%s" % (self.profile_endpoint, "confirmBooking")

    table = TableView(self.log.info)
    table.set_headers(["Username", "Booking ID", "Status",
                       "Class", "Num Seats"])

    # Indexes are 1-based, so the upper bound must be inclusive to really
    # book `num_tickets_to_book` tickets (range() excludes its stop value)
    for index in range(1, num_tickets_to_book + 1):
        username = self.username_format % index
        booking_data = {
            "username": username,
            "password": self.password,
            "flightId": choice(flight_ids),
            "flightSeats": choice(range(1, 3)),
            "bookingClass": "economy",
            # Deterministic per-user value derived from the username
            "bankAccount": hashlib.md5(
                username.encode('utf-8')).hexdigest(),
        }
        response = RestHelper.post_request(endpoint, booking_data)
        if response.status_code != 200:
            self.log.error("Request returned code %s: %s"
                           % (response.status_code, response.json()))
            self.fail("Booking failed")

        booking = response.json()["Msg"]
        table.add_row([username, booking["id"], booking["status"],
                       booking["bookingClass"], booking["flightSeats"]])

    table.display("Booking details:")
def __get_purged_tombstone_from_last_run(self, nodes=None):
    """
    Collect the tombstones purged by the latest purger run on each node.

    For every node the debug.log is read (from the marker stored in
    self.ts_during_start for that node, falling back to the whole file
    when that yields no output) and scanned backwards for the most recent
    'tombstone_agent:purge:' entry. The purged count comes from that entry
    and the purged metakv keys from the continuation lines following it.

    :param nodes: Nodes to inspect. Defaults to all nodes in the cluster
    :return last_purged_tombstones: Dict of format,
        { node_ip: {'count': N, 'keys': [k1, k2, ..] }, ...}
    """
    cat_cmd = "cat %s/var/lib/couchbase/logs/debug.log" \
        % self.couchbase_base_dir
    tail_cmd = cat_cmd + " | sed -n '/%s/,$p'"

    # Raw strings: the patterns contain regex escapes (\-, \., \[) that are
    # invalid escape sequences in a normal string literal
    purged_ts_count_pattern = re.compile(
        r".*Purged ([0-9]+) ns_config tombstone")
    meta_kv_keys_pattern = re.compile(
        r'.*{metakv,[ ]*<<"/([0-9a-zA-Z_\-\.]+)">>')
    start_of_line = re.compile(r"^\[ns_server:")

    tbl_view = TableView(self.log.info)
    tbl_view.set_headers(["Node", "Purged Keys"])

    last_purged_tombstones = dict()
    if nodes is None:
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster)

    for node in nodes:
        self.log.info("Processing debug logs from %s" % node.ip)
        shell = RemoteMachineShellConnection(node)
        try:
            output, _ = shell.execute_command(
                tail_cmd % self.ts_during_start[node.ip])
            if not output:
                # Start marker not found in the log, read the whole file
                output, _ = shell.execute_command(cat_cmd)
        finally:
            # Always release the ssh session, even if the command fails
            shell.disconnect()
        self.log.debug("Tail stdout:\n%s" % output)

        target_buffer = ""
        total_ts_purged = 0
        # Walk the log bottom-up so the first purge entry hit is the latest
        for line in reversed(output):
            if not start_of_line.match(line):
                # Continuation of a multi-line entry, keep original order
                target_buffer = line + target_buffer
                continue

            count_match = None
            if "tombstone_agent:purge:" in line:
                count_match = purged_ts_count_pattern.match(line)
            if count_match is not None:
                total_ts_purged = int(count_match.group(1))
                break
            # Unrelated entry (or a purge-agent entry without a count):
            # its continuation lines are of no interest, keep scanning
            target_buffer = ""

        last_purged_tombstones[node.ip] = {
            "count": total_ts_purged,
            "keys": meta_kv_keys_pattern.findall(target_buffer),
        }
        tbl_view.add_row([node.ip, str(total_ts_purged)])

    tbl_view.display("Purged_keys:")
    self.log.debug("Purged keys :: %s" % last_purged_tombstones)
    return last_purged_tombstones
def test_cancel_booking(self):
    """
    Cancel a randomly selected existing booking and verify its status.

    1. Fetch the booking history of users 1..10 ('allBookings')
    2. Pick a random user having bookings and one of that user's bookings
    3. Cancel it ('cancelBooking')
    4. Fetch the booking ('getBooking'), display it and assert that the
       status is 'Booking Cancelled'
    """
    def post_or_fail(api_name, payload, failure_msg):
        """POST payload to the profile API; fail the test on non-200.

        Returns the 'Msg' field of the JSON response.
        """
        url = "http://%s/%s" % (self.profile_endpoint, api_name)
        result = RestHelper.post_request(url, payload)
        if result.status_code != 200:
            self.log.error("Request returned code %s: %s"
                           % (result.status_code, result.json()))
            self.fail(failure_msg)
        return result.json()["Msg"]

    booked_tickets = list()
    for index in range(1, 11):
        username = self.username_format % index
        auth_data = {"username": username, "password": self.password}
        history = post_or_fail("allBookings", auth_data,
                               "Fetching booking history failed")
        bookings = history[0]["bookings"]
        if len(bookings) > 0:
            booked_tickets.append([username, bookings])

    # choice() raises a bare IndexError on an empty list, fail explicitly
    if not booked_tickets:
        self.fail("No bookings available to cancel")

    target_user = choice(booked_tickets)
    booking_id = choice(target_user[1])
    self.log.info("Cancel %s for user %s" % (booking_id, target_user[0]))

    data = {"username": target_user[0],
            "password": self.password,
            "id": booking_id}
    post_or_fail("cancelBooking", data, "Cancel booking failed")

    # Fetch booking status to confirm cancellation
    response = post_or_fail("getBooking", data,
                            "Fetching booking status failed")

    table = TableView(self.log.info)
    table.add_row(["Booking ID", response["id"]])
    table.add_row(["Flight", response["flightId"]])
    table.add_row(["Status", response["status"]])
    table.add_row(["Seats",
                   "%s (%s)" % (response["flightSeats"],
                                ", ".join(response["TicketsBooked"]))])
    table.add_row(["Class", response["bookingClass"]])
    table.display("Ticket status:")
    self.assertEqual(response["status"], "Booking Cancelled")
def __get_deleted_key_count(self, check_if_zero=False):
    """
    Display the ns_config deleted-key count reported for each node.

    :param check_if_zero: When True, fail the test on the first node
                          whose deleted-key count is not 0
    """
    counts_by_node = self.cluster_util.get_ns_config_deleted_keys_count()

    summary = TableView(self.log.info)
    summary.set_headers(["Node", "Deleted_key_count"])
    for node_ip, num_keys in counts_by_node.items():
        summary.add_row(["%s" % node_ip, "%s" % num_keys])
    summary.display("Tombstone count on cluster nodes:")

    # Validation is optional and runs only after the table is displayed
    if check_if_zero:
        for node_ip, num_keys in counts_by_node.items():
            if num_keys != 0:
                self.fail("%s Deleted key count %s != 0"
                          % (node_ip, num_keys))
def print_spec_details(self, spec, cycles, elapsed_time):
    """
    Display a summary table for a spec driven data load.

    :param spec: Mapping read for MetaCrudParams.COLLECTIONS_TO_RECREATE
                 and MetaCrudParams.SCOPES_TO_RECREATE
    :param cycles: Number of data load cycles performed
    :param elapsed_time: Time taken, displayed as seconds
    """
    summary = TableView(self.log.info)
    summary.set_headers(["Operation", "Value"])

    details = [
        ("Collections dropped and recreated",
         spec[MetaCrudParams.COLLECTIONS_TO_RECREATE]),
        ("Scopes dropped and recreated",
         spec[MetaCrudParams.SCOPES_TO_RECREATE]),
        ("Cycles of data load", cycles),
        ("Time Elapsed in secs", elapsed_time),
    ]
    for operation, value in details:
        summary.add_row([operation, str(value)])

    summary.display("Data load details")
def test_list_booking_history(self):
    """
    Fetch the booking history of each user through the 'allBookings'
    endpoint and display it in a table.

    Users are addressed as `self.username_format % index` with a 1-based
    index. The test fails on the first non-200 response.
    """
    table = TableView(self.log.info)
    table.set_headers(["Username", "Num tickets", "IDs"])

    num_tickets_to_book = 3
    endpoint = "http://%s/%s" % (self.profile_endpoint, "allBookings")

    # Indexes are 1-based, so the upper bound must be inclusive to cover
    # all `num_tickets_to_book` users (range() excludes its stop value)
    for index in range(1, num_tickets_to_book + 1):
        username = self.username_format % index
        auth_data = {"username": username, "password": self.password}
        response = RestHelper.post_request(endpoint, auth_data)
        if response.status_code != 200:
            self.log.error("Request returned code %s: %s"
                           % (response.status_code, response.json()))
            self.fail("Fetching booking history failed")

        bookings = response.json()["Msg"][0]["bookings"]
        table.add_row([username, len(bookings), "\n".join(bookings)])

    table.display("Booking history:")
def check_replica_eviction():
    """
    Poll cbstats of every node in `node_data` and log a test failure when
    active items look evicted too early or when the replica resident
    ratio exceeds the active one.

    Keeps looping while no test failure is recorded and
    `run_eviction_check` stays truthy. Relies on `self`, `node_data`,
    `bucket` and `run_eviction_check` from the enclosing scope.
    """
    stats_tbl = TableView(self.log.info)
    stats_tbl.set_headers(["Node", "Memory", "WM_Threshold",
                           "Itm_mem", "Meta_mem", "Evictable_mem",
                           "A_rr", "R_rr"])

    while self.test_failure is None and run_eviction_check:
        # Start every polling round with an empty table
        stats_tbl.rows = []
        for kv_node in node_data.keys():
            stats = node_data[kv_node]["cbstat"].all_stats(bucket.name)

            bucket_mem = int(stats["ep_max_size"])
            high_wat = float(stats["ep_mem_high_wat_percent"])
            low_wat = float(stats["ep_mem_low_wat_percent"])
            wm_threshold = (high_wat - low_wat) * 100

            replica_itm_mem = int(stats["vb_replica_itm_memory"])
            replica_meta_mem = int(stats["vb_replica_meta_data_memory"])
            evictable_mem = replica_itm_mem - replica_meta_mem

            active_rr = int(stats["vb_active_perc_mem_resident"])
            replica_rr = int(stats["vb_replica_perc_mem_resident"])

            stats_tbl.add_row([
                kv_node.ip,
                str(bucket_mem),
                str(wm_threshold),
                stats["vb_replica_itm_memory"],
                stats["vb_replica_meta_data_memory"],
                str(evictable_mem),
                str(active_rr),
                str(replica_rr),
            ])

            # The division is evaluated only when active_rr is not 100
            if active_rr != 100:
                if evictable_mem > (bucket_mem / wm_threshold):
                    stats_tbl.display("Node memory stats")
                    self.log_failure("%s - Active keys evicted before "
                                     "meeting the threshold: %s"
                                     % (kv_node.ip, stats))

            if replica_rr > active_rr:
                stats_tbl.display("Node memory stats")
                self.log_failure(
                    "%s: (active_rr) %s < %s (replica_rr)"
                    % (kv_node.ip, active_rr, replica_rr))
def print_cluster_stats(self):
    """
    Display per-node cluster statistics fetched from the master node.

    One row per node: host part of the node key followed by its services,
    CPU utilization, memory and swap figures.
    """
    stat_fields = ("services", "cpu_utilization", "mem_total",
                   "mem_free", "swap_mem_total", "swap_mem_used")

    stats_table = TableView(self.log.info)
    stats_table.set_headers(["Node", "Services", "CPU_utilization",
                             "Mem_total", "Mem_free",
                             "Swap_mem_total", "Swap_mem_used"])

    cluster_stat = RestConnection(self.cluster.master).get_cluster_stats()
    for node_key, node_stats in cluster_stat.items():
        # Node key looks like 'host:port', only the host is displayed
        host = node_key.split(':')[0]
        values = [str(node_stats[field]) for field in stat_fields]
        stats_table.add_row([host] + values)

    stats_table.display("Cluster statistics")
def test_get_flights_for_airline(self):
    """
    Fetch all flights of 'American Airlines' from the inventory endpoint
    and display them in a table.

    :raises Exception: When the endpoint returns a non-200 status code
    """
    target_airline = "American Airlines"
    flight_fields = ("flight_id", "model", "departing_airport",
                     "arriving_airport", "departure_date", "status")

    # The airline name contains a space and must be URL-quoted
    quoted_airline = urllib.parse.quote(target_airline)
    rest_url = "http://%s/%s/%s" % (self.inventory_endpoint, "flights",
                                    quoted_airline)
    response = RestHelper.get_request(rest_url)
    if response.status_code != 200:
        raise Exception("Requests status content:{0}".format(
            response.content))

    title = "Flights for airline: %s" % target_airline
    self.log.info(title)

    flights_table = TableView(self.log.info)
    flights_table.set_headers(["Flight Id", "Model",
                               "Departure", "Arrival",
                               "Departure Time", "Status"])
    for entry in response.json():
        flight = entry["flights"]
        flights_table.add_row([flight[field] for field in flight_fields])

    flights_table.display(title)
def test_maxttl_with_sync_writes(self):
    """
    1. Load few docs without TTL
    2. Load few docs with TTL set in parallel to #1
    3. Validate docs get expired after the TTL time
    4. Mutate the docs again and validate the doc count

    Test params read:
      doc_ttl      - TTL to use (defaults to self.maxttl)
      doc_ops_type - '<non_ttl_mode>;<ttl_mode>', where 'sync' selects
                     durability writes and anything else selects
                     replicate_to / persist_to (default 'sync;sync')
    :return:
    """
    def create_task_property(ops_mode):
        """Build 'create' loader settings for the given write mode."""
        task_property = dict()
        task_property["op_type"] = "create"
        if ops_mode == "sync":
            task_property["replicate_to"] = 0
            task_property["persist_to"] = 0
            task_property["durability"] = self.durability_level
        else:
            task_property["replicate_to"] = self.replicate_to
            task_property["persist_to"] = self.persist_to
            task_property["durability"] = "None"
        return task_property

    def_bucket = self.cluster.buckets[0]
    self.maxttl = self.input.param("doc_ttl", self.maxttl)
    doc_ops_type = self.input.param("doc_ops_type",
                                    "sync;sync").split(";")

    # TTL docs cover [0, num_items), non-TTL docs [num_items, 2*num_items)
    self.log.info("Creating doc_generators")
    ttl_gen_create = doc_generator(
        self.key, 0, self.num_items,
        doc_size=self.doc_size,
        doc_type=self.doc_type,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster.vbuckets)
    non_ttl_gen_create = doc_generator(
        self.key, self.num_items, self.num_items * 2,
        doc_size=self.doc_size,
        doc_type=self.doc_type,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster.vbuckets)

    # Loader settings: first mode is for non-TTL docs, second for TTL docs
    non_ttl_task_property = create_task_property(doc_ops_type[0])
    ttl_task_property = create_task_property(doc_ops_type[1])

    self.load_docs_in_parallel(def_bucket,
                               non_ttl_gen_create, ttl_gen_create,
                               non_ttl_task_property, ttl_task_property)

    # Both doc sets must be present before the TTL elapses
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster,
                                              self.num_items * 2)

    self.sleep(self.maxttl, "Sleep for maxTTL time")
    self.bucket_util._expiry_pager(self.cluster)
    self.sleep(25, "Waiting for items to be purged")

    # Read back all TTL docs, every read is expected to fail
    ttl_task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, ttl_gen_create, "read", self.maxttl,
        batch_size=10, process_concurrency=8,
        timeout_secs=self.sdk_timeout,
        compression=self.sdk_compression,
        sdk_client_pool=self.sdk_client_pool)
    self.task.jython_task_manager.get_task_result(ttl_task)

    # Max-TTL doc expiry validation
    self.log.info("Validating expiry of docs")
    surviving_keys = ttl_task.success.keys()
    if len(surviving_keys) != 0:
        self.fail("Items present after MaxTTL time: %s" % surviving_keys)

    # A failed read carrying both a CAS and an error is unexpected
    invalid_exception_tbl = TableView(self.log.info)
    invalid_exception_tbl.set_headers(["Doc_Key", "CAS"])
    for doc_key, read_result in ttl_task.fail.items():
        has_cas = read_result["cas"] != 0
        has_error = read_result["error"] is not None
        if has_cas and has_error:
            invalid_exception_tbl.add_row([doc_key, read_result["cas"]])
    invalid_exception_tbl.display("Invalid exceptions for following keys")

    if len(invalid_exception_tbl.rows) != 0:
        self.fail("Seen invalid document exception")

    # Only the non-TTL docs should remain after expiry
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster,
                                              self.num_items)

    # Document mutations after doc_expiry:
    # non-TTL docs are updated, TTL docs are loaded with 'create' again
    non_ttl_task_property["op_type"] = "update"
    self.load_docs_in_parallel(def_bucket,
                               non_ttl_gen_create, ttl_gen_create,
                               non_ttl_task_property, ttl_task_property)

    # Both doc sets must be present again after the reload
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster,
                                              self.num_items * 2)
def test_timeout_with_crud_failures(self):
    """
    Test to make sure timeout is handled in durability calls
    and no documents are loaded when durability cannot be met using
    error simulation in server node side

    This will validate failure in majority of nodes, where durability will
    surely fail for all CRUDs

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify no operations succeeds
    4. Revert the error scenario from the cluster to resume durability
    5. Validate all mutations are succeeded after reverting
       the error condition

    Note: self.sdk_timeout values is considered as 'seconds'
    """

    # Local method to validate vb_seqno
    def validate_vb_seqno_stats():
        """
        Compare the vbucket seqno stats of the current `node` (loop
        variable of the enclosing method, read at call time) against the
        values captured before the error was simulated.

        :return retry_validation: Boolean denoting to retry validation
        """
        retry_validation = False
        # Refresh the post-timeout snapshot on every call
        vb_info["post_timeout"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        for vb_num in range(self.vbuckets):
            # Stats and affected_vbs are keyed by the vbucket number as str
            vb_num = str(vb_num)
            if vb_num not in affected_vbs:
                # vbuckets not targeted by any CRUD must stay unchanged
                if vb_info["init"][node.ip][vb_num] \
                        != vb_info["post_timeout"][node.ip][vb_num]:
                    self.log_failure(
                        "Unaffected vb-%s stat updated: %s != %s"
                        % (vb_num,
                           vb_info["init"][node.ip][vb_num],
                           vb_info["post_timeout"][node.ip][vb_num]))
            elif int(vb_num) in target_nodes_vbuckets["active"]:
                # Active vbuckets on the target nodes must stay unchanged
                if vb_info["init"][node.ip][vb_num] \
                        != vb_info["post_timeout"][node.ip][vb_num]:
                    self.log_failure(
                        err_msg
                        % (node.ip,
                           "active",
                           vb_num,
                           vb_info["init"][node.ip][vb_num],
                           vb_info["post_timeout"][node.ip][vb_num]))
            elif int(vb_num) in target_nodes_vbuckets["replica"]:
                # Unchanged replica stats only request a retry (warning),
                # they are not logged as a failure here
                if vb_info["init"][node.ip][vb_num] \
                        == vb_info["post_timeout"][node.ip][vb_num]:
                    retry_validation = True
                    self.log.warning(
                        err_msg
                        % (node.ip,
                           "replica",
                           vb_num,
                           vb_info["init"][node.ip][vb_num],
                           vb_info["post_timeout"][node.ip][vb_num]))
        return retry_validation

    # Per-node helper objects, keyed by node.ip
    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    target_nodes_vbuckets = dict()
    # vbucket seqno snapshots: "init", "post_timeout" and "afterCrud"
    vb_info = dict()
    # Loader task and doc generator per op_type
    tasks = dict()
    doc_gen = dict()
    affected_vbs = list()

    target_nodes_vbuckets["active"] = []
    target_nodes_vbuckets["replica"] = []
    vb_info["init"] = dict()
    vb_info["post_timeout"] = dict()
    vb_info["afterCrud"] = dict()

    # Override crud_batch_size to minimum value for testing
    self.crud_batch_size = 5

    timeout_err_str = self.durability_helper.EXCEPTIONS["request_timeout"]
    ambiguous_err_str = self.durability_helper.EXCEPTIONS["ambiguous"]

    # Create required doc_generators
    # NOTE(review): 'insert' uses a bare `/` while 'upsert' wraps it in
    # int(); under Python 3 this passes floats - confirm that
    # sub_doc_generator accepts them
    doc_gen["insert"] = sub_doc_generator(
        self.key,
        self.num_items / 2,
        self.num_items / 2 + self.crud_batch_size)
    doc_gen["remove"] = sub_doc_generator_for_edit(self.key,
                                                   0,
                                                   self.crud_batch_size,
                                                   template_index=2)
    doc_gen["read"] = sub_doc_generator_for_edit(self.key,
                                                 0,
                                                 self.crud_batch_size,
                                                 template_index=0)
    doc_gen["upsert"] = sub_doc_generator_for_edit(
        self.key,
        int(self.num_items / 4),
        int(self.num_items / 4) + self.crud_batch_size,
        template_index=1)

    # Open a shell per target node, record its active / replica vbuckets
    # and the initial vbucket seqno stats
    target_nodes = self.getTargetNodes()
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
        target_nodes_vbuckets["active"] += \
            cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                             vbucket_type="active")
        target_nodes_vbuckets["replica"] += \
            cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                             vbucket_type="replica")
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])

    # Earliest wall-clock time at which the CRUDs are allowed to time out
    curr_time = int(time.time())
    expected_timeout = curr_time + self.sdk_timeout

    # Tasks are only created here (start_task=False); they are added to
    # the task manager after the error is simulated
    for op_type in doc_gen.keys():
        tasks[op_type] = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, doc_gen[op_type], op_type, 0,
            path_create=True,
            batch_size=1, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            start_task=False)

    # Perform specified action
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)

    for op_type in doc_gen.keys():
        self.task_manager.add_new_task(tasks[op_type])

    # Wait for document_loader tasks to complete
    for op_type in doc_gen.keys():
        self.task.jython_task_manager.get_task_result(tasks[op_type])

        # Validate task failures
        if op_type == "read":
            # Validation for read task
            for doc_id, crud_result in tasks[op_type].success.items():
                vb_num = self.bucket_util.get_vbucket_num_for_key(
                    doc_id, self.vbuckets)
                if vb_num in target_nodes_vbuckets["active"]:
                    self.log_failure("Read succeeded for %s present in "
                                     "stopped active vbucket: %s"
                                     % (doc_id, vb_num))
            self.durability_helper.validate_durability_exception(
                tasks[op_type].fail,
                self.durability_helper.EXCEPTIONS["request_timeout"])
        else:
            # Validation of CRUDs - Update / Create / Delete
            if len(tasks[op_type].success.keys()) != 0:
                self.log_failure("Few keys succeeded for %s: %s"
                                 % (op_type,
                                    tasks[op_type].success.keys()))
            # Keys on the target nodes' active vbuckets are expected to
            # fail with the timeout error, all others with ambiguous
            for doc_id, crud_result in tasks[op_type].fail.items():
                vb_num = self.bucket_util.get_vbucket_num_for_key(
                    doc_id, self.vbuckets)
                if vb_num in target_nodes_vbuckets["active"]:
                    if timeout_err_str not in str(crud_result["error"]):
                        self.log_failure(
                            "Invalid exception for doc %s, vb %s: %s"
                            % (doc_id, vb_num, crud_result))
                else:
                    if ambiguous_err_str not in str(crud_result["error"]):
                        self.log_failure(
                            "Invalid exception for doc %s, vb %s: %s"
                            % (doc_id, vb_num, crud_result))

    # Revert the specified error scenario
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)

    # Check whether the timeout triggered properly
    if int(time.time()) < expected_timeout:
        self.log_failure("Timed-out before expected time")

    # Collect the vbuckets (as str) of all keys targeted by mutations;
    # this consumes the generators of every op_type other than "read"
    for op_type in doc_gen.keys():
        if op_type == "read":
            continue
        while doc_gen[op_type].has_next():
            doc_id, _ = doc_gen[op_type].next()
            affected_vbs.append(
                str(self.bucket_util.get_vbucket_num_for_key(
                    doc_id,
                    self.vbuckets)))

    # Drop duplicate vbucket numbers
    affected_vbs = list(set(affected_vbs))
    err_msg = "%s - mismatch in %s vb-%s seq_no: %s != %s"
    # Fetch latest stats and validate the seq_nos are not updated
    for node in target_nodes:
        retry_count = 0
        max_retry = 3
        while retry_count < max_retry:
            self.log.info("Trying to validate vbseq_no stats: %d"
                          % (retry_count + 1))
            retry_count += 1
            retry_required = validate_vb_seqno_stats()
            if not retry_required:
                break
            self.sleep(5, "Sleep for vbseq_no stats to update")
        else:
            # This will be exited only if `break` condition is not met
            self.log_failure("validate_vb_seqno_stats verification failed")

    self.validate_test_failure()

    # If replicas+1 == total nodes, verify no mutation should have
    # succeeded with durability
    if self.nodes_init == self.num_replicas + 1:
        read_gen = doc_generator(self.key, 0, self.num_items)
        read_task = self.task.async_load_gen_docs(
            self.cluster, self.bucket, read_gen, "read", 0,
            batch_size=500, process_concurrency=1,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(read_task)

        # Any doc whose 'mutated' field is non-zero is listed as affected
        failed_keys = TableView(self.log.error)
        failed_keys.set_headers(["Key", "Error"])
        for doc_key, doc_info in read_task.success.items():
            mutated = json.loads(str(doc_info["value"]))["mutated"]
            if mutated != 0:
                failed_keys.add_row([doc_key, doc_info])

        failed_keys.display("Affected mutations:")
        self.log.error(read_task.fail)

    # SDK client for retrying AMBIGUOUS for unexpected keys
    sdk_client = SDKClient(RestConnection(self.cluster.master),
                           self.bucket)

    # Doc error validation
    for op_type in doc_gen.keys():
        task = tasks[op_type]

        # On a single node cluster every key of the generator range
        # is expected to be present in task.fail
        if self.nodes_init == 1 \
                and len(task.fail.keys()) != (doc_gen[op_type].end
                                              - doc_gen[op_type].start):
            self.log_failure(
                "Failed keys %d are less than expected %d"
                % (len(task.fail.keys()),
                   (doc_gen[op_type].end - doc_gen[op_type].start)))

        # Create table objects for display
        table_view = TableView(self.log.error)
        ambiguous_table_view = TableView(self.log.error)
        table_view.set_headers(["Key", "Exception"])
        ambiguous_table_view.set_headers(["Key", "vBucket"])

        # Iterate failed keys for validation
        for doc_key, doc_info in task.fail.items():
            # NOTE(review): called without self.vbuckets here, unlike the
            # other get_vbucket_num_for_key calls in this method - confirm
            # the default matches
            vb_for_key = self.bucket_util.get_vbucket_num_for_key(doc_key)

            if vb_for_key in target_nodes_vbuckets["active"]:
                expected_exception = \
                    self.durability_helper.EXCEPTIONS["request_timeout"]
            elif vb_for_key in target_nodes_vbuckets["replica"]:
                expected_exception = \
                    self.durability_helper.EXCEPTIONS["ambiguous"]
            else:
                # Key belongs to neither list: expect ambiguous and
                # retry the operation through the SDK client
                expected_exception = \
                    self.durability_helper.EXCEPTIONS["ambiguous"]
                ambiguous_table_view.add_row([doc_key, vb_for_key])
                retry_success = \
                    self.durability_helper.retry_for_ambiguous_exception(
                        sdk_client, op_type, doc_key, doc_info)
                if not retry_success:
                    self.log_failure("%s failed in retry for %s"
                                     % (op_type, doc_key))

            if expected_exception not in str(doc_info["error"]):
                table_view.add_row([doc_key, doc_info["error"]])

        # Display the tables (if any errors)
        table_view.display("Unexpected exception during %s" % op_type)
        ambiguous_table_view.display("Ambiguous exception during %s"
                                     % op_type)

    # Close the SDK connection
    sdk_client.close()

    # Verify doc count after expected CRUD failure
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.verify_stats_all_buckets(self.num_items)

    # Retry the same CRUDs after reverting the failure environment
    # NOTE(review): `tasks` is rebound from a dict to a list here, and the
    # generators of the non-read op_types were iterated to exhaustion
    # above - confirm they are reset before being reused
    tasks = list()
    for op_type in doc_gen.keys():
        tasks.append(
            self.task.async_load_gen_docs(self.cluster, self.bucket,
                                          doc_gen[op_type], op_type, 0,
                                          batch_size=10,
                                          process_concurrency=1,
                                          replicate_to=self.replicate_to,
                                          persist_to=self.persist_to,
                                          durability=self.durability_level,
                                          timeout_secs=self.sdk_timeout))

    # Wait for document_loader tasks to complete
    for task in tasks:
        self.task.jython_task_manager.get_task_result(task)
        if len(task.fail.keys()) != 0:
            self.log_failure(
                "Failures with no error condition: {0}, {1}".format(
                    task.fail, task.fail.keys()))

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.verify_stats_all_buckets(self.num_items)

    # Fetch latest stats and validate the values are updated
    for node in target_nodes:
        vb_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
            self.log_failure("vBucket seq_no stats not updated")

    # Disconnect the shell connection
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    self.validate_test_failure()
class volume(BaseTestCase): def setUp(self): self.input = TestInputSingleton.input self.input.test_params.update({"default_bucket": False}) BaseTestCase.setUp(self) self.rest = RestConnection(self.servers[0]) self.op_type = self.input.param("op_type", "create") self.available_servers = list() self.available_servers = self.cluster.servers[self.nodes_init:] self.num_buckets = self.input.param("num_buckets", 1) self.mutate = 0 self.doc_ops = self.input.param("doc_ops", None) if self.doc_ops: self.doc_ops = self.doc_ops.split(';') self.iterations = self.input.param("iterations", 2) self.vbucket_check = self.input.param("vbucket_check", True) self.new_num_writer_threads = self.input.param( "new_num_writer_threads", 6) self.new_num_reader_threads = self.input.param( "new_num_reader_threads", 8) self.create_perc = 100 self.update_perc = self.input.param("update_perc", 50) self.delete_perc = self.input.param("delete_perc", 50) self.expiry_perc = self.input.param("expiry_perc", 0) self.start = 0 self.end = 0 self.initial_items = self.start self.final_items = self.end self.create_end = 0 self.create_start = 0 self.update_end = 0 self.update_start = 0 self.delete_end = 0 self.delete_start = 0 self.expire_end = 0 self.expire_start = 0 self.num_collections = self.input.param("num_collections", 10) def create_required_buckets(self): self.log.info("Get the available memory quota") self.info = self.rest.get_nodes_self() threshold_memory = 100 # threshold_memory_vagrant = 100 total_memory_in_mb = self.info.mcdMemoryReserved total_available_memory_in_mb = total_memory_in_mb # If the mentioned service is already present, # we remove that much memory from available memory quota if "index" in self.info.services: total_available_memory_in_mb -= self.info.indexMemoryQuota if "fts" in self.info.services: total_available_memory_in_mb -= self.info.ftsMemoryQuota if "cbas" in self.info.services: total_available_memory_in_mb -= self.info.cbasMemoryQuota if "eventing" in self.info.services: 
total_available_memory_in_mb -= self.info.eventingMemoryQuota available_memory = total_available_memory_in_mb - threshold_memory self.rest.set_service_memoryQuota(service='memoryQuota', memoryQuota=available_memory) # Creating buckets for data loading purpose self.log.info("Create CB buckets") self.bucket_expiry = self.input.param("bucket_expiry", 0) ramQuota = self.input.param("ramQuota", available_memory) buckets = self.input.param("bucket_names", "GleamBookUsers").split(';') self.bucket_type = self.bucket_type.split(';') self.compression_mode = self.compression_mode.split(';') self.bucket_eviction_policy = self.bucket_eviction_policy for i in range(self.num_buckets): bucket = Bucket({ Bucket.name: buckets[i], Bucket.ramQuotaMB: ramQuota / self.num_buckets, Bucket.maxTTL: self.bucket_expiry, Bucket.replicaNumber: self.num_replicas, Bucket.storageBackend: self.bucket_storage, Bucket.evictionPolicy: self.bucket_eviction_policy, Bucket.bucketType: self.bucket_type[i], Bucket.compressionMode: self.compression_mode[i] }) self.bucket_util.create_bucket(bucket) # rebalance the new buckets across all nodes. 
self.log.info("Rebalance Starts") self.nodes = self.rest.node_statuses() self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[]) self.rest.monitorRebalance() return bucket def set_num_writer_and_reader_threads(self, num_writer_threads="default", num_reader_threads="default"): for node in self.cluster_util.get_kv_nodes(): bucket_helper = BucketHelper(node) bucket_helper.update_memcached_settings( num_writer_threads=num_writer_threads, num_reader_threads=num_reader_threads) def generate_docs(self, doc_ops=None): self.gen_delete = None self.gen_create = None self.gen_update = None self.gen_expiry = None self.create_end = 0 self.create_start = 0 self.update_end = 0 self.update_start = 0 self.delete_end = 0 self.delete_start = 0 self.expire_end = 0 self.expire_start = 0 self.initial_items = self.final_items if doc_ops is None: doc_ops = self.doc_ops if "update" in doc_ops: self.update_start = 0 self.update_end = self.num_items * self.update_perc / 100 self.mutate += 1 self.gen_update = doc_generator( "Users", self.update_start, self.update_end, doc_size=self.doc_size, doc_type=self.doc_type, target_vbucket=self.target_vbucket, vbuckets=self.cluster_util.vbuckets, key_size=self.key_size, randomize_doc_size=self.randomize_doc_size, randomize_value=self.randomize_value, mix_key_size=self.mix_key_size, mutate=self.mutate) if "delete" in doc_ops: self.delete_start = self.start self.delete_end = self.start + (self.num_items * self.delete_perc) / 100 self.gen_delete = doc_generator( "Users", self.delete_start, self.delete_end, doc_size=self.doc_size, doc_type=self.doc_type, target_vbucket=self.target_vbucket, vbuckets=self.cluster_util.vbuckets, key_size=self.key_size, randomize_doc_size=self.randomize_doc_size, randomize_value=self.randomize_value, mix_key_size=self.mix_key_size) self.final_items -= (self.delete_end - self.delete_start) * self.num_collections if "expiry" in doc_ops and self.maxttl: self.expire_start = self.start + (self.num_items * 
self.delete_perc) / 100 self.expire_end = self.start + self.num_items * ( self.delete_perc + self.expiry_perc) / 100 self.gen_expiry = doc_generator( "Users", self.expire_start, self.expire_end, doc_size=self.doc_size, doc_type=self.doc_type, target_vbucket=self.target_vbucket, vbuckets=self.cluster_util.vbuckets, key_size=self.key_size, randomize_doc_size=self.randomize_doc_size, randomize_value=self.randomize_value, mix_key_size=self.mix_key_size) self.final_items -= (self.expire_end - self.expire_start) * self.num_collections if "create" in doc_ops: self.start = self.end self.end += self.num_items * self.create_perc / 100 self.create_start = self.start self.create_end = self.end self.gen_create = doc_generator( "Users", self.start, self.end, doc_size=self.doc_size, doc_type=self.doc_type, target_vbucket=self.target_vbucket, vbuckets=self.cluster_util.vbuckets, key_size=self.key_size, randomize_doc_size=self.randomize_doc_size, randomize_value=self.randomize_value, mix_key_size=self.mix_key_size) self.final_items += (self.end - self.start) * self.num_collections def doc_loader(self, op_type, kv_gen, exp=0, scope=None, collection=None): if scope is None: scope = CbServer.default_scope if collection is None: collection = CbServer.default_collection retry_exceptions = [ SDKException.AmbiguousTimeoutException, SDKException.RequestCanceledException ] tasks_info = self.bucket_util._async_load_all_buckets( self.cluster, kv_gen, op_type, exp, batch_size=self.batch_size, process_concurrency=self.process_concurrency, persist_to=self.persist_to, replicate_to=self.replicate_to, durability=self.durability_level, pause_secs=5, timeout_secs=self.sdk_timeout, retries=self.sdk_retries, retry_exceptions=retry_exceptions, scope=scope, collection=collection) return tasks_info def data_load(self, scope=CbServer.default_scope, collections=[CbServer.default_scope]): tasks_info = dict() for collection in collections: if self.gen_update is not None: task_info = self.doc_loader("update", 
self.gen_update, scope=scope, collection=collection) tasks_info.update(task_info.items()) if self.gen_create is not None: task_info = self.doc_loader("create", self.gen_create, scope=scope, collection=collection) tasks_info.update(task_info.items()) if self.gen_delete is not None: task_info = self.doc_loader("delete", self.gen_delete, scope=scope, collection=collection) tasks_info.update(task_info.items()) if self.gen_expiry is not None and self.maxttl: task_info = self.doc_loader("update", self.gen_expiry, self.maxttl, scope=scope, collection=collection) tasks_info.update(task_info.items()) return tasks_info def data_validation(self, tasks_info, scope=CbServer.default_scope, collections=[CbServer.default_scope], check_docs=True): for task in tasks_info: self.task_manager.get_task_result(task) self.bucket_util.verify_doc_op_task_exceptions(tasks_info, self.cluster) self.bucket_util.log_doc_ops_task_failures(tasks_info) for task, task_info in tasks_info.items(): self.assertFalse( task_info["ops_failed"], "Doc ops failed for task: {}".format(task.thread_name)) if check_docs: self.log.info("Validating Active/Replica Docs") self.check_replica = False for bucket in self.bucket_util.buckets: tasks = list() for collection in collections: if self.gen_update is not None: tasks.append( self.task.async_validate_docs( self.cluster, bucket, self.gen_update, "update", 0, batch_size=self.batch_size, process_concurrency=self.process_concurrency, pause_secs=5, timeout_secs=self.sdk_timeout, check_replica=self.check_replica, scope=scope, collection=collection)) if self.gen_create is not None: tasks.append( self.task.async_validate_docs( self.cluster, bucket, self.gen_create, "create", 0, batch_size=self.batch_size, process_concurrency=self.process_concurrency, pause_secs=5, timeout_secs=self.sdk_timeout, check_replica=self.check_replica, scope=scope, collection=collection)) if self.gen_delete is not None: tasks.append( self.task.async_validate_docs( self.cluster, bucket, 
self.gen_delete, "delete", 0, batch_size=self.batch_size, process_concurrency=self.process_concurrency, pause_secs=5, timeout_secs=self.sdk_timeout, check_replica=self.check_replica, scope=scope, collection=collection)) if self.gen_expiry is not None: self.sleep( self.maxttl, "Wait for docs to expire until expiry time..") tasks.append( self.task.async_validate_docs( self.cluster, bucket, self.gen_expiry, "delete", 0, batch_size=self.batch_size, process_concurrency=self.process_concurrency, pause_secs=5, timeout_secs=self.sdk_timeout, check_replica=self.check_replica, scope=scope, collection=collection)) for task in tasks: self.task.jython_task_manager.get_task_result(task) self.bucket_util._wait_for_stats_all_buckets() # self.bucket_util.verify_stats_all_buckets(self.final_items) def get_bucket_dgm(self, bucket): self.rest_client = BucketHelper(self.cluster.master) dgm = self.rest_client.fetch_bucket_stats( bucket.name)["op"]["samples"]["vb_active_resident_items_ratio"][-1] self.log.info("Active Resident Threshold of {0} is {1}".format( bucket.name, dgm)) # Stopping and restarting the memcached process def stop_process(self): target_node = self.servers[2] remote = RemoteMachineShellConnection(target_node) error_sim = CouchbaseError(self.log, remote) error_to_simulate = "stop_memcached" # Induce the error condition error_sim.create(error_to_simulate) self.sleep(20, "Wait before reverting the error condition") # Revert the simulated error condition and close the ssh session error_sim.revert(error_to_simulate) remote.disconnect() def rebalance(self, nodes_in=0, nodes_out=0): servs_in = random.sample(self.available_servers, nodes_in) self.nodes_cluster = self.cluster.nodes_in_cluster[:] self.nodes_cluster.remove(self.cluster.master) servs_out = random.sample(self.nodes_cluster, nodes_out) if nodes_in == nodes_out: self.vbucket_check = False rebalance_task = self.task.async_rebalance( self.cluster.servers[:self.nodes_init], servs_in, servs_out, 
check_vbucket_shuffling=self.vbucket_check, retry_get_process_num=150) self.available_servers = [ servs for servs in self.available_servers if servs not in servs_in ] self.available_servers += servs_out self.cluster.nodes_in_cluster.extend(servs_in) self.cluster.nodes_in_cluster = list( set(self.cluster.nodes_in_cluster) - set(servs_out)) return rebalance_task def print_crud_stats(self): self.table = TableView(self.log.info) self.table.set_headers([ "Initial Items", "Current Items", "Items Updated", "Items Created", "Items Deleted", "Items Expired" ]) self.table.add_row([ str(self.initial_items), str(self.final_items), str(self.update_start) + "-" + str(self.update_end), str(self.create_start) + "-" + str(self.create_end), str(self.delete_start) + "-" + str(self.delete_end), str(self.expire_start) + "-" + str(self.expire_end) ]) self.table.display("Docs statistics") def Volume(self): ####################################################################### self.log.info("Step1: Create a n node cluster") if self.nodes_init > 1: nodes_init = self.cluster.servers[1:self.nodes_init] self.task.rebalance([self.cluster.master], nodes_init, []) self.cluster.nodes_in_cluster.extend([self.cluster.master] + nodes_init) ####################################################################### self.log.info("Step 2 & 3: Create required buckets.") self.bucket = self.create_required_buckets() self.loop = 0 scope_name = "VolumeScope" collection_prefix = "VolumeCollection" self.bucket_util.create_scope(self.cluster.master, self.bucket, {"name": scope_name}) for i in range(self.num_collections): collection_name = collection_prefix + str(i) self.log.info("Creating scope::collection '%s::%s'" % (scope_name, collection_name)) self.bucket_util.create_collection(self.cluster.master, self.bucket, scope_name, {"name": collection_name}) self.sleep(2) ####################################################################### while self.loop < self.iterations: self.log.info("Step 4: Pre-Requisites 
for Loading of docs") self.bucket_util.add_rbac_user() self.generate_docs(doc_ops="create") tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) for task in tasks_info: self.task.jython_task_manager.get_task_result(task) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) self.create_perc = self.input.param("create_perc", 100) ################################################################### self.log.info("Step 5: Rebalance in with Loading of docs") self.generate_docs(doc_ops="create") self.set_num_writer_and_reader_threads( num_writer_threads="disk_io_optimized", num_reader_threads="disk_io_optimized") rebalance_task = self.rebalance(nodes_in=1, nodes_out=0) tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.set_num_writer_and_reader_threads( num_writer_threads=self.new_num_writer_threads, num_reader_threads=self.new_num_reader_threads) self.task.jython_task_manager.get_task_result(rebalance_task) self.assertTrue(rebalance_task.result, "Rebalance Failed") self.data_validation( tasks_info, scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) ################################################################### self.log.info("Step 6: Rebalance Out with Loading of docs") self.generate_docs() self.set_num_writer_and_reader_threads( num_writer_threads="disk_io_optimized", num_reader_threads="disk_io_optimized") rebalance_task = self.rebalance(nodes_in=0, nodes_out=1) tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.set_num_writer_and_reader_threads( num_writer_threads=self.new_num_writer_threads, num_reader_threads=self.new_num_reader_threads) self.task.jython_task_manager.get_task_result(rebalance_task) 
self.assertTrue(rebalance_task.result, "Rebalance Failed") self.data_validation( tasks_info, scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) ################################################################### self.log.info("Step 7: Rebalance In_Out with Loading of docs") self.generate_docs() self.set_num_writer_and_reader_threads( num_writer_threads="disk_io_optimized", num_reader_threads="disk_io_optimized") rebalance_task = self.rebalance(nodes_in=2, nodes_out=1) tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.set_num_writer_and_reader_threads( num_writer_threads=self.new_num_writer_threads, num_reader_threads=self.new_num_reader_threads) self.task.jython_task_manager.get_task_result(rebalance_task) self.assertTrue(rebalance_task.result, "Rebalance Failed") self.data_validation( tasks_info, scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) ################################################################### self.log.info("Step 8: Swap with Loading of docs") self.generate_docs() self.set_num_writer_and_reader_threads( num_writer_threads="disk_io_optimized", num_reader_threads="disk_io_optimized") rebalance_task = self.rebalance(nodes_in=1, nodes_out=1) tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.set_num_writer_and_reader_threads( num_writer_threads=self.new_num_writer_threads, num_reader_threads=self.new_num_reader_threads) self.task.jython_task_manager.get_task_result(rebalance_task) self.assertTrue(rebalance_task.result, "Rebalance Failed") self.data_validation( tasks_info, scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) 
self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) ################################################################### self.log.info("Step 9: Updating the bucket replica to 2") bucket_helper = BucketHelper(self.cluster.master) for i in range(len(self.bucket_util.buckets)): bucket_helper.change_bucket_props(self.bucket_util.buckets[i], replicaNumber=2) self.generate_docs() self.set_num_writer_and_reader_threads( num_writer_threads="disk_io_optimized", num_reader_threads="disk_io_optimized") rebalance_task = self.rebalance(nodes_in=1, nodes_out=0) tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.set_num_writer_and_reader_threads( num_writer_threads=self.new_num_writer_threads, num_reader_threads=self.new_num_reader_threads) self.task.jython_task_manager.get_task_result(rebalance_task) self.assertTrue(rebalance_task.result, "Rebalance Failed") self.data_validation( tasks_info, scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) ################################################################### self.log.info("Step 10: Stopping and restarting memcached process") self.generate_docs() self.set_num_writer_and_reader_threads( num_writer_threads=self.new_num_writer_threads, num_reader_threads=self.new_num_reader_threads) rebalance_task = self.task.async_rebalance(self.cluster.servers, [], []) tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.set_num_writer_and_reader_threads( num_writer_threads="disk_io_optimized", num_reader_threads="disk_io_optimized") self.task.jython_task_manager.get_task_result(rebalance_task) self.assertTrue(rebalance_task.result, "Rebalance Failed") self.stop_process() self.data_validation( tasks_info, scope=scope_name, 
collections=self.bucket.scopes[scope_name].collections.keys()) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) ################################################################### self.log.info( "Step 11: Failover a node and RebalanceOut that node \ with loading in parallel") self.std_vbucket_dist = self.input.param("std_vbucket_dist", None) std = self.std_vbucket_dist or 1.0 prev_failover_stats = self.bucket_util.get_failovers_logs( self.cluster.nodes_in_cluster, self.bucket_util.buckets) disk_replica_dataset, disk_active_dataset = self.bucket_util.\ get_and_compare_active_replica_data_set_all( self.cluster.nodes_in_cluster, self.bucket_util.buckets, path=None) self.rest = RestConnection(self.cluster.master) self.nodes = self.cluster_util.get_nodes(self.cluster.master) self.chosen = self.cluster_util.pick_nodes(self.cluster.master, howmany=1) # Mark Node for failover self.generate_docs() tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.success_failed_over = self.rest.fail_over(self.chosen[0].id, graceful=True) self.sleep(10) self.rest.monitorRebalance() self.nodes = self.rest.node_statuses() self.set_num_writer_and_reader_threads( num_writer_threads=self.new_num_writer_threads, num_reader_threads=self.new_num_reader_threads) self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[self.chosen[0].id]) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Rebalance failed") servs_out = [ node for node in self.cluster.servers if node.ip == self.chosen[0].ip ] self.cluster.nodes_in_cluster = list( set(self.cluster.nodes_in_cluster) - set(servs_out)) self.available_servers += servs_out self.data_validation( tasks_info, scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.bucket_util.compare_failovers_logs( prev_failover_stats, self.cluster.nodes_in_cluster, self.bucket_util.buckets) 
self.bucket_util.data_analysis_active_replica_all( disk_active_dataset, disk_replica_dataset, self.cluster.servers[:self.nodes_in + self.nodes_init], self.bucket_util.buckets, path=None) nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master) self.bucket_util.vb_distribution_analysis( servers=nodes, buckets=self.bucket_util.buckets, num_replicas=2, std=std, total_vbuckets=self.cluster_util.vbuckets) rebalance_task = self.rebalance(nodes_in=1, nodes_out=0) self.task.jython_task_manager.get_task_result(rebalance_task) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) ################################################################### self.log.info("Step 12: Failover a node and FullRecovery\ that node") self.std_vbucket_dist = self.input.param("std_vbucket_dist", None) std = self.std_vbucket_dist or 1.0 prev_failover_stats = self.bucket_util.get_failovers_logs( self.cluster.nodes_in_cluster, self.bucket_util.buckets) disk_replica_dataset, disk_active_dataset = self.bucket_util.\ get_and_compare_active_replica_data_set_all( self.cluster.nodes_in_cluster, self.bucket_util.buckets, path=None) self.rest = RestConnection(self.cluster.master) self.nodes = self.cluster_util.get_nodes(self.cluster.master) self.chosen = self.cluster_util.pick_nodes(self.cluster.master, howmany=1) self.generate_docs() tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) # Mark Node for failover self.success_failed_over = self.rest.fail_over(self.chosen[0].id, graceful=True) self.sleep(10) self.rest.monitorRebalance() # Mark Node for full recovery if self.success_failed_over: self.rest.set_recovery_type(otpNode=self.chosen[0].id, recoveryType="full") self.set_num_writer_and_reader_threads( num_writer_threads=self.new_num_writer_threads, num_reader_threads=self.new_num_reader_threads) rebalance_task = self.task.async_rebalance( self.cluster.servers[:self.nodes_init], [], []) 
self.set_num_writer_and_reader_threads( num_writer_threads="disk_io_optimized", num_reader_threads="disk_io_optimized") self.task.jython_task_manager.get_task_result(rebalance_task) self.assertTrue(rebalance_task.result, "Rebalance Failed") self.data_validation( tasks_info, scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.bucket_util.compare_failovers_logs( prev_failover_stats, self.cluster.nodes_in_cluster, self.bucket_util.buckets) self.bucket_util.data_analysis_active_replica_all( disk_active_dataset, disk_replica_dataset, self.cluster.servers[:self.nodes_in + self.nodes_init], self.bucket_util.buckets, path=None) nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master) self.bucket_util.vb_distribution_analysis( servers=nodes, buckets=self.bucket_util.buckets, num_replicas=2, std=std, total_vbuckets=self.cluster_util.vbuckets) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) ################################################################### self.log.info("Step 13: Failover a node and DeltaRecovery that \ node with loading in parallel") self.std_vbucket_dist = self.input.param("std_vbucket_dist", None) std = self.std_vbucket_dist or 1.0 prev_failover_stats = self.bucket_util.get_failovers_logs( self.cluster.nodes_in_cluster, self.bucket_util.buckets) disk_replica_dataset, disk_active_dataset = self.bucket_util.\ get_and_compare_active_replica_data_set_all( self.cluster.nodes_in_cluster, self.bucket_util.buckets, path=None) self.rest = RestConnection(self.cluster.master) self.nodes = self.cluster_util.get_nodes(self.cluster.master) self.chosen = self.cluster_util.pick_nodes(self.cluster.master, howmany=1) self.generate_docs() tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) # Mark Node for failover self.success_failed_over = self.rest.fail_over(self.chosen[0].id, graceful=True) self.sleep(10) 
self.rest.monitorRebalance() if self.success_failed_over: self.rest.set_recovery_type(otpNode=self.chosen[0].id, recoveryType="delta") self.set_num_writer_and_reader_threads( num_writer_threads=self.new_num_writer_threads, num_reader_threads=self.new_num_reader_threads) rebalance_task = self.task.async_rebalance( self.cluster.servers[:self.nodes_init], [], []) self.set_num_writer_and_reader_threads( num_writer_threads="disk_io_optimized", num_reader_threads="disk_io_optimized") self.task.jython_task_manager.get_task_result(rebalance_task) self.assertTrue(rebalance_task.result, "Rebalance Failed") self.data_validation( tasks_info, scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.bucket_util.compare_failovers_logs( prev_failover_stats, self.cluster.nodes_in_cluster, self.bucket_util.buckets) self.bucket_util.data_analysis_active_replica_all( disk_active_dataset, disk_replica_dataset, self.cluster.servers[:self.nodes_in + self.nodes_init], self.bucket_util.buckets, path=None) nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master) self.bucket_util.vb_distribution_analysis( servers=nodes, buckets=self.bucket_util.buckets, num_replicas=2, std=std, total_vbuckets=self.cluster_util.vbuckets) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) ####################################################################### self.log.info("Step 14: Updating the bucket replica to 1") bucket_helper = BucketHelper(self.cluster.master) for i in range(len(self.bucket_util.buckets)): bucket_helper.change_bucket_props(self.bucket_util.buckets[i], replicaNumber=1) self.generate_docs() self.set_num_writer_and_reader_threads( num_writer_threads=self.new_num_writer_threads, num_reader_threads=self.new_num_reader_threads) rebalance_task = self.task.async_rebalance(self.cluster.servers, [], []) tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) 
self.set_num_writer_and_reader_threads( num_writer_threads="disk_io_optimized", num_reader_threads="disk_io_optimized") self.task.jython_task_manager.get_task_result(rebalance_task) self.assertTrue(rebalance_task.result, "Rebalance Failed") self.data_validation( tasks_info, scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) ####################################################################### self.log.info("Step 15: Flush the bucket and \ start the entire process again") self.loop += 1 if self.loop < self.iterations: # Flush the bucket self.bucket_util.flush_all_buckets(self.cluster.master) self.sleep(10) if len(self.cluster.nodes_in_cluster) > self.nodes_init: nodes_cluster = self.cluster.nodes_in_cluster[:] nodes_cluster.remove(self.cluster.master) servs_out = random.sample( nodes_cluster, int( len(self.cluster.nodes_in_cluster) - self.nodes_init)) rebalance_task = self.task.async_rebalance( self.cluster.servers[:self.nodes_init], [], servs_out) self.task.jython_task_manager.get_task_result( rebalance_task) self.available_servers += servs_out self.cluster.nodes_in_cluster = list( set(self.cluster.nodes_in_cluster) - set(servs_out)) self.get_bucket_dgm(self.bucket) else: self.log.info("Volume Test Run Complete") self.get_bucket_dgm(self.bucket) def SteadyStateVolume(self): ####################################################################### self.log.info("Step 1: Create a n node cluster") if self.nodes_init > 1: nodes_init = self.cluster.servers[1:self.nodes_init] self.task.rebalance([self.cluster.master], nodes_init, []) self.cluster.nodes_in_cluster.extend([self.cluster.master] + nodes_init) ####################################################################### self.log.info("Step 2: Create required buckets.") self.bucket = self.create_required_buckets() self.loop = 0 scope_name = "VolumeScope" collection_prefix = "VolumeCollection" 
self.bucket_util.create_scope(self.cluster.master, self.bucket, {"name": scope_name}) for i in range(self.num_collections): collection_name = collection_prefix + str(i) self.log.info("Creating scope::collection '%s::%s'" % (scope_name, collection_name)) self.bucket_util.create_collection(self.cluster.master, self.bucket, scope_name, {"name": collection_name}) self.sleep(2) ####################################################################### self.log.info("Step 3: Per-Requisites for Loading of docs") self.create_perc = 100 _iter = 0 while _iter < 2: self.generate_docs(doc_ops="create") tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.data_validation(tasks_info, check_docs=False) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) _iter += 1 _iter = 0 self.update_perc = 100 while _iter < 10: self.generate_docs(doc_ops="update") tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.data_validation( tasks_info, scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.bucket_util.print_bucket_stats() self.print_crud_stats() self.get_bucket_dgm(self.bucket) _iter += 1 for i in range(1, self.num_collections, 2): collection_name = collection_prefix + str(i) self.bucket_util.drop_collection(self.cluster.master, self.bucket, scope_name, collection_name) self.bucket.scopes[scope_name].collections.pop(collection_name) self.update_perc = self.input.param("update_perc", 100) self.create_perc = self.input.param("create_perc", 100) _iter = 0 while _iter < 10: self.generate_docs() tasks_info = self.data_load( scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.data_validation( tasks_info, scope=scope_name, collections=self.bucket.scopes[scope_name].collections.keys()) self.bucket_util.print_bucket_stats() self.print_crud_stats() 
self.get_bucket_dgm(self.bucket) _iter += 1
def test_basic_ops(self):
    """
    Basic test for Sub-doc CRUD operations

    Inserts sub-documents into `self.num_items` docs of the first bucket,
    then runs the sub-doc operation named by the `op_type` test param
    (upsert or remove) over the first half of those docs.

    After every stage the expected counters kept in `verification_dict`
    are compared against the cluster through
    `durability_helper.verify_vbucket_details_stats`, and the mutated
    docs are read back and validated value by value.

    The test fails when a stat verification fails or when any read-back
    value differs from the expected post-mutation value.
    """
    doc_op = self.input.param("op_type", None)
    def_bucket = self.bucket_util.buckets[0]
    supported_d_levels = self.bucket_util.get_supported_durability_levels()
    is_sync_write = self.durability_level in supported_d_levels

    # Stat validation reference variables
    verification_dict = dict()
    verification_dict["ops_create"] = self.num_items
    verification_dict["ops_update"] = 0
    verification_dict["ops_delete"] = 0
    verification_dict["rollback_item_count"] = 0
    verification_dict["sync_write_aborted_count"] = 0
    verification_dict["sync_write_committed_count"] = 0
    if is_sync_write:
        verification_dict["sync_write_committed_count"] += self.num_items

    def verify_vb_details_stats():
        # Abort the test as soon as the stats disagree with the
        # current content of verification_dict
        failed = self.durability_helper.verify_vbucket_details_stats(
            def_bucket, self.cluster_util.get_kv_nodes(),
            vbuckets=self.cluster_util.vbuckets,
            expected_val=verification_dict)
        if failed:
            self.fail("Cbstat vbucket-details verification failed")

    # Initial validation
    verify_vb_details_stats()

    # Generators expect a list of vbuckets, so wrap a single value
    if self.target_vbucket and not isinstance(self.target_vbucket, list):
        self.target_vbucket = [self.target_vbucket]

    self.log.info("Creating doc_generator..")
    # Load basic docs into bucket
    doc_create = sub_doc_generator(
        self.key, 0, self.num_items,
        key_size=self.key_size,
        doc_size=self.sub_doc_size,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster_util.vbuckets)
    self.log.info("Loading {0} docs into the bucket: {1}".format(
        self.num_items, def_bucket))
    task = self.task.async_load_gen_sub_docs(
        self.cluster, def_bucket, doc_create,
        DocLoading.Bucket.SubDocOps.INSERT, self.maxttl,
        path_create=True,
        batch_size=10, process_concurrency=8,
        replicate_to=self.replicate_to, persist_to=self.persist_to,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout)
    self.task.jython_task_manager.get_task_result(task)

    self.log.info("Wait for ep_all_items_remaining to become '0'")
    self.bucket_util._wait_for_stats_all_buckets()

    # Update verification_dict and validate
    verification_dict["ops_update"] += self.num_items
    if is_sync_write:
        verification_dict["sync_write_committed_count"] += self.num_items
    verify_vb_details_stats()

    # Verify initial doc load count
    self.log.info("Validating doc_count in buckets")
    self.bucket_util.verify_stats_all_buckets(self.num_items)

    self.log.info("Creating doc_generator for doc_op")
    num_item_start_for_crud = int(self.num_items / 2)

    template_index = 0
    if doc_op == DocLoading.Bucket.SubDocOps.REMOVE:
        template_index = 2

    sub_doc_gen = sub_doc_generator_for_edit(
        self.key,
        start=0,
        end=num_item_start_for_crud,
        key_size=self.key_size,
        template_index=template_index)

    def mutate_and_read_back(exp, **mutation_kwargs):
        # Runs `doc_op` over sub_doc_gen, accounts for it in
        # verification_dict and returns the task that read the docs back
        mutation_task = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, sub_doc_gen, doc_op, exp,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            **mutation_kwargs)
        self.task.jython_task_manager.get_task_result(mutation_task)

        # Keys whose mutation failed must be excluded from the expected
        # counters instead of being added on top of them
        mutation_failures = len(mutation_task.fail.keys())
        verification_dict["ops_update"] += \
            (sub_doc_gen.end - sub_doc_gen.start - mutation_failures)
        if is_sync_write:
            verification_dict["sync_write_committed_count"] += \
                num_item_start_for_crud - mutation_failures

        # Edit doc_gen template to read the mutated value as well
        sub_doc_gen.template = \
            sub_doc_gen.template.replace(" }}", ", \"mutated\": \"\" }}")

        # Read all the values to validate the mutation
        read_task = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, sub_doc_gen, "read", 0,
            batch_size=100, process_concurrency=8,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(read_task)
        return read_task

    if doc_op == DocLoading.Bucket.SubDocOps.UPSERT:
        self.log.info("Performing 'upsert' mutation over the sub-docs")
        task = mutate_and_read_back(self.maxttl, path_create=True)

        op_failed_tbl = TableView(self.log.error)
        op_failed_tbl.set_headers(["Update failed key", "Value"])
        for key, value in task.success.items():
            doc_value = value["value"]
            # Short-circuit keeps the JSON parse for rows that passed
            # all of the cheaper comparisons
            if doc_value[0] != 2 \
                    or doc_value[1] != "LastNameUpdate" \
                    or doc_value[2] != "TypeChange" \
                    or doc_value[3] != "CityUpdate" \
                    or json.loads(str(doc_value[4])) != ["get", "up"]:
                op_failed_tbl.add_row([key, doc_value])

        op_failed_tbl.display("Update failed for keys:")
        if op_failed_tbl.rows:
            self.fail("Update failed for few keys")
    elif doc_op == DocLoading.Bucket.SubDocOps.REMOVE:
        self.log.info("Performing 'remove' mutation over the sub-docs")
        task = mutate_and_read_back(0)

        op_failed_tbl = TableView(self.log.error)
        op_failed_tbl.set_headers(["Delete failed key", "Value"])

        for key, value in task.success.items():
            doc_value = value["value"]
            # Report each offending key once, however many of its
            # paths are still present
            if doc_value[0] != 2 \
                    or any(doc_value[index] != "PATH_NOT_FOUND"
                           for index in range(1, len(doc_value))):
                op_failed_tbl.add_row([key, doc_value])

        # Keys that could not be read back at all
        for key, value in task.fail.items():
            op_failed_tbl.add_row([key, value["value"]])

        op_failed_tbl.display("Delete failed for keys:")
        if op_failed_tbl.rows:
            self.fail("Delete failed for few keys")
    else:
        self.log.warning("Unsupported doc_operation")

    self.log.info("Wait for ep_all_items_remaining to become '0'")
    self.bucket_util._wait_for_stats_all_buckets()

    # Validate the stats against the final verification_dict
    verify_vb_details_stats()

    self.log.info("Validating doc_count")
    self.bucket_util.verify_stats_all_buckets(self.num_items)
def test_basic_ops(self):
    """
    Sanity test for sub-document CRUD operations.

    Sub-docs are inserted for `self.num_items` keys. Based on the
    `op_type` test param, the first half of those keys then goes through
    an 'upsert' or a 'remove' sub-doc mutation and the mutated paths are
    read back to confirm the outcome. The cbstat vbucket-details counters
    and the bucket doc count are validated along the way.
    """
    sub_doc_ops = DocLoading.Bucket.SubDocOps
    op_type = self.input.param("op_type", None)
    bucket = self.cluster.buckets[0]

    # Expected values for the cbstat vbucket-details validation
    expected_stats = {
        "ops_create": self.num_items,
        "ops_update": 0,
        "ops_delete": 0,
        "rollback_item_count": 0,
        "sync_write_aborted_count": 0,
        "sync_write_committed_count": 0,
    }
    if self.is_sync_write_enabled:
        expected_stats["sync_write_committed_count"] += self.num_items

    def check_vb_details_stats():
        # Fail the test when cbstats disagree with `expected_stats`
        mismatch = self.durability_helper.verify_vbucket_details_stats(
            bucket, self.cluster_util.get_kv_nodes(self.cluster),
            vbuckets=self.cluster.vbuckets,
            expected_val=expected_stats)
        if mismatch:
            self.fail("Cbstat vbucket-details verification failed")

    def wait_for_queues_to_drain():
        self.log.info("Wait for ep_all_items_remaining to become '0'")
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)

    def run_mutation(generator, sub_doc_op, ttl, **extra_opts):
        # Start a sub-doc load with the test's durability settings
        # and block until the task completes
        load_task = self.task.async_load_gen_sub_docs(
            self.cluster, bucket, generator, sub_doc_op, ttl,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            **extra_opts)
        self.task.jython_task_manager.get_task_result(load_task)
        return load_task

    def read_mutated_paths(generator):
        # Edit doc_gen template to read the mutated value as well
        generator.template = \
            generator.template.replace(" }}", ", \"mutated\": \"\" }}")
        # Read all the values to validate the mutation
        read_task = self.task.async_load_gen_sub_docs(
            self.cluster, bucket, generator, "read", 0,
            batch_size=100, process_concurrency=8,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(read_task)
        return read_task

    def new_failure_table(key_header):
        tbl = TableView(self.log.error)
        tbl.set_headers([key_header, "Value"])
        return tbl

    # Validate the stats before any sub-doc load
    check_vb_details_stats()

    if self.target_vbucket and type(self.target_vbucket) is not list:
        self.target_vbucket = [self.target_vbucket]

    self.log.info("Creating doc_generator..")
    # Generator covering all `self.num_items` keys
    insert_gen = sub_doc_generator(
        self.key, 0, self.num_items,
        key_size=self.key_size,
        doc_size=self.sub_doc_size,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster.vbuckets)
    self.log.info("Loading {0} docs into the bucket: {1}"
                  .format(self.num_items, bucket))
    insert_task = run_mutation(insert_gen, sub_doc_ops.INSERT,
                               self.maxttl, path_create=True)
    wait_for_queues_to_drain()

    # Keys for which the sub-doc insert did not go through
    failed_inserts = len(insert_task.fail.keys())
    inserted = self.num_items - failed_inserts

    # Account for the inserts in the expected stats and validate
    expected_stats["ops_update"] += inserted
    if self.is_sync_write_enabled:
        expected_stats["sync_write_committed_count"] += inserted
        expected_stats["sync_write_aborted_count"] += failed_inserts
    check_vb_details_stats()

    # Verify initial doc load count
    self.log.info("Validating doc_count in buckets")
    self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)

    self.log.info("Creating doc_generator for doc_op")
    crud_item_count = int(self.num_items / 2)
    edit_gen = sub_doc_generator_for_edit(
        self.key,
        start=0,
        end=crud_item_count,
        key_size=self.key_size,
        template_index=2 if op_type == sub_doc_ops.REMOVE else 0)

    if op_type == sub_doc_ops.UPSERT:
        self.log.info("Performing 'upsert' mutation over the sub-docs")
        upsert_task = run_mutation(edit_gen, op_type, self.maxttl,
                                   path_create=True)

        # Keys for which the upsert failed
        failed_upserts = len(upsert_task.fail.keys())
        upserted = crud_item_count - failed_upserts
        expected_stats["ops_update"] += upserted
        if self.is_sync_write_enabled:
            expected_stats["sync_write_committed_count"] += upserted

        read_task = read_mutated_paths(edit_gen)

        # Values every successfully upserted doc should return on read
        expected_values = {'StateUpdate', 2, 'LastNameUpdate',
                           'TypeChange', 'CityUpdate', 'FirstNameUpdate'}

        mismatch_tbl = new_failure_table("Update failed key")
        # Any doc whose read values differ from the expected set
        # is recorded as a failed update
        for doc_key, result in read_task.success.items():
            read_value = result["value"]
            if set(read_value) != expected_values:
                mismatch_tbl.add_row([doc_key, read_value])
        mismatch_tbl.display("Update failed for keys:")

        # Docs that were not updated must match the upsert failures
        self.assertEqual(len(mismatch_tbl.rows), failed_upserts, "")
    elif op_type == sub_doc_ops.REMOVE:
        self.log.info("Performing 'remove' mutation over the sub-docs")
        remove_task = run_mutation(edit_gen, op_type, 0)

        # Keys for which the remove failed
        failed_removes = len(remove_task.fail.keys())
        removed = crud_item_count - failed_removes
        expected_stats["ops_update"] += removed
        if self.is_sync_write_enabled:
            expected_stats["sync_write_committed_count"] += removed

        read_task = read_mutated_paths(edit_gen)

        removed_tbl = new_failure_table("Delete failed key")
        # Gather every read that failed
        for doc_key, result in read_task.fail.items():
            removed_tbl.add_row([doc_key, result["error"]])
        removed_tbl.display("Delete succeeded for keys:")

        # A failed read means the sub-doc path is no longer accessible,
        # so one failed read is expected per targeted key
        self.assertEqual(len(removed_tbl.rows), crud_item_count,
                         "Delete failed for few keys")
    else:
        self.log.warning("Unsupported doc_operation")

    wait_for_queues_to_drain()

    # Final validation against the accumulated expected stats
    check_vb_details_stats()

    self.log.info("Validating doc_count")
    self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)
def test_timeout_with_crud_failures(self):
    """
    Test to make sure timeout is handled in durability calls
    and no documents are loaded when durability cannot be met using
    error simulation in server node side

    This will validate failure in majority of nodes, where durability will
    surely fail for all CRUDs

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify no operations succeeds
    4. Revert the error scenario from the cluster to resume durability
    5. Validate all mutations are succeeded after reverting
       the error condition

    Note: self.sdk_timeout values is considered as 'seconds'

    Test attributes overridden by this method: `crud_batch_size` (5),
    `key` ("test_collections"), `sdk_timeout` (3), `collection_name`
    and, when not the default scope, `scope_name`.
    """

    # Local methods to validate vb_seqno
    def compare_vb_stat(stat_1, stat_2, vb, comparison="!="):
        """
        Compare "high_seqno" and "high_completed_seqno" of `vb` between
        the two vbucket-seqno stat dicts.

        A UUID mismatch for the vb is recorded through self.log_failure
        but does not affect the returned value.

        :param stat_1: Stat dict keyed by vb number (string)
        :param stat_2: Stat dict to compare against stat_1
        :param vb: vBucket number (as used for keys in stat_1 / stat_2)
        :param comparison: "!=" flags stats that differ,
                           any other value flags stats that are equal
        :return result: False if any checked stat got flagged,
                        True otherwise (also when vb is not in stat_1)
        """
        keys_to_check = ["high_seqno", "high_completed_seqno"]
        result = True
        for key in keys_to_check:
            if vb in stat_1.keys():
                if stat_1[vb]["uuid"] != stat_2[vb]["uuid"]:
                    self.log_failure(
                        "Mismatch in vb-%s UUID. %s != %s"
                        % (vb, stat_1[vb]["uuid"], stat_2[vb]["uuid"]))
                if comparison == "!=":
                    # Caller expects the stat to be unchanged
                    if stat_1[vb][key] != stat_2[vb][key]:
                        result = False
                        self.log.warning(
                            "Mismatch in vb-%s stat %s. %s != %s"
                            % (vb, key, stat_1[vb][key], stat_2[vb][key]))
                # Caller expects the stat to have moved
                elif stat_1[vb][key] == stat_2[vb][key]:
                    result = False
                    self.log.warning(
                        "Stat not updated for vb-%s stat %s. "
                        "%s == %s"
                        % (vb, key, stat_1[vb][key], stat_2[vb][key]))
        return result

    def validate_vb_seqno_stats():
        """
        Fetch the current vbucket-seqno stats of `node` into
        vb_info["post_timeout"] and compare them with vb_info["init"].

        Note: `node` is not a parameter. It is read from the enclosing
        scope, so this must be called from within a
        `for node in target_nodes` loop.

        :return retry_validation: Boolean denoting to retry validation
        """
        retry_validation = False
        vb_info["post_timeout"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        for tem_vb_num in range(self.cluster_util.vbuckets):
            # Stat dicts and affected_vbs hold vb numbers as strings
            tem_vb_num = str(tem_vb_num)
            if tem_vb_num not in affected_vbs:
                # vBuckets not targeted by any CRUD must be unchanged
                if compare_vb_stat(vb_info["init"][node.ip],
                                   vb_info["post_timeout"][node.ip],
                                   tem_vb_num) is False:
                    self.log_failure("Unaffected vb-%s stat" % tem_vb_num)
            elif int(tem_vb_num) in target_nodes_vbuckets["active"]:
                # Only a warning is logged for active vbs
                if compare_vb_stat(vb_info["init"][node.ip],
                                   vb_info["post_timeout"][node.ip],
                                   tem_vb_num) is False:
                    self.log.warning("%s - mismatch in %s vb-%s seq_no"
                                     % (node.ip, "active", tem_vb_num))
            elif int(tem_vb_num) in target_nodes_vbuckets["replica"]:
                # Replica stats that are still equal trigger a retry
                if compare_vb_stat(vb_info["init"][node.ip],
                                   vb_info["post_timeout"][node.ip],
                                   tem_vb_num, comparison="==") is False:
                    retry_validation = True
                    self.log.warning("%s - mismatch in %s vb-%s seq_no"
                                     % (node.ip, "replica", tem_vb_num))
        return retry_validation

    # Per-node helper objects, keyed by node.ip
    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    # vBucket numbers hosted by the target nodes, by vbucket type
    target_nodes_vbuckets = dict()
    # vbucket-seqno snapshots, keyed by stage name and then by node.ip
    vb_info = dict()
    # Load tasks and doc generators, keyed by operation type
    tasks = dict()
    doc_gen = dict()
    # vBucket numbers (as strings) of the keys used by the generators
    affected_vbs = list()

    target_nodes_vbuckets["active"] = []
    target_nodes_vbuckets["replica"] = []
    vb_info["init"] = dict()
    vb_info["post_timeout"] = dict()
    vb_info["afterCrud"] = dict()

    # Override crud_batch_size to minimum value for testing
    self.crud_batch_size = 5
    self.key = "test_collections"
    self.sdk_timeout = 3

    # Select target vbucket type to load_docs
    target_vb_type = "replica"
    if self.simulate_error == CouchbaseError.STOP_PERSISTENCE \
            and self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
        target_vb_type = "active"

    # Create required scope/collection for successful CRUD operation
    if self.scope_name != CbServer.default_scope:
        self.scope_name = self.bucket_util.get_random_name()
    self.collection_name = self.bucket_util.get_random_name()
    self.log.info("Creating scope::collection %s::%s"
                  % (self.scope_name, self.collection_name))
    self.create_scope_collection()

    # Load docs into created collection
    self.log.info("Loading data into created collection")
    load_gen = doc_generator(self.key, 0, self.num_items)
    task = self.task.async_load_gen_docs(
        self.cluster, self.bucket, load_gen, "create", 0,
        scope=self.scope_name,
        collection=self.collection_name,
        sdk_client_pool=self.sdk_client_pool,
        batch_size=200, process_concurrency=8,
        timeout_secs=60)
    self.task_manager.get_task_result(task)
    if self.subdoc_test:
        # NOTE(review): every other generator below wraps the division
        # in int(); this one passes `self.num_items / 2` as is.
        # Confirm sub_doc_generator accepts a float end value.
        load_gen = sub_doc_generator(self.key, 0, self.num_items / 2)
        task = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, load_gen,
            Bucket_Op.SubDocOps.INSERT,
            timeout_secs=self.sdk_timeout,
            compression=self.sdk_compression,
            path_create=True,
            batch_size=100,
            process_concurrency=8,
            durability=self.durability_level,
            scope=self.scope_name, collection=self.collection_name,
            sdk_client_pool=self.sdk_client_pool)
        self.task_manager.get_task_result(task)

    # Record the loaded item count on the collection object
    self.bucket.scopes[self.scope_name].collections[
        self.collection_name].num_items = self.num_items

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    # Open a shell per target node, collect the vbuckets it hosts and
    # snapshot the initial seqno stats
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
        target_nodes_vbuckets["active"] += \
            cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                             vbucket_type="active")
        target_nodes_vbuckets["replica"] += \
            cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                             vbucket_type="replica")
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])

    # NOTE(review): the reference time is captured before the error is
    # induced and before the sleep(5) below, while sdk_timeout was set
    # to 3 above. If self.sleep really blocks for 5 seconds, the
    # "Timed-out before expected time" check further down can never
    # trigger - confirm intent.
    curr_time = int(time.time())
    expected_timeout = curr_time + self.sdk_timeout

    # Target only the vbuckets that are exclusively of the selected type
    # across the target nodes
    if target_vb_type == "active":
        target_vbs = list(
            set(target_nodes_vbuckets[target_vb_type]).difference(
                set(target_nodes_vbuckets["replica"])))
    else:
        target_vbs = list(
            set(target_nodes_vbuckets[target_vb_type]).difference(
                set(target_nodes_vbuckets["active"])))

    # Create required doc_generators
    doc_gen["create"] = doc_generator(self.key, self.num_items,
                                      self.crud_batch_size,
                                      target_vbucket=target_vbs)
    doc_gen["delete"] = doc_generator(self.key, 0,
                                      self.crud_batch_size,
                                      target_vbucket=target_vbs)
    doc_gen["read"] = doc_generator(self.key, int(self.num_items / 3),
                                    self.crud_batch_size,
                                    target_vbucket=target_vbs)
    doc_gen["update"] = doc_generator(self.key, int(self.num_items / 2),
                                      self.crud_batch_size,
                                      target_vbucket=target_vbs)

    # Create required subdoc generators
    doc_gen["insert"] = sub_doc_generator(self.key,
                                          int(self.num_items / 2),
                                          self.crud_batch_size,
                                          target_vbucket=target_vbs)
    doc_gen["upsert"] = sub_doc_generator_for_edit(
        self.key, 0, self.crud_batch_size,
        template_index=1,
        target_vbucket=target_vbs)
    doc_gen["remove"] = sub_doc_generator(self.key, 0,
                                          self.crud_batch_size,
                                          target_vbucket=target_vbs)

    # Perform specified action
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)
    self.sleep(5, "Wait for error_simulation to take effect")

    ops_to_perform = [
        Bucket_Op.DocOps.CREATE,
        Bucket_Op.DocOps.UPDATE,
        Bucket_Op.DocOps.READ,
        Bucket_Op.DocOps.DELETE
    ]
    if self.subdoc_test:
        ops_to_perform = [
            Bucket_Op.SubDocOps.INSERT,
            Bucket_Op.SubDocOps.UPSERT,
            Bucket_Op.SubDocOps.REMOVE
        ]

    # Run the operations one after another while the error is active
    for op_type in ops_to_perform:
        self.log.info("Starting doc op %s" % op_type)
        if op_type in Bucket_Op.DOC_OPS:
            tasks[op_type] = self.task.async_load_gen_docs(
                self.cluster, self.bucket, doc_gen[op_type], op_type, 0,
                scope=self.scope_name,
                collection=self.collection_name,
                sdk_client_pool=self.sdk_client_pool,
                batch_size=1, process_concurrency=8,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                suppress_error_table=True,
                print_ops_rate=False,
                skip_read_on_error=True)
        else:
            tasks[op_type] = self.task.async_load_gen_sub_docs(
                self.cluster, self.bucket, doc_gen[op_type], op_type, 0,
                scope=self.scope_name,
                collection=self.collection_name,
                sdk_client_pool=self.sdk_client_pool,
                path_create=True,
                batch_size=1, process_concurrency=8,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                print_ops_rate=False)

        self.task.jython_task_manager.get_task_result(tasks[op_type])

        # Validate task failures
        if op_type == Bucket_Op.DocOps.READ:
            # Validation for read task
            if len(tasks[op_type].fail.keys()) != 0:
                self.log_failure("Read failed for few docs: %s"
                                 % tasks[op_type].fail.keys())
        else:
            # Validation of CRUDs - Update / Create / Delete
            # Every failure is expected to be a durability-ambiguous one
            for doc_id, crud_result in tasks[op_type].fail.items():
                vb_num = self.bucket_util.get_vbucket_num_for_key(
                    doc_id, self.cluster_util.vbuckets)
                if SDKException.DurabilityAmbiguousException \
                        not in str(crud_result["error"]):
                    self.log_failure(
                        "Invalid exception for doc %s, vb %s: %s"
                        % (doc_id, vb_num, crud_result))

    # Revert the specified error scenario
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)

    # Check whether the timeout triggered properly
    if int(time.time()) < expected_timeout:
        self.log_failure("Timed-out before expected time")

    # Drain the generators to collect the vbuckets of all mutated keys
    for op_type in ops_to_perform:
        if op_type == Bucket_Op.DocOps.READ:
            continue
        while doc_gen[op_type].has_next():
            doc_id, _ = doc_gen[op_type].next()
            affected_vbs.append(
                str(
                    self.bucket_util.get_vbucket_num_for_key(
                        doc_id,
                        self.cluster_util.vbuckets)))

    # Drop duplicate vb numbers
    affected_vbs = list(set(affected_vbs))

    # Fetch latest stats and validate the seq_nos are not updated
    for node in target_nodes:
        retry_count = 0
        max_retry = 3
        while retry_count < max_retry:
            self.log.info("Trying to validate vbseq_no stats: %d"
                          % (retry_count + 1))
            retry_count += 1
            retry_required = validate_vb_seqno_stats()
            if not retry_required:
                break
            self.sleep(5, "Sleep for vbseq_no stats to update")
        else:
            # This will be exited only if `break` condition is not met
            self.log_failure("validate_vb_seqno_stats verification failed")

    self.validate_test_failure()

    # Get SDK Client from client_pool
    sdk_client = self.sdk_client_pool.get_client_for_bucket(
        self.bucket, self.scope_name, self.collection_name)

    # Doc error validation
    for op_type in ops_to_perform:
        task = tasks[op_type]

        # On a single node cluster, the number of failed keys is
        # expected to match the generator's (end - start) range
        if self.nodes_init == 1 \
                and op_type != Bucket_Op.DocOps.READ \
                and len(task.fail.keys()) != (doc_gen[op_type].end
                                              - doc_gen[op_type].start):
            self.log_failure(
                "Failed keys %d are less than expected %d"
                % (len(task.fail.keys()),
                   (doc_gen[op_type].end - doc_gen[op_type].start)))

        # Create table objects for display
        table_view = TableView(self.log.error)
        ambiguous_table_view = TableView(self.log.info)
        table_view.set_headers(["Key", "vBucket", "Exception"])
        ambiguous_table_view.set_headers(["Key", "vBucket"])

        # Iterate failed keys for validation
        for doc_key, doc_info in task.fail.items():
            # NOTE(review): the other get_vbucket_num_for_key calls in
            # this test also pass self.cluster_util.vbuckets - confirm
            # the helper's default is the intended vbucket count.
            vb_for_key = self.bucket_util.get_vbucket_num_for_key(doc_key)

            if SDKException.DurabilityAmbiguousException \
                    not in str(doc_info["error"]):
                table_view.add_row(
                    [doc_key, vb_for_key, doc_info["error"]])

            # Every failed key is listed here, whatever its exception
            ambiguous_table_view.add_row([doc_key, str(vb_for_key)])
            if op_type not in Bucket_Op.SUB_DOC_OPS:
                # Error is reverted by now, so retry the doc operation
                retry_success = \
                    self.durability_helper.retry_for_ambiguous_exception(
                        sdk_client, op_type, doc_key, doc_info)
                if not retry_success:
                    self.log_failure("%s failed in retry for %s"
                                     % (op_type, doc_key))

        # Display the tables (if any errors)
        table_view.display("Unexpected exception during %s" % op_type)
        ambiguous_table_view.display("D_Ambiguous exception during %s"
                                     % op_type)

    # Release the acquired client
    self.sdk_client_pool.release_client(sdk_client)

    # Verify doc count after expected CRUD failure
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()

    # Fetch latest stats and validate the values are updated
    for node in target_nodes:
        vb_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
            self.log_failure("vBucket seq_no stats not updated")

    # Disconnect the shell connection
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    self.validate_test_failure()
class ConcurrentFailoverTests(AutoFailoverBaseTest): def setUp(self): super(ConcurrentFailoverTests, self).setUp() self.log_setup_status(self.__class__.__name__, "started", self.setUp.__name__) ####################################################################### # List of params to be used for failover # self.timeout from AutoFailoverBaseTest # self.max_count from AutoFailoverBaseTest # To track the triggered failover events self.fo_events = 0 # failover_order to be used for failover_order_tests # Format: # kv:kv-kv:index_query # * Iteration marked by '-' # * Nodes marked by ':' # * Service within a node denoted by '_' (underscore) # In the above case, # - Loop #0 :: 2 KV nodes will be failed # - Loop #1 :: 1 KV + 1 node running n1ql+index will be failed self.failover_order = \ self.input.param("failover_order", "kv").split("-") self.failover_method = \ self.input.param("failover_method", CouchbaseError.STOP_MEMCACHED) # Failover type determines the order of FO (Auto/Graceful/Hard). # Length of this should match the len(self.failover_order) # Example -> auto-graceful-auto # This expects first set of nodes from failover_order to undergo # AUTO FO followed by GRACEFUL FO of nodes through API and then # followed by AUTO FO of 3rd set of nodes as defined by failover_order self.failover_type = \ self.input.param("failover_type", CbServer.Failover.Type.AUTO).split("-") # End of params to be used for failover ####################################################################### self.load_during_fo = self.input.param("load_during_fo", False) self.log.info("Updating Auto-failover settings") self.rest.update_autofailover_settings(enabled=True, timeout=self.timeout, maxCount=self.max_count) # Find the bucket with least replica to check the Auto-FO possibility self.min_bucket_replica = Bucket.ReplicaNum.THREE for bucket in self.cluster.buckets: if bucket.replicaNumber < self.min_bucket_replica: self.min_bucket_replica = bucket.replicaNumber # Hold the dict of 
{node_obj_to_fail: failover_type, ...} self.nodes_to_fail = None # To display test execution status self.test_status_tbl = TableView(self.log.critical) self.auto_fo_settings_tbl = TableView(self.log.critical) self.test_status_tbl.set_headers( ["Node", "Services", "Node status", "Failover type"]) self.auto_fo_settings_tbl.set_headers([ "Enabled", "Auto FO count", "Max Events configured", "Auto FO timeout", "Disk Auto FO", "Disk Auto FO timeout" ]) self.validate_failover_settings(True, self.timeout, 0, self.max_count) # Init sdk_client_pool if not initialized before if self.sdk_client_pool is None: self.init_sdk_pool_object() CollectionBase.create_sdk_clients( self.task_manager.number_of_threads, self.cluster.master, self.cluster.buckets, self.sdk_client_pool, self.sdk_compression) # Perform initial collection load self.__load_initial_collection_data() self.log_setup_status(self.__class__.__name__, "complete", self.setUp.__name__) def tearDown(self): self.log_setup_status(self.__class__.__name__, "started", self.tearDown.__name__) # Select KV node as a cluster master to perform tearDown rebalance out self.cluster_util.update_cluster_nodes_service_list(self.cluster) self.cluster.master = self.cluster.kv_nodes[0] self.log.info("Resetting auto-failover settings to default") self.rest.update_autofailover_settings(enabled=True, timeout=120, maxCount=1) self.log_setup_status(self.__class__.__name__, "complete", self.tearDown.__name__) super(ConcurrentFailoverTests, self).tearDown() def __get_collection_load_spec(self, doc_ttl=0): """ Set doc_ttl for loading doc during failover operations """ d_level = Bucket.DurabilityLevel.NONE if self.num_replicas != Bucket.ReplicaNum.THREE: random.seed(round(time() * 1000)) # Since durability is not supported with replicas=3 d_level = choice([ Bucket.DurabilityLevel.NONE, Bucket.DurabilityLevel.MAJORITY, Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE, Bucket.DurabilityLevel.PERSIST_TO_MAJORITY ]) return { # Scope/Collection ops 
params MetaCrudParams.COLLECTIONS_TO_DROP: 3, MetaCrudParams.SCOPES_TO_DROP: 1, MetaCrudParams.SCOPES_TO_ADD_PER_BUCKET: 3, MetaCrudParams.COLLECTIONS_TO_ADD_FOR_NEW_SCOPES: 5, MetaCrudParams.COLLECTIONS_TO_ADD_PER_BUCKET: 10, MetaCrudParams.BUCKET_CONSIDERED_FOR_OPS: "all", MetaCrudParams.SCOPES_CONSIDERED_FOR_OPS: "all", MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_OPS: "all", # Doc loading params "doc_crud": { MetaCrudParams.DocCrud.COMMON_DOC_KEY: "test_collections", MetaCrudParams.DocCrud.NUM_ITEMS_FOR_NEW_COLLECTIONS: 5000, MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION: 20, MetaCrudParams.DocCrud.READ_PERCENTAGE_PER_COLLECTION: 10, MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION: 10, MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION: 10, }, # Doc_loading task options MetaCrudParams.DOC_TTL: doc_ttl, MetaCrudParams.DURABILITY_LEVEL: d_level, MetaCrudParams.SKIP_READ_ON_ERROR: True, MetaCrudParams.SUPPRESS_ERROR_TABLE: False, # The below is to skip populating success dictionary for reads MetaCrudParams.SKIP_READ_SUCCESS_RESULTS: True, MetaCrudParams.RETRY_EXCEPTIONS: [], MetaCrudParams.IGNORE_EXCEPTIONS: [], MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD: "all", MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD: "all", MetaCrudParams.BUCKETS_CONSIDERED_FOR_CRUD: "all" } @property def num_nodes_to_be_failover(self): def is_safe_to_fo(service): is_safe = False # Reference doc: # https://docs.couchbase.com/server/7.0/learn/clusters-and-availability/automatic-failover.html#failover-policy # Service / Data loss check if service == CbServer.Services.KV: if self.min_bucket_replica > 0 \ and node_count[CbServer.Services.KV] > 2: is_safe = True # elif service == CbServer.Services.INDEX: # if node_count[CbServer.Services.INDEX] > 1: # is_safe = True else: # All other services require at least 2 nodes to FO if node_count[service] > 1: is_safe = True return is_safe def decr_node_count(service): node_count[service] -= 1 if service == CbServer.Services.KV: 
self.min_bucket_replica -= 1 fo_nodes = set() num_unreachable_nodes = 0 active_cluster_nodes = len(self.rest.get_nodes(inactive=False)) total_nodes = active_cluster_nodes + self.fo_events + self.nodes_in min_nodes_for_quorum = int(total_nodes / 2) + 1 max_allowed_unreachable_nodes = total_nodes - min_nodes_for_quorum # Quorum check before checking individual services for _, failure_type in self.nodes_to_fail.items(): if failure_type in ["stop_couchbase", "network_split"]: num_unreachable_nodes += 1 if num_unreachable_nodes > max_allowed_unreachable_nodes: return 0 # End of quorum check node_count = dict() node_count[CbServer.Services.KV] = len(self.cluster.kv_nodes) node_count[CbServer.Services.INDEX] = len(self.cluster.index_nodes) node_count[CbServer.Services.N1QL] = len(self.cluster.query_nodes) node_count[CbServer.Services.EVENTING] = len( self.cluster.eventing_nodes) node_count[CbServer.Services.BACKUP] = len(self.cluster.backup_nodes) kv_nodes = dict() non_kv_nodes = dict() for node, failure_type in self.nodes_to_fail.items(): if CbServer.Services.KV in node.services: kv_nodes[node] = failure_type else: non_kv_nodes[node] = failure_type kv_service = CbServer.Services.KV for node, failure_type in kv_nodes.items(): if kv_service in node.services: # KV takes priority over other nodes in deciding the Auto-FO if self.max_count > (len(fo_nodes) + self.fo_events) \ and is_safe_to_fo(kv_service): fo_nodes.add(node) for service_type in node.services: # Decrement the node count for the service decr_node_count(service_type) else: self.log.warning("KV failover not possible") # No nodes should be FO'ed if KV FO is not possible fo_nodes = set() # Break to make sure no other service failover # will be expected break else: nodes_not_failed = set() for node, failure_type in non_kv_nodes.items(): # For other nodes, we need to check if the node running # other services are also safe to failover for service_type in node.services: if self.max_count == (len(fo_nodes) + 
self.fo_events): # Check to see whether the max_fo count is reached self.log.info("Max auto-fo count already reached") break if not is_safe_to_fo(service_type): self.log.warning("Service '%s' not safe to failover" % service_type) for t_node in fo_nodes: if service_type in t_node.services \ and kv_service not in t_node.services: nodes_not_failed.add(t_node) break else: fo_nodes.add(node) for service_type in node.services: # Decrement the node count for the service decr_node_count(service_type) fo_nodes = fo_nodes.difference(nodes_not_failed) expected_num_nodes = len(fo_nodes) self.log.info("Expected nodes to be failed over: %d" % expected_num_nodes) return expected_num_nodes def __get_server_obj(self, node): for server in self.cluster.servers: if server.ip == node.ip: return server def __update_server_obj(self): temp_data = self.nodes_to_fail self.nodes_to_fail = dict() for node_obj, fo_type in temp_data.items(): self.nodes_to_fail[self.__get_server_obj(node_obj)] = fo_type def __load_initial_collection_data(self): load_spec = self.__get_collection_load_spec() load_spec[MetaCrudParams.SCOPES_TO_DROP] = 0 load_spec[MetaCrudParams.COLLECTIONS_TO_DROP] = 0 load_spec[MetaCrudParams.SCOPES_TO_ADD_PER_BUCKET] = 2 load_spec[MetaCrudParams.COLLECTIONS_TO_ADD_FOR_NEW_SCOPES] = 5 load_spec["doc_crud"][ MetaCrudParams.DocCrud.NUM_ITEMS_FOR_NEW_COLLECTIONS] = 10000 def __perform_doc_ops(self, durability=None, validate_num_items=True): load_spec = self.__get_collection_load_spec() if durability and self.num_replicas != Bucket.ReplicaNum.THREE: load_spec[MetaCrudParams.DURABILITY_LEVEL] = durability self.log.info("Performing doc_ops with durability level=%s" % load_spec[MetaCrudParams.DURABILITY_LEVEL]) doc_loading_task = \ self.bucket_util.run_scenario_from_spec( self.task, self.cluster, self.cluster.buckets, load_spec, mutation_num=0, batch_size=self.batch_size, process_concurrency=self.process_concurrency) if doc_loading_task.result is False: self.fail("Collection CRUDs 
failure") if validate_num_items: # Verify initial doc load count self.bucket_util._wait_for_stats_all_buckets(self.cluster, self.cluster.buckets, timeout=1200) self.bucket_util.validate_docs_per_collections_all_buckets( self.cluster) def get_nodes_to_fail(self, services_to_fail, dynamic_fo_method=False): nodes = dict() # Update the list of service-nodes mapping in the cluster object self.cluster_util.update_cluster_nodes_service_list(self.cluster) nodes_in_cluster = self.rest.get_nodes() for services in services_to_fail: node_services = set(services.split("_")) for index, node in enumerate(nodes_in_cluster): if node_services == set(node.services): fo_type = self.failover_method if dynamic_fo_method: fo_type = "stop_couchbase" if CbServer.Services.KV in node_services: fo_type = CouchbaseError.STOP_MEMCACHED nodes[node] = fo_type # Remove the node to be failed to avoid double insertion nodes_in_cluster.pop(index) break return nodes def validate_failover_settings(self, enabled, timeout, count, max_count): settings = self.rest.get_autofailover_settings() self.auto_fo_settings_tbl.rows = list() self.auto_fo_settings_tbl.rows.append([ str(settings.enabled), str(settings.count), str(settings.maxCount), str(settings.timeout), str(settings.failoverOnDataDiskIssuesEnabled), str(settings.failoverOnDataDiskIssuesTimeout) ]) self.auto_fo_settings_tbl.display("Auto failover status:") err_msg = "Mismatch in '%s' field. 
" \ "Cluster FO data: " + str(settings.__dict__) self.assertEqual(settings.enabled, enabled, err_msg % "enabled") self.assertEqual(settings.timeout, timeout, err_msg % "timeout") self.assertEqual(settings.count, count, err_msg % "count") self.assertEqual(settings.maxCount, max_count, err_msg % "maxCount") def __display_failure_node_status(self, message): self.test_status_tbl.rows = list() cluster_nodes = self.rest.get_nodes(inactive=True) for node, fo_type in self.nodes_to_fail.items(): node = [ t_node for t_node in cluster_nodes if t_node.ip == node.ip ][0] self.test_status_tbl.add_row([ node.ip, ",".join(node.services), node.clusterMembership, fo_type ]) self.test_status_tbl.display(message) def __update_unaffected_node(self): cluster_nodes = self.rest.get_nodes() for cluster_node in cluster_nodes: for failure_node in self.nodes_to_fail: if cluster_node.ip == failure_node.ip: break else: self.orchestrator = cluster_node self.rest = RestConnection(self.orchestrator) self.cluster.master = cluster_node self.log.info("Node for REST APIs: %s" % cluster_node.ip) break def test_max_events_range(self): """ - Try setting max_events 1 to 100 (Valid) - Try setting 0 > max_events > 100 (Invalid - negative) - Current timeout_range (5-120seconds) should work" """ self.log.info("Testing max_event counts") enable_failover = True timeout_val = 10 max_plus_1 = CbServer.Failover.MAX_EVENTS + 1 # Set max_events between (min, max) for num_events in range(CbServer.Failover.MIN_EVENTS, max_plus_1): status = self.rest.update_autofailover_settings( enable_failover, timeout_val, maxCount=num_events) self.assertTrue(status, "Failed to set max events=%s" % num_events) self.validate_failover_settings(enable_failover, timeout_val, 0, num_events) for num_events in [0, max_plus_1]: self.log.info("Testing max_event_count=%s" % num_events) status = self.rest.update_autofailover_settings( enable_failover, timeout_val, maxCount=max_plus_1) self.assertFalse(status, "Able to set max events=%s" % 
num_events) self.validate_failover_settings(enable_failover, timeout_val, 0, CbServer.Failover.MAX_EVENTS) def __run_test(self): # Validate count before the start of failover procedure self.validate_failover_settings(True, self.timeout, self.fo_events, self.max_count) # Before failure - nodes' information self.__display_failure_node_status("Nodes to be failed") try: rest_nodes = self.rest.get_nodes() if self.current_fo_strategy == CbServer.Failover.Type.AUTO: expected_fo_nodes = self.num_nodes_to_be_failover self.fo_events += expected_fo_nodes self.__update_server_obj() failover_task = ConcurrentFailoverTask( task_manager=self.task_manager, master=self.orchestrator, servers_to_fail=self.nodes_to_fail, expected_fo_nodes=self.fo_events, task_type="induce_failure") self.task_manager.add_new_task(failover_task) self.task_manager.get_task_result(failover_task) if failover_task.result is False: self.fail("Failure during concurrent failover procedure") elif self.current_fo_strategy == CbServer.Failover.Type.GRACEFUL: for node in self.nodes_to_fail: node = [ t_node for t_node in rest_nodes if t_node.ip == node.ip ][0] status = self.rest.fail_over(node.id, graceful=True) if status is False: self.fail("Graceful failover failed for %s" % node) self.sleep(5, "Wait for failover to start") reb_result = self.rest.monitorRebalance() self.assertTrue(reb_result, "Graceful failover failed") elif self.current_fo_strategy == CbServer.Failover.Type.FORCEFUL: for node in self.nodes_to_fail: node = [ t_node for t_node in rest_nodes if t_node.ip == node.ip ][0] status = self.rest.fail_over(node.id, graceful=False) if status is False: self.fail("Hard failover failed for %s" % node) self.sleep(5, "Wait for failover to start") reb_result = self.rest.monitorRebalance() self.assertTrue(reb_result, "Hard failover failed") except Exception as e: self.log.error("Exception occurred: %s" % str(e)) finally: # Disable auto-fo after the expected time limit 
self.rest.update_autofailover_settings(enabled=False, timeout=self.timeout, maxCount=self.max_count) if self.current_fo_strategy == CbServer.Failover.Type.AUTO: failover_task = ConcurrentFailoverTask( task_manager=self.task_manager, master=self.orchestrator, servers_to_fail=self.nodes_to_fail, task_type="revert_failure") self.task_manager.add_new_task(failover_task) self.task_manager.get_task_result(failover_task) if failover_task.result is False: self.fail("Failure during failover operation") # Enable back prev auto_fo settings self.sleep(15, "Wait before enabling back auto-fo") self.rest.update_autofailover_settings(enabled=True, timeout=self.timeout, maxCount=self.max_count) # After failure - failed nodes' information self.__display_failure_node_status("Nodes status failure") self.bucket_util.print_bucket_stats(self.cluster) # Validate count at the end of failover procedure self.validate_failover_settings(True, self.timeout, self.fo_events, self.max_count) def test_concurrent_failover(self): """ Common code to run failover tests """ self.current_fo_strategy = None load_data_after_fo = self.input.param("post_failover_data_load", True) exception = None for index, services_to_fo in enumerate(self.failover_order): self.current_fo_strategy = self.failover_type[index] # servers_to_fail -> kv:index / kv:index_kv / index:n1ql services_to_fo = services_to_fo.split(":") # servers_to_fail -> [kv, index] / [kv, index_kv] self.nodes_to_fail = self.get_nodes_to_fail(services_to_fo) self.__update_unaffected_node() try: self.__run_test() except Exception as e: # Making sure to remove failed nodes before failing the test self.cluster_util.rebalance(self.cluster) self.fail("Exception occurred: %s" % str(e)) # Perform collection crud + doc_ops before rebalance operation if load_data_after_fo: try: self.__perform_doc_ops(durability="NONE", validate_num_items=False) except Exception as e: exception = e break self.sleep(20, "Wait for failed nodes to recover completely") if 
choice([True, False]): # Add back all nodes and rebalance self.log.info("Performing node add back operation") rest_nodes = self.rest.get_nodes(inactive=True) for node in rest_nodes: if node.clusterMembership == "inactiveFailed": self.rest.add_back_node(node.id) if CbServer.Services.KV in node.services: self.rest.set_recovery_type(node.id, "delta") result = self.cluster_util.rebalance(self.cluster) else: # Eject nodes and rebalance self.log.info("Ejecting all failed nodes from the cluster") result = self.cluster_util.rebalance(self.cluster) if exception: self.fail(exception) self.assertTrue(result, "Final rebalance failed") # Validate count is reset back to 0 after rebalance operation self.validate_failover_settings(True, self.timeout, 0, self.max_count) # Perform collection crud + doc_ops if load_data_after_fo: durability_val = None for bucket in self.cluster.buckets: # If we have bucket_replica=3, force use level=NONE if bucket.replicaNumber == Bucket.ReplicaNum.THREE: durability_val = Bucket.DurabilityLevel.NONE break # If we have ephemeral bucket, force use level=MAJORITY if bucket.bucketType == Bucket.Type.EPHEMERAL: durability_val = Bucket.DurabilityLevel.MAJORITY self.__perform_doc_ops(durability=durability_val) def test_split_brain(self): """ Test params: split_nodes - Accepts string of pattern 'a_b:c-b_a:d' This creates a barriers like, Node running services a_b & c to ignore anything from nodes running services b_a & d and vice versa """ def get_nodes_based_on_services(services): nodes = list() services = services.split(":") for t_service in services: t_service = t_service.split("_") t_service.sort() for c_node in cluster_nodes: if c_node.services == t_service: nodes.append(self.__get_server_obj(c_node)) # Remove nodes from cluster_nodes once picked # to avoid picking same node again cluster_nodes.remove(c_node) break return nodes def create_split_between_nodes(dest_nodes, src_nodes): for ssh_node in dest_nodes: shell_conn = 
RemoteMachineShellConnection(ssh_node) for src_node in src_nodes: shell_conn.execute_command( "iptables -A INPUT -s %s -j DROP" % src_node.ip) shell_conn.disconnect() def get_num_nodes_to_fo(num_nodes_affected, service_count_affected_nodes, service_count_unaffected_nodes): nodes_to_fo = num_nodes_affected for t_server, count in service_count_affected_nodes.items(): if t_server not in service_count_unaffected_nodes \ or service_count_unaffected_nodes[t_server] < 1: nodes_to_fo -= service_count_affected_nodes[t_server] return nodes_to_fo def recover_from_split(node_list): self.log.info("Flushing iptables rules from all nodes") for ssh_node in node_list: ssh_shell = RemoteMachineShellConnection(ssh_node) ssh_shell.execute_command("iptables -F") ssh_shell.disconnect() self.sleep(5, "Wait for nodes to be reachable") def post_failover_procedure(): self.rest.monitorRebalance() self.validate_failover_settings(True, self.timeout, num_nodes_to_fo, self.max_count) recover_from_split(node_split_1 + node_split_2) self.log.info("Rebalance out failed nodes") rebalance_res = self.cluster_util.rebalance(self.cluster) self.assertTrue(rebalance_res, "Post failover rebalance failed") # Validate failover count reset post rebalance self.validate_failover_settings(True, self.timeout, 0, self.max_count) fo_happens = self.input.param("fo_happens", True) nodes_to_split = self.input.param("split_nodes", None).split('-') if nodes_to_split is None: self.fail("Nothing to test. 
split_nodes is None") # Validate count before the start of failover procedure self.validate_failover_settings(True, self.timeout, self.fo_events, self.max_count) self.log.info("Fetching current cluster_nodes") self.cluster_util.find_orchestrator(self.cluster) # cluster_nodes holds servers which are not yet selected for nw split cluster_nodes = self.rest.get_nodes() for node in cluster_nodes: node.services.sort() # Fetch actual nodes from given service list to create a split node_split_1 = get_nodes_based_on_services(nodes_to_split[0]) node_split_2 = get_nodes_based_on_services(nodes_to_split[1]) service_count = [dict(), dict()] for index, split_services in enumerate(nodes_to_split): for node_services in nodes_to_split[index].split(':'): for service in node_services.split("_"): if service not in service_count[index]: service_count[index][service] = 0 service_count[index][service] += 1 if len(node_split_1) > len(node_split_2): num_nodes_to_fo = get_num_nodes_to_fo(len(node_split_2), service_count[1], service_count[0]) else: num_nodes_to_fo = get_num_nodes_to_fo(len(node_split_1), service_count[0], service_count[1]) self.log.info( "N/w split between -> [%s] || [%s]. 
Expect %s fo_events" % ([n.ip for n in node_split_1], [n.ip for n in node_split_2 ], num_nodes_to_fo)) try: create_split_between_nodes(node_split_1, node_split_2) create_split_between_nodes(node_split_2, node_split_1) self.sleep(self.timeout, "Wait for configured fo_timeout") self.sleep(15, "Extra sleep to avoid fail results") if fo_happens: self.log.info("Expecting failover to be triggered") post_failover_procedure() elif len([ t_serv for t_serv in self.services_init.split("-") if CbServer.Services.KV in t_serv ]) > 2: self.log.info("Expecting no failover will be triggered") self.validate_failover_settings(True, self.timeout, 0, self.max_count) if (self.nodes_init % 2) == 1: # Pick new master based on split network new_master = node_split_1[0] if len(node_split_2) > len(node_split_1): new_master = node_split_2[0] self.sleep(10, "FO expected wrt node %s" % new_master.ip) self.rest = RestConnection(new_master) self.cluster.master = new_master post_failover_procedure() finally: recover_from_split(node_split_1 + node_split_2) self.sleep(5, "Wait for n/w split to heal") rest_nodes = self.rest.get_nodes(inactive=True) for t_node in rest_nodes: if t_node.clusterMembership == "active": for node in self.cluster.servers: if node.ip == t_node.ip: self.cluster.master = node break break reb_result = self.cluster_util.rebalance(self.cluster) self.assertTrue(reb_result, "Final rebalance failed") def test_concurrent_failover_timer_reset(self): """ 1. Trigger failure on destined nodes 2. Wait for little less time than failover_timeout 3. Bring back few nodes back online for few seconds 4. Make sure no auto failover triggered till next failover timeout 5. 
Validate auto failovers after new timeout """ services_to_fo = self.failover_order[0].split(":") self.nodes_to_fail = self.get_nodes_to_fail(services_to_fo, dynamic_fo_method=True) expected_fo_nodes = self.num_nodes_to_be_failover self.__update_server_obj() rand_node = choice(self.nodes_to_fail.keys()) self.__update_unaffected_node() self.__display_failure_node_status("Nodes to be failed") try: self.log.info("Starting auto-failover procedure") failover_task = ConcurrentFailoverTask( task_manager=self.task_manager, master=self.orchestrator, servers_to_fail=self.nodes_to_fail, expected_fo_nodes=expected_fo_nodes, task_type="induce_failure") self.task_manager.add_new_task(failover_task) self.sleep(int(self.timeout * 0.7), "Wait before bringing back the failed nodes") self.log.info("Bringing back '%s' for some time" % rand_node.ip) new_timer = None shell = RemoteMachineShellConnection(rand_node) cb_err = CouchbaseError(self.log, shell) if self.nodes_to_fail[rand_node] == CouchbaseError.STOP_MEMCACHED: cb_err.revert(CouchbaseError.STOP_MEMCACHED) self.sleep(10, "Wait before creating failure again") cb_err.create(CouchbaseError.STOP_MEMCACHED) new_timer = time() elif self.nodes_to_fail[rand_node] == "stop_couchbase": cb_err.revert(CouchbaseError.STOP_SERVER) self.sleep(10, "Wait before creating failure again") cb_err.create(CouchbaseError.STOP_SERVER) new_timer = time() shell.disconnect() # Validate the previous auto-failover task failed # due to the random_node coming back online self.task_manager.get_task_result(failover_task) self.assertFalse(failover_task.result, "Nodes failed over though nodes became active") # Validate auto_failover_settings self.validate_failover_settings(True, self.timeout, 0, self.max_count) # Make sure the new auto-failover timing is honoured new_timer = new_timer + self.timeout while int(time()) < new_timer: settings = self.rest.get_autofailover_settings() if settings.count != 0: self.fail("Nodes failed over before new failover time") 
self.sleep(10, "Wait for failover rebalance to trigger") self.rest.monitorRebalance() # Validate auto_failover_settings after actual auto failover self.validate_failover_settings(True, self.timeout, expected_fo_nodes, self.max_count) finally: # Recover all nodes from induced failures failover_task = ConcurrentFailoverTask( task_manager=self.task_manager, master=self.orchestrator, servers_to_fail=self.nodes_to_fail, expected_fo_nodes=expected_fo_nodes, task_type="revert_failure") self.task_manager.add_new_task(failover_task) self.task_manager.get_task_result(failover_task) self.log.info("Rebalance out the failed nodes") result = self.cluster_util.rebalance(self.cluster) self.assertTrue(result, "Final rebalance failed") # Perform collection crud + doc_ops after rebalance operation self.__perform_doc_ops() def test_failover_during_rebalance(self): """ 1. Start rebalance operation on the active cluster 2. Introduce failures on target nodes to trigger auto-failover 3. Validate rebalance succeeds after auto-fo trigger """ def get_reb_out_nodes(): nodes = list() nodes_with_services = dict() cluster_nodes = self.rest.get_nodes() for node in cluster_nodes: node.services.sort() d_key = '_'.join(node.services) if d_key not in nodes_with_services: nodes_with_services[d_key] = list() nodes_with_services[d_key].append(node) for services in out_nodes: services = services.split("_") services.sort() services = "_".join(services) rand_node = choice(nodes_with_services[services]) nodes_with_services[services].remove(rand_node) nodes.append(rand_node) return nodes self.nodes_in = self.input.param("nodes_in", 0) add_nodes = list() remove_nodes = list() # Format - kv:kv_index -> 2 nodes with services [kv, kv:index] out_nodes = self.input.param("out_nodes", "kv").split(":") # Can take any of (in/out/swap) rebalance_type = self.input.param("rebalance_type", "in") services_to_fo = self.failover_order[0].split(":") self.nodes_to_fail = self.get_nodes_to_fail(services_to_fo, 
dynamic_fo_method=True) loader_task = None reader_task = None if rebalance_type == "in": add_nodes = self.cluster.servers[self.nodes_init:self.nodes_init + self.nodes_in] self.cluster.kv_nodes.extend(add_nodes) elif rebalance_type == "out": remove_nodes = get_reb_out_nodes() elif rebalance_type == "swap": remove_nodes = get_reb_out_nodes() add_nodes = self.cluster.servers[self.nodes_init:self.nodes_init + self.nodes_in] self.cluster.kv_nodes.extend(add_nodes) expected_fo_nodes = self.num_nodes_to_be_failover self.__update_server_obj() # Start doc_ops in background if self.load_during_fo: doc_gen = doc_generator("fo_docs", 0, 200000) loader_task = self.task.async_continuous_doc_ops( self.cluster, self.cluster.buckets[0], doc_gen, DocLoading.Bucket.DocOps.UPDATE, exp=5, process_concurrency=1) reader_task = self.task.async_continuous_doc_ops( self.cluster, self.cluster.buckets[0], doc_gen, DocLoading.Bucket.DocOps.READ, process_concurrency=1) self.__update_unaffected_node() self.__display_failure_node_status("Nodes to be failed") # Create Auto-failover task but won't start it failover_task = ConcurrentFailoverTask( task_manager=self.task_manager, master=self.orchestrator, servers_to_fail=self.nodes_to_fail, expected_fo_nodes=expected_fo_nodes, task_type="induce_failure") # Start rebalance operation self.log.info("Starting rebalance operation") rebalance_task = self.task.async_rebalance(self.cluster, to_add=add_nodes, to_remove=remove_nodes) self.sleep(max(10, 4 * self.nodes_in), "Wait for rebalance to start before failover") self.task_manager.add_new_task(failover_task) try: self.log.info("Wait for failover task to complete") self.task_manager.get_task_result(failover_task) failure_msg = "Auto-failover task failed" if expected_fo_nodes == 0: # Task is expected to fail since no failover is triggered self.assertFalse(failover_task.result, failure_msg) else: self.assertTrue(failover_task.result, failure_msg) finally: # Disable auto-fo after the expected time limit 
self.rest.update_autofailover_settings(enabled=False, timeout=self.timeout, maxCount=self.max_count) # Recover all nodes from induced failures recovery_task = ConcurrentFailoverTask( task_manager=self.task_manager, master=self.orchestrator, servers_to_fail=self.nodes_to_fail, expected_fo_nodes=expected_fo_nodes, task_type="revert_failure") self.task_manager.add_new_task(recovery_task) self.task_manager.get_task_result(recovery_task) self.task_manager.stop_task(rebalance_task) # Enable back prev auto_fo settings self.sleep(5, "Wait before enabling back auto-fo") self.rest.update_autofailover_settings(enabled=True, timeout=self.timeout, maxCount=self.max_count) # Validate auto_failover_settings after failover self.validate_failover_settings(True, self.timeout, expected_fo_nodes, self.max_count) # Stop background doc_ops if self.load_during_fo: for task in [loader_task, reader_task]: task.end_task() self.task_manager.get_task_result(task) # Perform collection crud + doc_ops before rebalance operation self.__perform_doc_ops(durability="NONE", validate_num_items=False) # Rebalance the cluster to remove failed nodes result = self.cluster_util.rebalance(self.cluster) self.assertTrue(result, "Rebalance failed") # Validate auto_failover_settings after rebalance operation self.validate_failover_settings(True, self.timeout, 0, self.max_count) # Perform collection crud + doc_ops after rebalance operation self.__perform_doc_ops()
class OPD:
    """
    Helper routines for bucket/collection setup, error induction,
    rebalance and document loading/validation.

    NOTE(review): the methods read many attributes that __init__ does not
    set (self.input, self.cluster, self.log, self.bucket_util, ...);
    presumably they are provided by the test class this is combined with.
    """

    def __init__(self):
        pass

    def threads_calculation(self):
        """Read the 'pc' test param and create the doc-loading TaskManager."""
        self.process_concurrency = self.input.param("pc",
                                                    self.process_concurrency)
        self.doc_loading_tm = TaskManager(self.process_concurrency)

    def get_memory_footprint(self):
        """
        Log and return the RSS column reported by 'ps v' for the current
        process, divided by 1024.

        :return mem: float
        """
        out = subprocess.Popen(
            ['ps', 'v', '-p', str(os.getpid())],
            stdout=subprocess.PIPE).communicate()[0].split(b'\n')
        # First line is the header, second line holds this process' values
        rss_index = out[0].split().index(b'RSS')
        mem = float(out[1].split()[rss_index]) / 1024
        self.PrintStep("RAM FootPrint: %s" % str(mem))
        return mem

    def create_required_buckets(self, cluster):
        """
        Create self.num_buckets buckets named 'GleamBookUsers<N>' on the
        given cluster and rebalance afterwards. No-op for cloud clusters.

        :param cluster: Cluster object on which the buckets are created
        """
        # NOTE(review): reads self.cluster here but uses the 'cluster'
        # argument below - confirm both always refer to the same cluster
        if self.cluster.cloud_cluster:
            return
        self.log.info("Get the available memory quota")
        rest = RestConnection(cluster.master)
        self.info = rest.get_nodes_self()

        # threshold_memory_vagrant = 100
        kv_memory = self.info.memoryQuota - 100

        # Creating buckets for data loading purpose
        self.log.info("Create CB buckets")
        self.bucket_expiry = self.input.param("bucket_expiry", 0)
        ramQuota = self.input.param("ramQuota", kv_memory)
        # Repeating the lists guarantees an entry for every bucket index
        bucket_type = self.bucket_type.split(';') * self.num_buckets
        compression_mode = \
            self.compression_mode.split(';') * self.num_buckets
        for i in range(self.num_buckets):
            bucket = Bucket({
                Bucket.name: "GleamBookUsers" + str(i),
                Bucket.ramQuotaMB: ramQuota / self.num_buckets,
                Bucket.maxTTL: self.bucket_expiry,
                Bucket.replicaNumber: self.num_replicas,
                Bucket.storageBackend: self.bucket_storage,
                Bucket.evictionPolicy: self.bucket_eviction_policy,
                Bucket.bucketType: bucket_type[i],
                Bucket.flushEnabled: Bucket.FlushBucket.ENABLED,
                Bucket.compressionMode: compression_mode[i],
                Bucket.fragmentationPercentage: self.fragmentation})
            self.bucket_util.create_bucket(cluster, bucket)

        # rebalance the new buckets across all nodes.
        self.log.info("Rebalance Starts")
        self.nodes = rest.node_statuses()
        rest.rebalance(otpNodes=[node.id for node in self.nodes],
                       ejectedNodes=[])
        rest.monitorRebalance()

    def create_required_collections(self, cluster, num_scopes,
                                    num_collections):
        """
        Create the requested scopes/collections in all buckets of the
        cluster and store the collection names of the first bucket's
        'scope_name' scope in self.collections.

        :param cluster: Cluster object holding the target buckets
        :param num_scopes: Scopes are created only when this is > 1
        :param num_collections: Collections to create per scope
        """
        self.scope_name = self.input.param("scope_name", "_default")
        if self.scope_name != "_default":
            # NOTE(review): passes 'cluster' whereas the calls below pass
            # 'cluster.master' - confirm which one create_scope expects
            self.bucket_util.create_scope(cluster, self.bucket,
                                          {"name": self.scope_name})
        if num_scopes > 1:
            self.scope_prefix = self.input.param("scope_prefix",
                                                 "VolumeScope")
            for bucket in cluster.buckets:
                for i in range(num_scopes):
                    scope_name = self.scope_prefix + str(i)
                    self.log.info("Creating scope: %s" % (scope_name))
                    self.bucket_util.create_scope(cluster.master, bucket,
                                                  {"name": scope_name})
                    self.sleep(0.5)
            # NOTE(review): assumed to run once per call (not once per
            # created scope) - confirm against the original layout
            self.num_scopes += 1
        for bucket in cluster.buckets:
            for scope in bucket.scopes.keys():
                if num_collections > 0:
                    self.collection_prefix = self.input.param(
                        "collection_prefix", "VolumeCollection")
                    for i in range(num_collections):
                        collection_name = self.collection_prefix + str(i)
                        self.bucket_util.create_collection(
                            cluster.master, bucket, scope,
                            {"name": collection_name})
                        self.sleep(0.5)

        self.collections = cluster.buckets[0].scopes[
            self.scope_name].collections.keys()
        self.log.debug("Collections list == {}".format(self.collections))

    def stop_purger(self, tombstone_purge_age=60):
        """
        Set the tombstone purge age and disable the tombstone purger.

        :param tombstone_purge_age: Value passed to
                                    update_tombstone_purge_age_for_removal
        """
        self.rest.update_tombstone_purge_age_for_removal(tombstone_purge_age)
        self.rest.disable_tombstone_purger()

    def get_bucket_dgm(self, bucket):
        """
        Log and return the latest 'vb_active_resident_items_ratio' sample
        of the given bucket.

        :param bucket: Bucket object to fetch the stat for
        """
        self.rest_client = BucketHelper(self.cluster.master)
        samples = self.rest_client.fetch_bucket_stats(
            bucket.name)["op"]["samples"]
        dgm = samples["vb_active_resident_items_ratio"][-1]
        self.log.info("Active Resident Threshold of {0} is {1}".format(
            bucket.name, dgm))
        return dgm

    def _induce_error(self, error_condition, nodes=None):
        """
        Induce the given error on every node in 'nodes'.

        :param error_condition: One of stop_server / enable_firewall /
                                kill_memcached / reboot_server / kill_erlang
        :param nodes: Target nodes. Defaults to [self.cluster.master]
        """
        nodes = nodes or [self.cluster.master]
        for node in nodes:
            if error_condition == "stop_server":
                self.cluster_util.stop_server(node)
            elif error_condition == "enable_firewall":
                self.cluster_util.start_firewall_on_node(node)
            elif error_condition in ("kill_memcached", "reboot_server",
                                     "kill_erlang"):
                shell = RemoteMachineShellConnection(node)
                try:
                    if error_condition == "kill_memcached":
                        shell.kill_memcached()
                    elif error_condition == "reboot_server":
                        shell.reboot_node()
                    else:
                        shell.kill_erlang()
                finally:
                    # Always release the connection (the reboot path
                    # used to leave it open)
                    shell.disconnect()
            else:
                self.fail("Invalid error induce option")

    def _recover_from_error(self, error_condition):
        """
        Revert the given error on all nodes of the cluster and wait for the
        KV nodes + master to come back up.

        :param error_condition: Error previously induced on the nodes
        """
        for node in self.cluster.nodes_in_cluster:
            if error_condition in ("stop_server", "kill_erlang"):
                self.cluster_util.start_server(node)
            elif error_condition == "enable_firewall":
                self.cluster_util.stop_firewall_on_node(node)

        for node in self.cluster.kv_nodes + [self.cluster.master]:
            self.check_warmup_complete(node)
            result = self.cluster_util.wait_for_ns_servers_or_assert(
                [node], wait_time=1200)
            self.assertTrue(result, "Server warmup failed")

    def rebalance(self, nodes_in=0, nodes_out=0, services=None,
                  retry_get_process_num=3000):
        """
        Start an async rebalance which adds 'nodes_in' and removes
        'nodes_out' randomly selected nodes per requested service.

        :param nodes_in: Nodes to add (per service in 'services')
        :param nodes_out: Nodes to remove (per service in 'services')
        :param services: Services to act on. Defaults to ["kv"]
        :param retry_get_process_num: Forwarded to async_rebalance
        :return rebalance_task: Task returned by self.task.async_rebalance
        """
        self.servs_in = list()
        self.nodes_cluster = self.cluster.nodes_in_cluster[:]
        self.nodes_cluster.remove(self.cluster.master)
        self.servs_out = list()
        services = services or ["kv"]

        for service in ["kv", "cbas", "index", "fts", "query", "eventing"]:
            print("%s nodes in cluster: %s" % (
                service.upper(),
                [server.ip for server in
                 getattr(self.cluster, "%s_nodes" % service)]))
        print("AVAILABLE nodes for cluster: %s" % [
            server.ip for server in self.available_servers])

        # Processing order is fixed so that the sequence of random picks
        # does not depend on the order of 'services'
        service_order = ["cbas", "index", "fts", "query", "eventing", "kv"]

        if nodes_out:
            for service in service_order:
                if service not in services:
                    continue
                service_nodes = getattr(self.cluster, "%s_nodes" % service)
                candidates = service_nodes
                if service == "kv":
                    # Never rebalance out the master node
                    candidates = [node for node in service_nodes
                                  if node.ip != self.cluster.master.ip]
                servers = random.sample(candidates, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    service_nodes.remove(server)

        if nodes_in:
            for service in service_order:
                if service not in services:
                    continue
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                getattr(self.cluster, "%s_nodes" % service).extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers]

        print("Servers coming in : %s with services: %s" % (
            [server.ip for server in self.servs_in], services))
        print("Servers going out : %s" % (
            [server.ip for server in self.servs_out]))
        self.available_servers.extend(self.servs_out)
        print("NEW AVAILABLE nodes for cluster: %s" % (
            [server.ip for server in self.available_servers]))
        if nodes_in == nodes_out:
            self.vbucket_check = False

        rebalance_task = self.task.async_rebalance(
            self.cluster, self.servs_in, self.servs_out,
            services=services,
            check_vbucket_shuffling=self.vbucket_check,
            retry_get_process_num=retry_get_process_num)

        return rebalance_task

    def generate_docs(self, doc_ops=None,
                      create_end=None, create_start=None,
                      update_end=None, update_start=None,
                      delete_end=None, delete_start=None,
                      expire_end=None, expire_start=None,
                      read_end=None, read_start=None):
        """
        Compute the start/end key indexes for every operation named in
        'doc_ops' and update the item counters (self.final_items,
        self.start, self.end, self.mutate) accordingly.

        All ranges are reset to 0 first. A range which is not given
        explicitly is derived from self.num_items * self.mutation_perc / 100.

        :param doc_ops: String checked with 'in' for read / update /
                        delete / expiry / create. Defaults to self.doc_ops
        :param *_start / *_end: Optional explicit range boundaries
        """
        self.get_memory_footprint()
        self.create_end = 0
        self.create_start = 0
        self.read_end = 0
        self.read_start = 0
        self.update_end = 0
        self.update_start = 0
        self.delete_end = 0
        self.delete_start = 0
        self.expire_end = 0
        self.expire_start = 0
        self.initial_items = self.final_items

        doc_ops = doc_ops or self.doc_ops
        self.mutations_to_validate = doc_ops

        if "read" in doc_ops:
            self.read_start = 0 if read_start is None else read_start
            if read_end is not None:
                self.read_end = read_end
            else:
                self.read_end = self.num_items * self.mutation_perc / 100

        if "update" in doc_ops:
            self.update_start = 0 if update_start is None else update_start
            if update_end is not None:
                self.update_end = update_end
            else:
                self.update_end = self.num_items * self.mutation_perc / 100
            self.mutate += 1

        if "delete" in doc_ops:
            if delete_start is not None:
                self.delete_start = delete_start
            else:
                self.delete_start = self.start
            if delete_end is not None:
                self.delete_end = delete_end
            else:
                self.delete_end = self.start \
                    + self.num_items * self.mutation_perc / 100
            self.final_items -= (self.delete_end - self.delete_start) \
                * self.num_collections * self.num_scopes

        if "expiry" in doc_ops:
            if self.maxttl == 0:
                self.maxttl = self.input.param("maxttl", 10)
            # Expiry range starts right after the delete range by default
            if expire_start is not None:
                self.expire_start = expire_start
            else:
                self.expire_start = self.delete_end
            if expire_end is not None:
                self.expire_end = expire_end
            else:
                self.expire_end = self.expire_start \
                    + self.num_items * self.mutation_perc / 100
            self.final_items -= (self.expire_end - self.expire_start) \
                * self.num_collections * self.num_scopes

        if "create" in doc_ops:
            if create_start is not None:
                self.create_start = create_start
            else:
                self.create_start = self.end
            self.start = self.create_start

            # By default create as many docs as were deleted / expired
            if create_end is not None:
                self.create_end = create_end
            else:
                self.create_end = self.end \
                    + (self.expire_end - self.expire_start) \
                    + (self.delete_end - self.delete_start)
            self.end = self.create_end

            self.final_items += abs(self.create_end - self.create_start) \
                * self.num_collections * self.num_scopes

        print("Read Start: %s" % self.read_start)
        print("Read End: %s" % self.read_end)
        print("Update Start: %s" % self.update_start)
        print("Update End: %s" % self.update_end)
        print("Expiry Start: %s" % self.expire_start)
        print("Expiry End: %s" % self.expire_end)
        print("Delete Start: %s" % self.delete_start)
        print("Delete End: %s" % self.delete_end)
        print("Create Start: %s" % self.create_start)
        print("Create End: %s" % self.create_end)
        print("Final Start: %s" % self.start)
        print("Final End: %s" % self.end)

    def _iter_user_collections(self):
        """
        Yield (bucket, scope, collection) for every collection of
        self.cluster.buckets except '_default._default'.
        """
        for bucket in self.cluster.buckets:
            for scope in bucket.scopes.keys():
                for collection in bucket.scopes[scope].collections.keys():
                    if collection == "_default" and scope == "_default":
                        continue
                    yield bucket, scope, collection

    def _loader_dict(self, cmd=None):
        """
        Build self.loader_map with one DocumentGenerator per collection,
        keyed by '<bucket.name><scope><collection>'.

        :param cmd: Optional dict overriding the workload settings
                    (keyPrefix, keySize, docSize, cr, rd, up, dl, ex,
                    workers, ops, loadType, keyType, valueType, validate,
                    gtm, deleted, mutated)
        """
        cmd = dict() if cmd is None else cmd
        self.loader_map = dict()
        for bucket, scope, collection in self._iter_user_collections():
            ws = WorkLoadSettings(cmd.get("keyPrefix", self.key),
                                  cmd.get("keySize", self.key_size),
                                  cmd.get("docSize", self.doc_size),
                                  cmd.get("cr", self.create_perc),
                                  cmd.get("rd", self.read_perc),
                                  cmd.get("up", self.update_perc),
                                  cmd.get("dl", self.delete_perc),
                                  cmd.get("ex", self.expiry_perc),
                                  cmd.get("workers",
                                          self.process_concurrency),
                                  cmd.get("ops", self.ops_rate),
                                  cmd.get("loadType", None),
                                  cmd.get("keyType", None),
                                  cmd.get("valueType", None),
                                  cmd.get("validate", False),
                                  cmd.get("gtm", False),
                                  cmd.get("deleted", False),
                                  cmd.get("mutated", 0))
            hm = HashMap()
            hm.putAll({DRConstants.create_s: self.create_start,
                       DRConstants.create_e: self.create_end,
                       DRConstants.update_s: self.update_start,
                       DRConstants.update_e: self.update_end,
                       DRConstants.expiry_s: self.expire_start,
                       DRConstants.expiry_e: self.expire_end,
                       DRConstants.delete_s: self.delete_start,
                       DRConstants.delete_e: self.delete_end,
                       DRConstants.read_s: self.read_start,
                       DRConstants.read_e: self.read_end})
            ws.dr = DocRange(hm)
            dg = DocumentGenerator(ws, self.key_type, self.val_type)
            self.loader_map.update({bucket.name + scope + collection: dg})

    def wait_for_doc_load_completion(self, tasks, wait_for_stats=True):
        """
        Wait for all doc-loading tasks, retry their failed mutations once
        and assert that every task succeeded.

        :param tasks: Loader tasks submitted to self.doc_loading_tm
        :param wait_for_stats: If True, wait for (and optionally verify)
                               the bucket stats once the tasks are done
        """
        self.doc_loading_tm.getAllTaskResult()
        self.get_memory_footprint()
        for task in tasks:
            task.result = True
            unique_str = "{}:{}:{}:".format(task.sdk.bucket,
                                            task.sdk.scope,
                                            task.sdk.collection)
            for optype, failures in task.failedMutations.items():
                for failure in failures:
                    if failure is None:
                        continue
                    print("Test Retrying {}: {}{} -> {}".format(
                        optype, unique_str, failure.id(),
                        failure.err().getClass().getSimpleName()))
                    try:
                        if optype == "create":
                            task.docops.insert(failure.id(),
                                               failure.document(),
                                               task.sdk.connection,
                                               task.setOptions)
                        elif optype == "update":
                            task.docops.upsert(failure.id(),
                                               failure.document(),
                                               task.sdk.connection,
                                               task.upsertOptions)
                        elif optype == "delete":
                            task.docops.delete(failure.id(),
                                               task.sdk.connection,
                                               task.removeOptions)
                    except (ServerOutOfMemoryException,
                            TimeoutException) as e:
                        print("Retry {} failed for key: {} - {}".format(
                            optype, failure.id(), e))
                        task.result = False
                    except (DocumentNotFoundException,
                            DocumentExistsException):
                        # Doc state already matches the retried operation
                        pass
            try:
                task.sdk.disconnectCluster()
            except Exception as e:
                print(e)
            self.assertTrue(task.result,
                            "Task Failed: {}".format(task.taskName))
        if wait_for_stats:
            try:
                self.bucket_util._wait_for_stats_all_buckets(
                    self.cluster, self.cluster.buckets, timeout=14400)
                if self.track_failures:
                    self.bucket_util.verify_stats_all_buckets(
                        self.cluster, self.final_items, timeout=14400)
            except Exception:
                if not self.cluster.cloud_cluster:
                    self.get_gdb()
                # Bare raise keeps the original traceback
                raise

    def get_gdb(self):
        """Print the gdb backtrace of memcached for every cluster node."""
        for node in self.cluster.nodes_in_cluster:
            gdb_shell = RemoteMachineShellConnection(node)
            try:
                gdb_out = gdb_shell.execute_command(
                    'gdb -p `(pidof memcached)` -ex "thread apply all bt" -ex detach -ex quit')[0]
                print(node.ip)
                print(gdb_out)
            finally:
                # Release the connection even if the command fails
                gdb_shell.disconnect()

    def data_validation(self):
        """
        Read back the create / update / delete ranges of the last
        generate_docs() call and assert that all validation tasks pass.

        Validation is performed only when self._data_validation is set.
        """
        self.get_memory_footprint()
        doc_ops = self.mutations_to_validate
        # Cap the number of validation workers
        pc = min(self.process_concurrency, 20)
        if self._data_validation:
            self.log.info("Validating Active/Replica Docs")
            self.ops_rate = self.input.param("ops_rate", 2000)
            master = Server(self.cluster.master.ip,
                            self.cluster.master.port,
                            self.cluster.master.rest_username,
                            self.cluster.master.rest_password,
                            str(self.cluster.master.memcached_port))
            # op_type -> key range which has to be read back
            ranges_to_read = {
                "create": (self.create_start, self.create_end),
                "update": (self.update_start, self.update_end),
                "delete": (self.delete_start, self.delete_end)}
            self.loader_map = dict()
            for bucket, scope, collection in self._iter_user_collections():
                for op_type in doc_ops:
                    if op_type not in ranges_to_read:
                        continue
                    read_start, read_end = ranges_to_read[op_type]
                    hm = HashMap()
                    hm.putAll({DRConstants.read_s: read_start,
                               DRConstants.read_e: read_end})
                    ws = WorkLoadSettings(
                        self.key,               # keyPrefix
                        self.key_size,          # keySize
                        self.doc_size,          # docSize
                        0,                      # cr
                        100,                    # rd
                        0,                      # up
                        0,                      # dl
                        0,                      # ex
                        pc,                     # workers
                        self.ops_rate,          # ops
                        None,                   # loadType
                        None,                   # keyType
                        None,                   # valueType
                        True,                   # validate
                        False,                  # gtm
                        op_type == "delete",    # deleted
                        0)                      # mutated
                    ws.dr = DocRange(hm)
                    dg = DocumentGenerator(ws, self.key_type, self.val_type)
                    self.loader_map.update(
                        {bucket.name + scope + collection + op_type: dg})

            tasks = list()
            i = pc
            while i > 0:
                for bucket, scope, collection \
                        in self._iter_user_collections():
                    for op_type in doc_ops:
                        if op_type not in ["create", "update", "delete"]:
                            continue
                        client = NewSDKClient(master, bucket.name,
                                              scope, collection)
                        client.initialiseSDK()
                        self.sleep(1)
                        taskName = "Validate_%s_%s_%s_%s_%s_%s" % (
                            bucket.name, scope, collection, op_type,
                            str(i), time.time())
                        task = WorkLoadGenerate(
                            taskName,
                            self.loader_map[bucket.name + scope
                                            + collection + op_type],
                            client, "NONE",
                            self.maxttl, self.time_unit,
                            self.track_failures, 0)
                        tasks.append(task)
                        self.doc_loading_tm.submit(task)
                i -= 1
            self.doc_loading_tm.getAllTaskResult()
            for task in tasks:
                try:
                    task.sdk.disconnectCluster()
                except Exception as e:
                    print(e)
            for task in tasks:
                self.assertTrue(task.result,
                                "Validation Failed for: %s" % task.taskName)
            # NOTE(review): assumed to be part of the validation branch -
            # confirm against the original layout
            self.get_memory_footprint()

    def print_crud_stats(self):
        """Display a table with the item counts and the last CRUD ranges."""
        self.table = TableView(self.log.info)
        self.table.set_headers(["Initial Items",
                                "Current Items",
                                "Items Updated",
                                "Items Created",
                                "Items Deleted",
                                "Items Expired"])
        self.table.add_row([
            str(self.initial_items),
            str(self.final_items),
            str(abs(self.update_start)) + "-" + str(abs(self.update_end)),
            str(abs(self.create_start)) + "-" + str(abs(self.create_end)),
            str(abs(self.delete_start)) + "-" + str(abs(self.delete_end)),
            str(abs(self.expire_start)) + "-" + str(abs(self.expire_end))])
        self.table.display("Docs statistics")

    def perform_load(self, crash=False, num_kills=1, wait_for_load=True,
                     validate_data=True):
        """
        Submit self.process_concurrency loader tasks per collection using
        the ranges prepared by generate_docs().

        :param crash: Call self.kill_memcached() once the load is done
        :param num_kills: Forwarded to self.kill_memcached()
        :param wait_for_load: If False, return the submitted tasks
                              immediately without waiting / validating
        :param validate_data: Run data_validation() after the load
        :return tasks: Only when wait_for_load is False
        """
        self.get_memory_footprint()
        self._loader_dict()
        master = Server(self.cluster.master.ip, self.cluster.master.port,
                        self.cluster.master.rest_username,
                        self.cluster.master.rest_password,
                        str(self.cluster.master.memcached_port))
        tasks = list()
        i = self.process_concurrency
        while i > 0:
            for bucket, scope, collection in self._iter_user_collections():
                client = NewSDKClient(master, bucket.name,
                                      scope, collection)
                client.initialiseSDK()
                self.sleep(1)
                self.get_memory_footprint()
                taskName = "Loader_%s_%s_%s_%s_%s" % (
                    bucket.name, scope, collection, str(i), time.time())
                task = WorkLoadGenerate(
                    taskName,
                    self.loader_map[bucket.name + scope + collection],
                    client, self.durability_level,
                    self.maxttl, self.time_unit,
                    self.track_failures, 0)
                tasks.append(task)
                self.doc_loading_tm.submit(task)
            i -= 1

        if not wait_for_load:
            return tasks

        self.wait_for_doc_load_completion(tasks)
        self.get_memory_footprint()

        if crash:
            self.kill_memcached(num_kills=num_kills)

        if validate_data:
            self.data_validation()

        self.print_stats()

        if self.cluster.cloud_cluster:
            return

        result = self.check_coredump_exist(self.cluster.nodes_in_cluster)
        if result:
            self.PrintStep(
                "CRASH | CRITICAL | WARN messages found in cb_logs")
            if self.assert_crashes_on_load:
                self.task_manager.abort_all_tasks()
                self.doc_loading_tm.abortAllTasks()
                self.assertFalse(result)
{}MB".format(kvstore)) self.log.debug("Total Disk usage for wal is {}MB".format(wal)) self.log.debug("Total Disk usage for keyTree is {}MB".format(keyTree)) self.log.debug("Total Disk usage for seqTree is {}MB".format(seqTree)) return kvstore, wal, keyTree, seqTree def print_stats(self): self.bucket_util.print_bucket_stats(self.cluster) self.cluster_util.print_cluster_stats(self.cluster) self.print_crud_stats() for bucket in self.cluster.buckets: self.get_bucket_dgm(bucket) if bucket.storageBackend == Bucket.StorageBackend.magma and not self.cluster.cloud_cluster: self.get_magma_disk_usage(bucket) self.check_fragmentation_using_magma_stats(bucket) self.check_fragmentation_using_kv_stats(bucket) def PrintStep(self, msg=None): print "\n" print "\t", "#" * 60 print "\t", "#" print "\t", "# %s" % msg print "\t", "#" print "\t", "#" * 60 print "\n" def check_fragmentation_using_kv_stats(self, bucket, servers=None): result = dict() if servers is None: servers = self.cluster.kv_nodes + [self.cluster.master] if type(servers) is not list: servers = [servers] for server in servers: frag_val = self.bucket_util.get_fragmentation_kv( self.cluster, bucket, server) self.log.debug("Current Fragmentation for node {} is {} \ ".format(server.ip, frag_val)) result.update({server.ip: frag_val}) self.log.info("KV stats fragmentation values {}".format(result)) def dump_magma_stats(self, server, bucket, shard, kvstore): if bucket.storageBackend != Bucket.StorageBackend.magma or self.cluster.cloud_cluster: return shell = RemoteMachineShellConnection(server) data_path = RestConnection(server).get_data_path() while not self.stop_stats: for bucket in self.cluster.buckets: self.log.info( self.get_magma_stats(bucket, server, "rw_0:magma")) self.dump_seq_index(shell, data_path, bucket.name, shard, kvstore) self.sleep(600) shell.disconnect() def dump_seq_index(self, shell, data_path, bucket, shard, kvstore): magma_path = os.path.join(data_path, bucket, "magma.{}") magma = magma_path.format(shard) 
cmd = '/opt/couchbase/bin/magma_dump {}'.format(magma) cmd += ' --kvstore {} --tree seq'.format(kvstore) result = shell.execute_command(cmd)[0] self.log.info("Seq Tree for {}:{}:{}:{}: \n{}".format( shell.ip, bucket, shard, kvstore, result)) def check_fragmentation_using_magma_stats(self, bucket, servers=None): result = dict() stats = list() if servers is None: servers = self.cluster.kv_nodes + [self.cluster.master] if type(servers) is not list: servers = [servers] for server in servers: fragmentation_values = list() shell = RemoteMachineShellConnection(server) output = shell.execute_command( "lscpu | grep 'CPU(s)' | head -1 | awk '{print $2}'" )[0][0].split('\n')[0] shell.disconnect() self.log.debug("machine: {} - core(s): {}".format( server.ip, output)) for i in range(min(int(output), 64)): grep_field = "rw_{}:magma".format(i) _res = self.get_magma_stats(bucket, server) fragmentation_values.append( json.loads(_res[server.ip][grep_field])["Fragmentation"]) stats.append(_res) result.update({server.ip: fragmentation_values}) self.log.info(stats[0]) res = list() for value in result.values(): res.append(max(value)) if max(res) < float(self.fragmentation) / 100: self.log.info("magma stats fragmentation result {} \ ".format(result)) return True self.log.info("magma stats fragmentation result {} \ ".format(result)) return False def get_magma_stats(self, bucket, server=None): magma_stats_for_all_servers = dict() cbstat_obj = Cbstats(server) result = cbstat_obj.magma_stats(bucket.name) magma_stats_for_all_servers[server.ip] = result return magma_stats_for_all_servers def pause_rebalance(self): rest = RestConnection(self.cluster.master) i = 1 self.sleep(10, "Let the rebalance begin!") expected_progress = 20 while expected_progress < 100: expected_progress = 20 * i reached = self.cluster_util.rebalance_reached( rest, expected_progress) self.assertTrue( reached, "Rebalance failed or did not reach {0}%".format( expected_progress)) if not 
self.cluster_util.is_cluster_rebalanced(rest): self.log.info("Stop the rebalance") stopped = rest.stop_rebalance(wait_timeout=self.wait_timeout / 3) self.assertTrue(stopped, msg="Unable to stop rebalance") rebalance_task = self.task.async_rebalance( self.cluster, [], [], retry_get_process_num=3000) self.sleep( 10, "Rebalance % ={}. Let the rebalance begin!".format( expected_progress)) i += 1 return rebalance_task def abort_rebalance(self, rebalance, error_type="kill_memcached"): self.sleep(30, "Let the rebalance begin!") rest = RestConnection(self.cluster.master) i = 1 expected_progress = 20 rebalance_task = rebalance while expected_progress < 80: expected_progress = 20 * i reached = self.cluster_util.rebalance_reached(rest, expected_progress, wait_step=10, num_retry=3600) self.assertTrue( reached, "Rebalance failed or did not reach {0}%".format( expected_progress)) if not self.cluster_util.is_cluster_rebalanced(rest): self.log.info("Abort rebalance") self._induce_error(error_type, self.cluster.nodes_in_cluster) result = self.check_coredump_exist( self.cluster.nodes_in_cluster) if result: self.task_manager.abort_all_tasks() self.doc_loading_tm.abortAllTasks() self.assertFalse( result, "CRASH | CRITICAL | WARN messages found in cb_logs") self.sleep(60, "Sleep after error introduction") self._recover_from_error(error_type) result = self.check_coredump_exist( self.cluster.nodes_in_cluster) if result: self.task_manager.abort_all_tasks() self.doc_loading_tm.abortAllTasks() self.assertFalse( result, "CRASH | CRITICAL | WARN messages found in cb_logs") try: self.task_manager.get_task_result(rebalance_task) except RebalanceFailedException: pass if rebalance.result: self.log.error( "Rebalance passed/finished which is not expected") self.log.info( "Rebalance % after rebalance finished = {}".format( expected_progress)) return None else: self.log.info( "Restarting Rebalance after killing at {}".format( expected_progress)) rebalance_task = self.task.async_rebalance( 
self.cluster, [], self.servs_out, retry_get_process_num=3000) self.sleep(120, "Let the rebalance begin after abort") self.log.info("Rebalance % = {}".format( self.rest._rebalance_progress())) i += 1 return rebalance_task def crash_memcached(self, nodes=None, num_kills=1, graceful=False): self.stop_crash = False self.crash_count = 0 if not nodes: nodes = self.cluster.kv_nodes + [self.cluster.master] while not self.stop_crash: self.get_memory_footprint() sleep = random.randint(60, 120) self.sleep( sleep, "Iteration:{} waiting to kill memc on all nodes".format( self.crash_count)) self.kill_memcached(nodes, num_kills=num_kills, graceful=graceful, wait=True) self.crash_count += 1 if self.crash_count > self.crashes: self.stop_crash = True self.sleep(300) def kill_memcached(self, servers=None, num_kills=1, graceful=False, wait=True): if not servers: servers = self.cluster.kv_nodes + [self.cluster.master] for server in servers: for _ in xrange(num_kills): if num_kills > 1: self.sleep( 2, "Sleep for 2 seconds b/w cont memc kill on same node.") shell = RemoteMachineShellConnection(server) if graceful: shell.restart_couchbase() else: shell.kill_memcached() shell.disconnect() self.sleep( 5, "Sleep for 5 seconds before killing memc on next node.") result = self.check_coredump_exist(self.cluster.nodes_in_cluster) if result: self.stop_crash = True self.task_manager.abort_all_tasks() self.doc_loading_tm.abortAllTasks() self.assertFalse( result, "CRASH | CRITICAL | WARN messages found in cb_logs") if wait: for server in servers: self.check_warmup_complete(server) def check_warmup_complete(self, server): for bucket in self.cluster.buckets: start_time = time.time() result = self.bucket_util._wait_warmup_completed( [server], self.cluster.buckets[0], wait_time=self.wait_timeout * 20) if not result: self.stop_crash = True self.task_manager.abort_all_tasks() self.doc_loading_tm.abortAllTasks() self.assertTrue( result, "Warm-up failed in %s seconds" % (self.wait_timeout * 20)) else: 
self.log.info("Bucket:%s warm-up completed in %s." % (bucket.name, str(time.time() - start_time))) def set_num_writer_and_reader_threads(self, num_writer_threads="default", num_reader_threads="default", num_storage_threads="default"): bucket_helper = BucketHelper(self.cluster.master) bucket_helper.update_memcached_settings( num_writer_threads=num_writer_threads, num_reader_threads=num_reader_threads, num_storage_threads=num_storage_threads)
def test_doc_size(self):
    """
    Basic tests for document CRUD operations using JSON docs

    Flow:
    1. Load `self.num_items` docs into the first bucket and validate the
       load (retry-exception handling, vbucket-details stats, doc count)
    2. Run the operation selected by the `doc_op` test param
       ("update" / "delete") over the key range [0, num_items / 2)
       and read the same range back to confirm the operation took effect
    3. Re-validate the vbucket-details stats and the final doc count

    Any other `doc_op` value is only logged as unsupported; steps 1 and 3
    still run. The test fails through self.fail / self.assertTrue.
    """

    def check_durability_failures():
        # `task` is resolved when this helper is called, so it always
        # inspects the most recently completed load task
        self.log.error(task.sdk_acked_curd_failed.keys())
        self.log.error(task.sdk_exception_crud_succeed.keys())
        self.assertTrue(
            len(task.sdk_acked_curd_failed) == 0,
            "Durability failed for docs: %s"
            % task.sdk_acked_curd_failed.keys())
        # Report the collection that is actually being asserted on
        self.assertTrue(
            len(task.sdk_exception_crud_succeed) == 0,
            "Durability failed for docs: %s"
            % task.sdk_exception_crud_succeed.keys())

    def validate_vbucket_details(stat_names):
        # Copy the requested reference values into the (cumulative)
        # expectation dict and compare it against cbstats vbucket-details
        for stat_name in stat_names:
            verification_dict[stat_name] = ref_val[stat_name]
        if self.durability_level:
            # Sync-write counters are validated only for durable writes
            verification_dict["sync_write_aborted_count"] = \
                ref_val["sync_write_aborted_count"]
            verification_dict["sync_write_committed_count"] = \
                ref_val["sync_write_committed_count"]

        failed = self.durability_helper.verify_vbucket_details_stats(
            def_bucket, self.cluster_util.get_kv_nodes(),
            vbuckets=self.vbuckets, expected_val=verification_dict,
            one_less_node=one_less_node)
        if failed:
            self.fail("Cbstat vbucket-details verification failed")

    doc_op = self.input.param("doc_op", None)
    def_bucket = self.bucket_util.buckets[0]
    ignore_exceptions = list()
    retry_exceptions = list()

    # Stat validation reference variables
    verification_dict = dict()
    ref_val = dict()
    ref_val["ops_create"] = 0
    ref_val["ops_update"] = 0
    ref_val["ops_delete"] = 0
    ref_val["rollback_item_count"] = 0
    ref_val["sync_write_aborted_count"] = 0
    ref_val["sync_write_committed_count"] = 0

    one_less_node = self.nodes_init == self.num_replicas

    # Normalise a single target vbucket into a list
    if self.target_vbucket and not isinstance(self.target_vbucket, list):
        self.target_vbucket = [self.target_vbucket]

    self.log.info("Creating doc_generator..")
    # Load basic docs into bucket
    doc_create = doc_generator(self.key, 0, self.num_items,
                               doc_size=self.doc_size,
                               doc_type=self.doc_type,
                               target_vbucket=self.target_vbucket,
                               vbuckets=self.vbuckets)
    self.log.info("Loading {0} docs into the bucket: {1}".format(
        self.num_items, def_bucket))
    task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, doc_create, "create", 0,
        batch_size=self.batch_size,
        process_concurrency=self.process_concurrency,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout,
        ryow=self.ryow,
        check_persistence=self.check_persistence)
    self.task.jython_task_manager.get_task_result(task)

    if self.ryow:
        check_durability_failures()

    # Retry doc_exception code
    self.log.info("Validating failed doc's (if any) exceptions")
    doc_op_info_dict = dict()
    doc_op_info_dict[task] = self.bucket_util.get_doc_op_info_dict(
        def_bucket, "create", exp=0,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to,
        durability=self.durability_level,
        timeout=self.sdk_timeout,
        time_unit="seconds",
        ignore_exceptions=ignore_exceptions,
        retry_exceptions=retry_exceptions)
    self.bucket_util.verify_doc_op_task_exceptions(doc_op_info_dict,
                                                   self.cluster)

    unwanted_failures = doc_op_info_dict[task]["unwanted"]["fail"]
    if len(unwanted_failures.keys()) != 0:
        self.fail("Failures in retry doc CRUDs: {0}".format(
            unwanted_failures))

    self.log.info("Wait for ep_all_items_remaining to become '0'")
    self.bucket_util._wait_for_stats_all_buckets()

    # Update ref_val
    ref_val["ops_create"] = self.num_items + len(task.fail.keys())
    ref_val["sync_write_committed_count"] = self.num_items

    # Validate vbucket stats
    validate_vbucket_details(["ops_create", "rollback_item_count"])

    # Verify initial doc load count
    self.log.info("Validating doc_count in buckets")
    self.bucket_util.verify_stats_all_buckets(self.num_items)

    self.log.info("Creating doc_generator for doc_op")
    num_item_start_for_crud = int(self.num_items / 2)
    doc_update = doc_generator(self.key, 0, num_item_start_for_crud,
                               doc_size=self.doc_size,
                               doc_type=self.doc_type,
                               target_vbucket=self.target_vbucket,
                               vbuckets=self.vbuckets)

    expected_num_items = self.num_items

    if doc_op == "update":
        self.log.info("Performing 'update' mutation over the docs")
        task = self.task.async_load_gen_docs(
            self.cluster, def_bucket, doc_update, "update", 0,
            batch_size=self.batch_size,
            process_concurrency=self.process_concurrency,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            ryow=self.ryow,
            check_persistence=self.check_persistence)
        self.task.jython_task_manager.get_task_result(task)
        ref_val["ops_update"] = (doc_update.end - doc_update.start
                                 + len(task.fail.keys()))
        if self.durability_level:
            ref_val["sync_write_committed_count"] += \
                (doc_update.end - doc_update.start)
        if self.ryow:
            check_durability_failures()

        # Read all the values to validate update operation
        task = self.task.async_load_gen_docs(
            self.cluster, def_bucket, doc_update, "read", 0,
            batch_size=self.batch_size,
            process_concurrency=self.process_concurrency,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(task)

        op_failed_tbl = TableView(self.log.error)
        op_failed_tbl.set_headers(["Update failed key", "CAS", "Value"])
        for key, value in task.success.items():
            # Every doc updated exactly once must carry mutated == 1
            if json.loads(str(value["value"]))["mutated"] != 1:
                op_failed_tbl.add_row([key, value["cas"], value["value"]])

        op_failed_tbl.display("Update failed for keys:")
        if len(op_failed_tbl.rows) != 0:
            self.fail("Update failed for few keys")
    elif doc_op == "delete":
        self.log.info("Performing 'delete' mutation over the docs")
        task = self.task.async_load_gen_docs(
            self.cluster, def_bucket, doc_update, "delete", 0,
            batch_size=self.batch_size,
            process_concurrency=self.process_concurrency,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            ryow=self.ryow,
            check_persistence=self.check_persistence)
        self.task.jython_task_manager.get_task_result(task)
        # Docs left = loaded docs minus the deleted range. The previous
        # `num_items - (num_items - n)` form evaluated to the deleted
        # count, which only matches when num_items is even
        expected_num_items = self.num_items - num_item_start_for_crud
        ref_val["ops_delete"] = (doc_update.end - doc_update.start
                                 + len(task.fail.keys()))
        if self.durability_level:
            ref_val["sync_write_committed_count"] += \
                (doc_update.end - doc_update.start)
        if self.ryow:
            check_durability_failures()

        # Read all the values to validate delete operation
        task = self.task.async_load_gen_docs(self.cluster, def_bucket,
                                             doc_update, "read", 0,
                                             batch_size=10,
                                             process_concurrency=8,
                                             timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(task)

        op_failed_tbl = TableView(self.log.error)
        op_failed_tbl.set_headers(["Delete failed key", "CAS", "Value"])
        # Any key that can still be read was not deleted
        for key, value in task.success.items():
            op_failed_tbl.add_row([key, value["cas"], value["value"]])

        op_failed_tbl.display("Delete failed for keys:")
        if len(op_failed_tbl.rows) != 0:
            self.fail("Delete failed for few keys")
    else:
        self.log.warning("Unsupported doc_operation")

    self.log.info("Wait for ep_all_items_remaining to become '0'")
    self.bucket_util._wait_for_stats_all_buckets()

    # Validate vbucket stats
    validate_vbucket_details(["ops_create", "ops_update", "ops_delete",
                              "rollback_item_count"])

    self.log.info("Validating doc_count")
    self.bucket_util.verify_stats_all_buckets(expected_num_items)
def test_timeout_with_crud_failures(self):
    """
    Test to make sure timeout is handled in durability calls
    and no documents are loaded when durability cannot be met using
    error simulation in server node side

    This will validate failure in majority of nodes, where durability will
    surely fail for all CRUDs

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify no operations succeeds
    4. Revert the error scenario from the cluster to resume durability
    5. Validate all mutations are succeeded after reverting
       the error condition

    Note: self.sdk_timeout values is considered as 'seconds'

    The remote shell connections opened for the target nodes are closed
    in a `finally` block, so they are released even when an intermediate
    step (e.g. validate_test_failure) raises.
    """

    # Local method to validate vb_seqno
    def validate_vb_seqno_stats():
        """
        Refresh the "post_timeout" vbucket-seqno snapshot of the node
        currently bound to `node` in the enclosing scope and compare it
        with the "init" snapshot.

        :return retry_validation: Boolean denoting to retry validation
        """
        retry_validation = False
        init_stats = vb_info["init"][node.ip]
        post_stats = cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        vb_info["post_timeout"][node.ip] = post_stats
        active_vbs = target_nodes_vbuckets[Bucket.vBucket.ACTIVE]
        replica_vbs = target_nodes_vbuckets[Bucket.vBucket.REPLICA]

        for vb_num in range(self.cluster.vbuckets):
            vb_id = str(vb_num)
            # Only vbuckets present in the initial snapshot are compared
            if vb_id not in init_stats:
                continue

            init_seqno = init_stats[vb_id]
            post_seqno = post_stats[vb_id]
            if vb_id not in affected_vbs:
                # Untouched vbuckets must not show any stat change
                if init_seqno != post_seqno:
                    self.log_failure(
                        "Unaffected vb-%s stat updated: %s != %s"
                        % (vb_id, init_seqno, post_seqno))
            elif vb_num in active_vbs:
                if init_seqno != post_seqno:
                    self.log.warning(
                        err_msg % (node.ip, Bucket.vBucket.ACTIVE,
                                   vb_id, init_seqno, post_seqno))
            elif vb_num in replica_vbs:
                # Unchanged replica stats: ask the caller to retry
                if init_seqno == post_seqno:
                    retry_validation = True
                    self.log.warning(
                        err_msg % (node.ip, Bucket.vBucket.REPLICA,
                                   vb_id, init_seqno, post_seqno))
        return retry_validation

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    target_nodes_vbuckets = dict()
    vb_info = dict()
    tasks = dict()
    doc_gen = dict()
    affected_vbs = set()

    target_nodes_vbuckets[Bucket.vBucket.ACTIVE] = list()
    target_nodes_vbuckets[Bucket.vBucket.REPLICA] = list()
    vb_info["init"] = dict()
    vb_info["post_timeout"] = dict()
    vb_info["afterCrud"] = dict()

    # Override crud_batch_size to minimum value for testing
    self.crud_batch_size = 5

    target_nodes = self.getTargetNodes()
    try:
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(node)
            target_nodes_vbuckets[Bucket.vBucket.ACTIVE] += \
                cbstat_obj[node.ip].vbucket_list(
                    self.bucket.name,
                    vbucket_type=Bucket.vBucket.ACTIVE)
            target_nodes_vbuckets[Bucket.vBucket.REPLICA] += \
                cbstat_obj[node.ip].vbucket_list(
                    self.bucket.name,
                    vbucket_type=Bucket.vBucket.REPLICA)
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])

        curr_time = int(time.time())
        expected_timeout = curr_time + self.sdk_timeout

        # Target the replica vbuckets only for persist-to-majority runs
        # on a multi-node cluster; otherwise target the active vbuckets
        target_vbs = target_nodes_vbuckets[Bucket.vBucket.ACTIVE]
        if self.nodes_init != 1 and self.durability_level \
                == Bucket.DurabilityLevel.PERSIST_TO_MAJORITY:
            target_vbs = target_nodes_vbuckets[Bucket.vBucket.REPLICA]

        # Create required doc_generators
        doc_gen["insert"] = sub_doc_generator(
            self.key, int(self.num_items / 2), self.crud_batch_size,
            target_vbucket=target_vbs,
            key_size=self.key_size)
        doc_gen["remove"] = sub_doc_generator_for_edit(
            self.key, 0, self.crud_batch_size,
            key_size=self.key_size,
            template_index=2,
            target_vbucket=target_vbs)
        doc_gen["read"] = sub_doc_generator_for_edit(
            self.key, 0, self.crud_batch_size,
            key_size=self.key_size,
            template_index=0,
            target_vbucket=target_vbs)
        doc_gen["upsert"] = sub_doc_generator_for_edit(
            self.key, int(self.num_items / 4), self.crud_batch_size,
            key_size=self.key_size,
            template_index=1,
            target_vbucket=target_vbs)

        # Tasks are created un-started so that the error can be induced
        # before any of them begins
        for op_type in doc_gen.keys():
            tasks[op_type] = self.task.async_load_gen_sub_docs(
                self.cluster, self.bucket, doc_gen[op_type], op_type, 0,
                path_create=True,
                batch_size=1, process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                start_task=False)

        # Perform specified action
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

        for op_type in doc_gen.keys():
            self.task_manager.add_new_task(tasks[op_type])

        # Wait for document_loader tasks to complete
        for op_type in doc_gen.keys():
            self.task.jython_task_manager.get_task_result(tasks[op_type])

            # Validate task failures
            if op_type == DocLoading.Bucket.DocOps.READ:
                # Validation for read task
                if len(tasks[op_type].fail.keys()) != 0:
                    self.log_failure("Read failed for few docs: %s"
                                     % tasks[op_type].fail.keys())
            else:
                # Validation of CRUDs - Update / Create / Delete
                for doc_id, crud_result in tasks[op_type].fail.items():
                    vb_num = self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.cluster.vbuckets)
                    if SDKException.DurabilityAmbiguousException \
                            not in str(crud_result["error"]):
                        self.log_failure(
                            "Invalid exception for doc %s, vb %s: %s"
                            % (doc_id, vb_num, crud_result))

        # Revert the specified error scenario
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # Check whether the timeout triggered properly
        if int(time.time()) < expected_timeout:
            self.log_failure("Timed-out before expected time")

        # Collect the vbuckets of whatever keys are still left in the
        # non-read generators
        for op_type in doc_gen.keys():
            if op_type == DocLoading.Bucket.DocOps.READ:
                continue
            while doc_gen[op_type].has_next():
                doc_id, _ = doc_gen[op_type].next()
                affected_vbs.add(
                    str(self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.cluster.vbuckets)))

        err_msg = "%s - mismatch in %s vb-%s seq_no: %s != %s"
        # Fetch latest stats and validate the seq_nos are not updated
        for node in target_nodes:
            retry_count = 0
            max_retry = 3
            while retry_count < max_retry:
                self.log.info("Trying to validate vbseq_no stats: %d"
                              % (retry_count + 1))
                retry_count += 1
                retry_required = validate_vb_seqno_stats()
                if not retry_required:
                    break
                self.sleep(5, "Sleep for vbseq_no stats to update")
            else:
                # This will be exited only if `break` condition is not met
                self.log_failure(
                    "validate_vb_seqno_stats verification failed")

        self.validate_test_failure()

        # If replicas+1 == total nodes, verify no mutation should have
        # succeeded with durability
        if self.nodes_init == self.num_replicas + 1:
            read_gen = doc_generator(self.key, 0, self.num_items)
            read_task = self.task.async_load_gen_docs(
                self.cluster, self.bucket, read_gen,
                DocLoading.Bucket.DocOps.READ, 0,
                batch_size=500, process_concurrency=1,
                timeout_secs=self.sdk_timeout)
            self.task_manager.get_task_result(read_task)

            failed_keys = TableView(self.log.error)
            failed_keys.set_headers(["Key", "Error"])
            half_of_num_items = self.num_items / 2
            for doc_key, doc_info in read_task.success.items():
                key_index = int(doc_key.split("-")[1])
                # Keys in the lower half are expected with mutated == 1,
                # the rest with mutated == 0
                expected_mutated_val = 0
                if key_index < half_of_num_items:
                    expected_mutated_val = 1
                mutated = json.loads(str(doc_info["value"]))["mutated"]
                if mutated != expected_mutated_val:
                    failed_keys.add_row([doc_key, doc_info])

            failed_keys.display("Affected mutations:")
            self.log.error(read_task.fail)

        # Doc error validation
        for op_type in doc_gen.keys():
            task = tasks[op_type]

            retry_task = self.task.async_load_gen_sub_docs(
                self.cluster, self.bucket, doc_gen[op_type], op_type, 0,
                path_create=True,
                batch_size=1, process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task_manager.get_task_result(retry_task)
            retry_failures = set(retry_task.fail.keys())
            initial_failures = set(task.fail.keys())

            # Only keys that failed in the retry but not initially count
            if retry_failures - initial_failures:
                self.log_failure(
                    "Docs failed during retry task for %s: %s"
                    % (op_type, retry_task.fail))

        # Verify doc count after expected CRUD failure
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)
        self.bucket_util.verify_stats_all_buckets(self.cluster,
                                                  self.num_items)

        # Fetch latest stats and validate the values are updated
        for node in target_nodes:
            vb_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
                self.log_failure("vBucket seq_no stats not updated")
    finally:
        # Disconnect the shell connection(s), including on failure paths
        for shell in shell_conn.values():
            shell.disconnect()

    self.validate_test_failure()
class volume(BaseTestCase):
    """
    Volume test: document load running alongside cluster topology changes.

    `test_volume_taf` drives a fixed sequence of steps (rebalance in/out/swap,
    replica count changes, memcached restart, hard failover followed by
    rebalance-out / full recovery / delta recovery).  During each step
    create/update/delete load is applied and validated afterwards.  The whole
    sequence is repeated `iterations` times.
    """

    # will add the __init__ functions after the test has been stabilised
    def setUp(self):
        """Read the test parameters and reset the per-run bookkeeping."""
        self.input = TestInputSingleton.input
        # NOTE(review): presumably keeps BaseTestCase.setUp from creating a
        # default bucket; buckets are created by create_required_buckets().
        self.input.test_params.update({"default_bucket": False})
        BaseTestCase.setUp(self)
        self.rest = RestConnection(self.servers[0])
        self.op_type = self.input.param("op_type", "create")
        self.tasks = []  # To have all tasks running in parallel.
        self._iter_count = 0  # To keep a check of how many items are deleted
        # Servers that are not part of the initial cluster
        self.available_servers = self.cluster.servers[self.nodes_init:]
        self.num_buckets = self.input.param("num_buckets", 1)
        self.mutate = 0
        self.doc_ops = self.input.param("doc_ops", None)
        if self.doc_ops:
            self.doc_ops = self.doc_ops.split(';')
        self.iterations = self.input.param("iterations", 2)
        self.vbucket_check = self.input.param("vbucket_check", True)
        self.new_num_writer_threads = self.input.param(
            "new_num_writer_threads", 6)
        self.new_num_reader_threads = self.input.param(
            "new_num_reader_threads", 8)

    def create_required_buckets(self):
        """
        Set the KV memory quota, create the bucket(s) and rebalance.

        The quota is the node's reserved memory minus the quotas of the other
        active services and a fixed 100 MB threshold.

        :return: the Bucket object that was created last
        """
        self.log.info("Get the available memory quota")
        self.info = self.rest.get_nodes_self()
        threshold_memory = 100
        available_memory = self.info.mcdMemoryReserved
        active_service = self.info.services

        # If the mentioned service is already present,
        # we remove that much memory from available memory quota
        service_quota_attrs = (("index", "indexMemoryQuota"),
                               ("fts", "ftsMemoryQuota"),
                               ("cbas", "cbasMemoryQuota"),
                               ("eventing", "eventingMemoryQuota"))
        for service, quota_attr in service_quota_attrs:
            if service in active_service:
                available_memory -= getattr(self.info, quota_attr)
        available_memory -= threshold_memory

        self.rest.set_service_memoryQuota(service='memoryQuota',
                                          memoryQuota=available_memory)

        # Creating buckets for data loading purpose
        self.log.info("Create CB buckets")
        duration = self.input.param("bucket_expiry", 0)
        eviction_policy = self.input.param(
            "eviction_policy", Bucket.EvictionPolicy.VALUE_ONLY)
        self.bucket_type = self.input.param(
            "bucket_type", Bucket.Type.MEMBASE)
        compression_mode = self.input.param(
            "compression_mode", Bucket.CompressionMode.PASSIVE)
        ramQuota = self.input.param("ramQuota", available_memory)
        bucket_names = self.input.param("bucket_names", "GleamBookUsers")

        # Every per-bucket setting is a ';' separated list
        if bucket_names:
            bucket_names = bucket_names.split(';')
        if self.bucket_type:
            self.bucket_type = self.bucket_type.split(';')
        if compression_mode:
            compression_mode = compression_mode.split(';')
        if eviction_policy:
            eviction_policy = eviction_policy.split(';')

        if self.num_buckets == 1:
            # A single bucket always gets this name and the whole quota
            names_and_quotas = [("GleamBookUsers", ramQuota)]
        elif 1 < self.num_buckets == len(bucket_names):
            quota_per_bucket = ramQuota/self.num_buckets
            names_and_quotas = [(name, quota_per_bucket)
                                for name in bucket_names]
        else:
            self.fail("Number of bucket/Names not sufficient")

        bucket = None
        for index, (name, quota) in enumerate(names_and_quotas):
            bucket = Bucket({"name": name,
                             "ramQuotaMB": quota,
                             "maxTTL": duration,
                             "replicaNumber": self.num_replicas,
                             "evictionPolicy": eviction_policy[index],
                             "bucketType": self.bucket_type[index],
                             "compressionMode": compression_mode[index]})
            self.bucket_util.create_bucket(bucket)

        # rebalance the new buckets across all nodes.
        self.log.info("Rebalance Starts")
        self.nodes = self.rest.node_statuses()
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[])
        self.rest.monitorRebalance()
        return bucket

    def set_num_writer_and_reader_threads(self, num_writer_threads="default",
                                          num_reader_threads="default"):
        """Apply the given memcached thread settings on every KV node."""
        for node in self.cluster_util.get_kv_nodes():
            bucket_helper = BucketHelper(node)
            bucket_helper.update_memcached_settings(
                num_writer_threads=num_writer_threads,
                num_reader_threads=num_reader_threads)

    def volume_doc_generator_users(self, key, start, end):
        """Return a GleamBook users document generator for [start, end)."""
        template = '{{ "id":"{0}", "alias":"{1}", "name":"{2}", "user_since":"{3}", "employment":{4} }}'
        return GleamBookUsersDocumentGenerator(key, template,
                                               start=start, end=end)

    def volume_doc_generator_messages(self, key, start, end):
        """Return a GleamBook messages document generator for [start, end)."""
        template = '{{ "message_id": "{0}", "author_id": "{1}", "send_time": "{2}" }}'
        return GleamBookMessagesDocumentGenerator(key, template,
                                                  start=start, end=end)

    def initial_data_load(self, initial_load):
        """
        Create the initial documents and wait for the load to finish.

        :param initial_load: document generator for the initial data set
        """
        if self.atomicity:
            task = self.task.async_load_gen_docs_atomicity(
                self.cluster, self.bucket_util.buckets,
                initial_load, "create", exp=0,
                batch_size=10,
                process_concurrency=self.process_concurrency,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                timeout_secs=self.sdk_timeout,
                retries=self.sdk_retries,
                update_count=self.mutate,
                transaction_timeout=self.transaction_timeout,
                commit=self.transaction_commit,
                durability=self.durability_level,
                sync=self.sync)
            self.task.jython_task_manager.get_task_result(task)
        else:
            tasks_info = self.bucket_util._async_load_all_buckets(
                self.cluster, initial_load, "create", exp=0,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                batch_size=10,
                pause_secs=5,
                timeout_secs=30,
                durability=self.durability_level,
                process_concurrency=self.process_concurrency,
                retries=self.sdk_retries)
            for task in tasks_info:
                self.task_manager.get_task_result(task)
        self.sleep(10)

    def _vol_scheduled_doc_ops(self):
        """
        Yield (op_type, generator) for every op that is both requested in
        `doc_ops` and has a generator, in the order update, create, delete.

        Implemented as a generator so each condition is evaluated only after
        the caller has finished handling the previous op type.
        """
        if "update" in self.doc_ops and self.gen_update_users is not None:
            yield "update", self.gen_update_users
        if "create" in self.doc_ops and self.gen_create_users is not None:
            yield "create", self.gen_create_users
        if "delete" in self.doc_ops and self.gen_delete_users is not None:
            yield "delete", self.gen_delete_users

    # Loading documents in 2 buckets in parallel through transactions
    def doc_load_using_txns(self):
        """Start one transaction load per scheduled op and wait for all."""
        for op_type, kv_gen in self._vol_scheduled_doc_ops():
            self.tasks.append(self.doc_loader_txn(op_type, kv_gen))
        self.sleep(20)
        for task in self.tasks:
            self.task.jython_task_manager.get_task_result(task)

    def doc_loader_txn(self, op_type, kv_gen):
        """
        Start a transactional (atomicity) load for a single op type.

        Unlike doc_loader(), the full `process_concurrency` is used for every
        op type; it is not split by the create/update/delete percentages.

        :param op_type: "create", "update" or "delete"
        :param kv_gen: document generator to load from
        :return: the started load task
        """
        if op_type == "update":
            print("Value of Mutated is", self.mutate)
            self.sleep(5)
        process_concurrency = self.process_concurrency
        task = self.task.async_load_gen_docs_atomicity(
            self.cluster, self.bucket_util.buckets,
            kv_gen, op_type, exp=0,
            batch_size=10,
            process_concurrency=process_concurrency,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            timeout_secs=self.sdk_timeout,
            retries=self.sdk_retries,
            update_count=self.mutate,
            transaction_timeout=self.transaction_timeout,
            commit=self.transaction_commit,
            durability=self.durability_level,
            sync=self.sync,
            defer=self.defer)
        return task

    # Loading documents through normal doc loader
    def normal_doc_loader(self):
        """
        Start a regular (non-transactional) load per scheduled op type.

        :return: dict merged from the task-info mappings of all started loads
        """
        tasks_info = dict()
        for op_type, kv_gen in self._vol_scheduled_doc_ops():
            task_info = self.doc_loader(op_type, kv_gen)
            tasks_info.update(task_info.items())
        return tasks_info

    def doc_loader(self, op_type, kv_gen):
        """
        Start an async load of `kv_gen` on all buckets for one op type.

        For create/update/delete the process concurrency is scaled by the
        op's share of the percentages of the ops taking part in the run.
        Side effect: the `*_perc` attribute of every *other* op type that is
        not listed in `doc_ops` is reset to 0.

        :param op_type: doc operation to perform
        :param kv_gen: document generator to load from
        :return: task-info mapping returned by `_async_load_all_buckets`
        """
        known_ops = ("create", "update", "delete")
        process_concurrency = self.process_concurrency
        if op_type in known_ops:
            # Ops that do not take part in this run get no share
            for other_op in known_ops:
                if other_op != op_type and other_op not in self.doc_ops:
                    setattr(self, "%s_perc" % other_op, 0)
            own_perc = getattr(self, "%s_perc" % op_type)
            # Raises ZeroDivisionError when all percentages are 0
            process_concurrency = (own_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)

        retry_exceptions = [
            SDKException.AmbiguousTimeoutException,
            SDKException.RequestCanceledException,
            SDKException.DurabilityAmbiguousException,
            SDKException.DurabilityImpossibleException,
        ]
        tasks_info = self.bucket_util._async_load_all_buckets(
            self.cluster, kv_gen, op_type, 0,
            batch_size=20,
            persist_to=self.persist_to,
            replicate_to=self.replicate_to,
            durability=self.durability_level,
            pause_secs=5,
            timeout_secs=30,
            process_concurrency=process_concurrency,
            retries=self.sdk_retries,
            retry_exceptions=retry_exceptions)
        return tasks_info

    # Stopping and restarting the memcached process
    def stop_process(self):
        """
        Simulate "stop_memcached" on the third server for 20 seconds and
        revert it.  The shell session is closed even if a step fails.
        """
        target_node = self.servers[2]
        error_to_simulate = "stop_memcached"
        remote = RemoteMachineShellConnection(target_node)
        try:
            error_sim = CouchbaseError(self.log, remote)
            # Induce the error condition
            error_sim.create(error_to_simulate)
            self.sleep(20, "Wait before reverting the error condition")
            # Revert the simulated error condition
            error_sim.revert(error_to_simulate)
        finally:
            # Close the ssh session on success and on failure alike
            remote.disconnect()

    def rebalance(self, nodes_in=0, nodes_out=0):
        """
        Start an async rebalance with randomly picked nodes and update the
        available/in-cluster server bookkeeping.

        :param nodes_in: number of nodes to add from `available_servers`
        :param nodes_out: number of non-master cluster nodes to remove
        :return: the started rebalance task
        """
        servs_in = random.sample(self.available_servers, nodes_in)

        # The master node is never a candidate for removal
        self.nodes_cluster = self.cluster.nodes_in_cluster[:]
        self.nodes_cluster.remove(self.cluster.master)
        servs_out = random.sample(self.nodes_cluster, nodes_out)

        if nodes_in == nodes_out:
            # NOTE(review): the flag is never switched back on, so every
            # later rebalance in the run also skips the shuffling check.
            self.vbucket_check = False

        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init], servs_in, servs_out,
            check_vbucket_shuffling=self.vbucket_check)

        self.available_servers = [servs for servs in self.available_servers
                                  if servs not in servs_in]
        self.available_servers += servs_out

        self.cluster.nodes_in_cluster.extend(servs_in)
        self.cluster.nodes_in_cluster = list(
            set(self.cluster.nodes_in_cluster) - set(servs_out))
        return rebalance_task

    def rebalance_validation(self, tasks_info, rebalance_task):
        """Fail the test (after draining the load tasks) if rebalance failed."""
        if not rebalance_task.result:
            for task in tasks_info:
                self.task.jython_task_manager.get_task_result(task)
            self.fail("Rebalance Failed")

    def data_validation(self, tasks_info):
        """
        Wait for the load tasks, assert no doc op failed, validate the docs
        of every generator on every bucket and (non-atomicity only) verify
        the expected item count.

        :param tasks_info: task-info mapping as returned by data_load()
        """
        if not self.atomicity:
            for task in tasks_info:
                self.task_manager.get_task_result(task)
            self.bucket_util.verify_doc_op_task_exceptions(tasks_info,
                                                           self.cluster)
            self.bucket_util.log_doc_ops_task_failures(tasks_info)

            self.sleep(10)

            for task, task_info in tasks_info.items():
                self.assertFalse(
                    task_info["ops_failed"],
                    "Doc ops failed for task: {}".format(task.thread_name))

        self.log.info("Validating Active/Replica Docs")
        # Replica validation is skipped for transaction loads
        self.check_replica = not self.atomicity

        for bucket in self.bucket_util.buckets:
            generators = ((self.gen_update_users, "update"),
                          (self.gen_create_users, "create"),
                          (self.gen_delete_users, "delete"))
            tasks = list()
            for kv_gen, op_type in generators:
                if kv_gen is not None:
                    tasks.append(self.task.async_validate_docs(
                        self.cluster, bucket, kv_gen, op_type, 0,
                        batch_size=10, check_replica=self.check_replica))
            for task in tasks:
                self.task.jython_task_manager.get_task_result(task)
        self.sleep(20)

        if not self.atomicity:
            self.bucket_util._wait_for_stats_all_buckets()
            # Expected count: everything created so far minus the docs
            # deleted in each of the `_iter_count` delete rounds
            self.bucket_util.verify_stats_all_buckets(self.end - self.initial_load_count*self.delete_perc/100*self._iter_count)

    def data_load(self):
        """
        Run the doc load for the current generators.

        :return: task-info dict of the started loads; empty for transaction
                 loads (those are waited for inside doc_load_using_txns)
        """
        tasks_info = dict()
        if self.atomicity:
            self.doc_load_using_txns()
        else:
            tasks_info = self.normal_doc_loader()
        self.sleep(10)
        return tasks_info

    def generate_docs(self):
        """
        (Re)build the update/create/delete generators for the next step.

        Generators of op types not listed in `doc_ops` are left as None.
        Also advances `mutate`, `_iter_count` and the `start`/`end` key
        range, so the order of the three sections below matters.
        """
        self.create_perc = self.input.param("create_perc", 100)
        self.update_perc = self.input.param("update_perc", 10)
        self.delete_perc = self.input.param("delete_perc", 10)

        self.gen_delete_users = None
        self.gen_create_users = None
        self.gen_update_users = None

        if "update" in self.doc_ops:
            self.mutate += 1
            self.gen_update_users = doc_generator(
                "Users", 0, self.initial_load_count*self.update_perc/100,
                doc_size=self.doc_size, mutate=self.mutate)

        if "delete" in self.doc_ops:
            # Deletes start at the current `start`, i.e. before "create"
            # below moves the window forward
            self.gen_delete_users = doc_generator(
                "Users", self.start,
                self.start + (self.initial_load_count*self.delete_perc)/100,
                doc_size=self.doc_size)
            self._iter_count += 1

        if "create" in self.doc_ops:
            self.start = self.end
            self.end += self.initial_load_count*self.create_perc/100
            self.gen_create_users = doc_generator(
                "Users", self.start, self.end, doc_size=self.doc_size)

    def data_validation_mode(self, tasks_info):
        """Validate the load; currently always delegates to data_validation."""
        self.data_validation(tasks_info)

    def get_bucket_dgm(self, bucket):
        """Log the latest `vb_active_resident_items_ratio` sample of bucket."""
        self.rest_client = BucketHelper(self.cluster.master)
        dgm = self.rest_client.fetch_bucket_stats(
            bucket.name)["op"]["samples"]["vb_active_resident_items_ratio"][-1]
        self.log.info("Active Resident Threshold of {0} is {1}".format(bucket.name, dgm))

    def print_crud_stats(self):
        """Display a table with the item ranges touched by the last step."""
        self.table = TableView(self.log.info)
        self.table.set_headers(["Initial Items",
                                "Current Items",
                                "Items Updated",
                                "Items Created",
                                "Items Deleted"])
        if self._iter_count != 0:
            deleted_per_iter = self.initial_load_count*self.delete_perc/100
            created_per_iter = self.initial_load_count*self.create_perc/100
            updated_per_iter = self.initial_load_count*self.update_perc/100
            self.table.add_row([
                str(self.start - deleted_per_iter*(self._iter_count-1)),
                str(self.end - deleted_per_iter*self._iter_count),
                # Lower bound of the updated range is always zero
                str(self.update_perc - self.update_perc) + "---" + str(updated_per_iter),
                str(self.start) + "---" + str(self.end),
                str(self.start - created_per_iter) + "---" + str(self.start + deleted_per_iter - created_per_iter)])
        self.table.display("Docs statistics")

    # ---------------------------------------------------------------------
    # Private building blocks shared by the steps of test_volume_taf
    # ---------------------------------------------------------------------
    def _vol_tune_storage_threads(self, disk_io_optimized):
        """
        Switch the memcached reader/writer thread settings; no-op for
        atomicity runs.

        :param disk_io_optimized: True selects "disk_io_optimized", False
                                  selects the `new_num_*_threads` params
        """
        if self.atomicity:
            return
        if disk_io_optimized:
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
        else:
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)

    def _vol_wait_for_rebalance(self, rebalance_task):
        """Wait for the rebalance task and assert the rebalance completed."""
        self.task.jython_task_manager.get_task_result(rebalance_task)
        reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")

    def _vol_report_stats(self, bucket):
        """Log bucket stats, CRUD stats and the resident ratio of bucket."""
        self.bucket_util.print_bucket_stats()
        self.print_crud_stats()
        self.get_bucket_dgm(bucket)

    def _vol_validate_and_report(self, tasks_info, bucket):
        """Validate the finished load, drop the task list, log the stats."""
        self.data_validation_mode(tasks_info)
        self.tasks = []
        self._vol_report_stats(bucket)

    def _vol_load_during_rebalance(self, nodes_in, nodes_out, bucket):
        """
        Start a rebalance, load the already generated docs while it runs,
        then wait, validate and report.  generate_docs() must have been
        called by the caller.
        """
        self._vol_tune_storage_threads(disk_io_optimized=True)
        rebalance_task = self.rebalance(nodes_in=nodes_in,
                                        nodes_out=nodes_out)
        tasks_info = self.data_load()
        self._vol_tune_storage_threads(disk_io_optimized=False)
        # self.sleep(600, "Wait for Rebalance to start")
        self._vol_wait_for_rebalance(rebalance_task)
        self._vol_validate_and_report(tasks_info, bucket)

    def _vol_load_during_noop_rebalance(self, bucket, stop_memcached=False):
        """
        Load docs while a rebalance with no node changes runs on all servers.

        :param stop_memcached: when True, stop/restart memcached after the
                               rebalance and before validating the data
        """
        self.generate_docs()
        self._vol_tune_storage_threads(disk_io_optimized=False)
        rebalance_task = self.task.async_rebalance(self.cluster.servers,
                                                   [], [])
        tasks_info = self.data_load()
        self._vol_tune_storage_threads(disk_io_optimized=True)
        # self.sleep(600, "Wait for Rebalance to start")
        self._vol_wait_for_rebalance(rebalance_task)
        if stop_memcached:
            self.stop_process()
        self._vol_validate_and_report(tasks_info, bucket)

    def _vol_set_replica_count(self, replicas):
        """Change `replicaNumber` of every bucket to `replicas`."""
        bucket_helper = BucketHelper(self.cluster.master)
        for target_bucket in self.bucket_util.buckets:
            bucket_helper.change_bucket_props(target_bucket,
                                              replicaNumber=replicas)

    def _vol_snapshot_before_failover(self):
        """
        Capture the state needed to validate a failover and pick the node
        to fail over (stored in `self.chosen`).

        :return: tuple (std, prev_failover_stats,
                        disk_replica_dataset, disk_active_dataset)
        """
        self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
        std = self.std_vbucket_dist or 1.0

        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        # The return value is not used by this test; the call itself is kept
        self.bucket_util.get_vbucket_seqnos(
            self.cluster.nodes_in_cluster, self.bucket_util.buckets)
        self.sleep(10)

        disk_replica_dataset, disk_active_dataset = \
            self.bucket_util.get_and_compare_active_replica_data_set_all(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets,
                path=None)

        self.rest = RestConnection(self.cluster.master)
        self.nodes = self.cluster_util.get_nodes(self.cluster.master)
        self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                   howmany=1)
        return (std, prev_failover_stats,
                disk_replica_dataset, disk_active_dataset)

    def _vol_fail_over_under_load(self):
        """
        Load new docs, then hard-failover the chosen node and wait 300s.

        :return: task-info dict of the load
        """
        self.generate_docs()
        tasks_info = self.data_load()
        # Mark Node for failover
        self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                       graceful=False)
        self.sleep(300)
        return tasks_info

    def _vol_recover_and_rebalance(self, recovery_type):
        """
        Mark the failed-over node for recovery (if the failover succeeded)
        and rebalance the cluster.

        :param recovery_type: "full" or "delta"
        """
        if self.success_failed_over:
            self.rest.set_recovery_type(otpNode=self.chosen[0].id,
                                        recoveryType=recovery_type)
        self._vol_tune_storage_threads(disk_io_optimized=False)
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init], [], [])
        self._vol_tune_storage_threads(disk_io_optimized=True)
        # self.sleep(600)
        self._vol_wait_for_rebalance(rebalance_task)

    def _vol_validate_after_failover(self, tasks_info, snapshot):
        """
        Validate docs, failover logs, active/replica data and the vbucket
        distribution against a snapshot from _vol_snapshot_before_failover.
        """
        (std, prev_failover_stats,
         disk_replica_dataset, disk_active_dataset) = snapshot

        self.sleep(10)
        self.data_validation_mode(tasks_info)
        self.bucket_util.compare_failovers_logs(
            prev_failover_stats, self.cluster.nodes_in_cluster,
            self.bucket_util.buckets)
        self.sleep(10)

        self.bucket_util.data_analysis_active_replica_all(
            disk_active_dataset, disk_replica_dataset,
            self.cluster.servers[:self.nodes_in + self.nodes_init],
            self.bucket_util.buckets, path=None)
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
        self.bucket_util.vb_distribution_analysis(
            servers=nodes, buckets=self.bucket_util.buckets,
            num_replicas=2,
            std=std, total_vbuckets=self.cluster_util.vbuckets)

    def test_volume_taf(self):
        """
        Run the full volume scenario (steps 1-15) `iterations` times.

        Steps 4-15 are repeated per iteration; between iterations the
        buckets are flushed and the cluster is shrunk back to `nodes_init`.
        """
        #######################################################################
        self.log.info("Step1: Create a n node cluster")
        nodes_init = self.cluster.servers[1:self.nodes_init] if self.nodes_init != 1 else []
        self.task.rebalance([self.cluster.master], nodes_init, [])
        self.cluster.nodes_in_cluster.extend([self.cluster.master] + nodes_init)
        self.query_node = self.cluster.master
        #######################################################################
        self.log.info("Step 2 & 3: Create required buckets.")
        bucket = self.create_required_buckets()
        self.loop = 0
        #######################################################################
        while self.loop < self.iterations:
            self.log.info("Step 4: Pre-Requisites for Loading of docs")
            self.start = 0
            self.bucket_util.add_rbac_user()
            self.end = self.initial_load_count = self.input.param(
                "initial_load", 1000)
            initial_load = doc_generator(
                "Users", self.start, self.start + self.initial_load_count,
                doc_size=self.doc_size)
            self.initial_data_load(initial_load)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.get_bucket_dgm(bucket)
            ###################################################################
            self.log.info("Step 5: Rebalance in with Loading of docs")
            self.generate_docs()
            # No deletes in the first step; restart the delete-round counter
            self.gen_delete_users = None
            self._iter_count = 0
            self._vol_load_during_rebalance(1, 0, bucket)
            ###################################################################
            self.log.info("Step 6: Rebalance Out with Loading of docs")
            self.generate_docs()
            self._vol_load_during_rebalance(0, 1, bucket)
            ###################################################################
            self.log.info("Step 7: Rebalance In_Out with Loading of docs")
            self.generate_docs()
            self._vol_load_during_rebalance(2, 1, bucket)
            ###################################################################
            self.log.info("Step 8: Swap with Loading of docs")
            self.generate_docs()
            self._vol_load_during_rebalance(1, 1, bucket)
            ###################################################################
            self.log.info("Step 9: Updating the bucket replica to 2")
            self._vol_set_replica_count(2)
            self.generate_docs()
            self._vol_load_during_rebalance(1, 0, bucket)
            ###################################################################
            if "ephemeral" in self.bucket_type:
                self.log.info("No Memcached kill for epehemral bucket")
            else:
                self.log.info("Step 10: Stopping and restarting memcached process")
                self._vol_load_during_noop_rebalance(bucket,
                                                     stop_memcached=True)
            ###################################################################
            self.log.info("Step 11: Failover a node and RebalanceOut that node with loading in parallel")
            snapshot = self._vol_snapshot_before_failover()
            tasks_info = self._vol_fail_over_under_load()

            self.nodes = self.rest.node_statuses()
            self._vol_tune_storage_threads(disk_io_optimized=False)
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[self.chosen[0].id])
            # self.sleep(600)
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                            msg="Rebalance failed")

            # Move the ejected node from the cluster list to the spare pool
            servs_out = [node for node in self.cluster.servers
                         if node.ip == self.chosen[0].ip]
            self.cluster.nodes_in_cluster = list(
                set(self.cluster.nodes_in_cluster) - set(servs_out))
            self.available_servers += servs_out

            self._vol_validate_after_failover(tasks_info, snapshot)
            self.sleep(10)
            self.tasks = []

            # Bring the cluster back to its previous size
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
            # self.sleep(600)
            self._vol_wait_for_rebalance(rebalance_task)
            self._vol_report_stats(bucket)
            ###################################################################
            self.log.info("Step 12: Failover a node and FullRecovery that node")
            snapshot = self._vol_snapshot_before_failover()
            tasks_info = self._vol_fail_over_under_load()
            # Mark Node for full recovery
            self._vol_recover_and_rebalance("full")
            self._vol_validate_after_failover(tasks_info, snapshot)
            self.sleep(10)
            self.tasks = []
            self._vol_report_stats(bucket)
            ###################################################################
            self.log.info("Step 13: Failover a node and DeltaRecovery that node with loading in parallel")
            snapshot = self._vol_snapshot_before_failover()
            tasks_info = self._vol_fail_over_under_load()
            self._vol_recover_and_rebalance("delta")
            self._vol_validate_after_failover(tasks_info, snapshot)
            self._vol_report_stats(bucket)
            ###################################################################
            self.log.info("Step 14: Updating the bucket replica to 1")
            self._vol_set_replica_count(1)
            self._vol_load_during_noop_rebalance(bucket)
            ###################################################################
            self.log.info("Step 15: Flush the bucket and start the entire process again")
            self.loop += 1
            if self.loop < self.iterations:
                # Flush the bucket
                self.bucket_util.flush_all_buckets(self.cluster.master)
                self.sleep(10)
                if len(self.cluster.nodes_in_cluster) > self.nodes_init:
                    # Shrink the cluster back to its initial size
                    self.nodes_cluster = self.cluster.nodes_in_cluster[:]
                    self.nodes_cluster.remove(self.cluster.master)
                    servs_out = random.sample(
                        self.nodes_cluster,
                        int(len(self.cluster.nodes_in_cluster) - self.nodes_init))
                    rebalance_task = self.task.async_rebalance(
                        self.cluster.servers[:self.nodes_init], [], servs_out)
                    # self.sleep(600)
                    self.task.jython_task_manager.get_task_result(rebalance_task)
                    self.available_servers += servs_out
                    self.cluster.nodes_in_cluster = list(
                        set(self.cluster.nodes_in_cluster) - set(servs_out))
                    reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
                    self.assertTrue(reached, "rebalance failed, stuck or did not complete")
                    self.get_bucket_dgm(bucket)
                self._iter_count = 0
            else:
                self.log.info("Volume Test Run Complete")
                self.get_bucket_dgm(bucket)