def test_chain_rebalance_out_cc(self):
    """Chain-rebalance out the CBAS CC node repeatedly until one analytics
    node remains, re-pointing the CBAS connection to a surviving node after
    each removal, then validate the final dataset item count.

    Assumes setup_for_test/ingestion_in_progress have loaded self.num_items
    docs twice (validation target is self.num_items * 2) -- TODO confirm
    against the base class.
    """
    self.setup_for_test(skip_data_loading=True)
    self.ingestion_in_progress()
    total_cbas_nodes = len(self.otpNodes)
    while total_cbas_nodes > 1:
        # Find the current CC (cluster controller) node and rebalance it out.
        cc_ip = self.cbas_util.retrieve_cc_ip(shell=self.shell)
        for otpnode in self.otpNodes:
            if otpnode.ip == cc_ip:
                self.cluster_util.remove_node([otpnode],
                                              wait_for_rebalance=True)
                # Re-point the CBAS connection at any remaining non-CC server.
                for server in self.cbas_servers:
                    if cc_ip != server.ip:
                        self.cbas_util.closeConn()
                        self.cbas_util = cbas_utils(self.master, server)
                        self.cbas_util.createConn("default")
                        self.cbas_node = server
                        break
                # items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name)
                # self.log.info("Items before service restart: %s"%items_in_cbas_bucket)
                # Poll up to 60s until the dataset count query succeeds and
                # returns a usable (non-zero, non-error) value.
                items_in_cbas_bucket = 0
                start_time = time.time()
                while (items_in_cbas_bucket == 0 or
                       items_in_cbas_bucket == -1) and \
                        time.time() < start_time + 60:
                    try:
                        items_in_cbas_bucket, _ = \
                            self.cbas_util.get_num_items_in_cbas_dataset(
                                self.cbas_dataset_name)
                    except:
                        # NOTE(review): bare except deliberately tolerates
                        # query failures while rebalance is in flight.
                        pass
                    self.sleep(1)
                self.log.info(
                    "After rebalance operation docs in CBAS bucket : %s"
                    % items_in_cbas_bucket)
                # Classify whether ingestion was interrupted, restarted,
                # or already finished before the rebalance.
                if items_in_cbas_bucket < self.num_items * 2 and \
                        items_in_cbas_bucket > self.num_items:
                    self.log.info("Data Ingestion Interrupted successfully")
                elif items_in_cbas_bucket < self.num_items:
                    self.log.info(
                        "Data Ingestion did interrupted and restarting from 0."
                    )
                else:
                    self.log.info(
                        "Data Ingestion did not interrupted but complete before rebalance operation."
                    )
                # Sanity-run concurrent count queries against the new CC.
                query = "select count(*) from {0};".format(
                    self.cbas_dataset_name)
                self.cbas_util._run_concurrent_queries(
                    query, "immediate", 10)
                break
        total_cbas_nodes -= 1
    # Final validation: all ingested docs must be present in the dataset.
    if not self.cbas_util.validate_cbas_dataset_items_count(
            self.cbas_dataset_name, self.num_items * 2):
        self.fail(
            "No. of items in CBAS dataset do not match that in the CB bucket"
        )
    self.ingest_more_data()
def execute(self):
    """Execute the configured CBAS statement and advance the task state.

    Opens a fresh cbas_utils connection, runs self.statement, and on a
    good response moves to CHECKING and re-schedules via self.call().
    A bad response or any unexpected exception finishes the task with a
    negative result.
    """
    try:
        from cbas.cbas_utils import cbas_utils
        cbas_conn = cbas_utils(self.master, self.cbas_server)
        cbas_conn.createConn(self.bucket)
        (self.response,
         self.metrics,
         self.errors,
         self.results,
         self.handle) = cbas_conn.execute_statement_on_cbas_util(
            self.statement)
        if not self.response:
            # No response from the statement: log it, dump the traceback,
            # and finish the task as failed.
            self.test_log.error("Some error in CBASQueryExecuteTask")
            traceback.print_exc(file=sys.stdout)
            self.state = FINISHED
            self.passed = False
            self.set_result(False)
        else:
            self.state = CHECKING
            self.call()
    # catch and set all unexpected exceptions
    except Exception as e:
        self.state = FINISHED
        self.passed = False
        self.set_unexpected_exception(e)
def test_rebalance_kv_rollback_create_ops(self):
    """Verify CBAS rolls back ingested mutations when KV persistence is
    stopped, docs are created un-persisted, a node is rebalanced out, and
    memcached is killed (forcing a DCP rollback).

    Ends by asserting CB and CBAS item counts converge after the rollback.
    """
    self.setup_for_test()
    items_before_persistence_stop = \
        self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)[0]
    self.log.info("Items in CBAS before persistence stop: %s" %
                  items_before_persistence_stop)
    # Stop Persistence on Node A & Node B
    self.log.info("Stopping persistence on NodeA")
    mem_client = MemcachedClientHelper.direct_client(
        self.master, self.cb_bucket_name)
    mem_client.stop_persistence()
    # Perform Create, Update, Delete ops in the CB bucket
    self.log.info("Performing Mutations")
    self.perform_doc_ops_in_all_cb_buckets(self.num_items / 2, "create",
                                           self.num_items,
                                           self.num_items * 3 / 2)
    kv_nodes = self.get_kv_nodes(self.servers, self.master)
    items_in_cb_bucket = 0
    # Count CB-side items, either via a filtered N1QL count or by summing
    # per-node item counts.
    if self.where_field and self.where_value:
        items_in_cb_bucket = RestConnection(self.master).query_tool(
            'select count(*) from %s where %s = "%s"' %
            (self.cb_bucket_name, self.where_field,
             self.where_value))['results'][0]['$1']
    else:
        for node in kv_nodes:
            items_in_cb_bucket += self.get_item_count(
                node, self.cb_bucket_name)
    # Validate no. of items in CBAS dataset
    self.assertTrue(
        self.cbas_util.validate_cbas_dataset_items_count(
            self.cbas_dataset_name, items_in_cb_bucket, 0),
        "No. of items in CBAS dataset do not match that in the CB bucket")
    # Count no. of items in CB & CBAS Buckets
    items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
        self.cbas_dataset_name)
    self.log.info(
        "Before Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
        items_in_cb_bucket, items_in_cbas_bucket)
    self.assertTrue(
        items_in_cb_bucket == items_in_cbas_bucket,
        "Before Rollback : # Items in CBAS bucket does not match that in the CB bucket"
    )
    # Rebalance out either the CC node (re-pointing the CBAS connection)
    # or an NC node, without waiting for rebalance to finish.
    if self.CC:
        self.cluster_util.remove_node([self.otpNodes[0]],
                                      wait_for_rebalance=False)
        self.cbas_util.closeConn()
        self.cbas_util = cbas_utils(self.master, self.cbas_servers[0])
        self.cbas_util.createConn("default")
    else:
        self.cluster_util.remove_node([self.otpNodes[1]],
                                      wait_for_rebalance=False)
    # Kill memcached on Node A so that Node B becomes master
    self.log.info("Kill Memcached process on NodeA")
    shell = RemoteMachineShellConnection(self.master)
    shell.kill_memcached()
    self.sleep(2, "Wait for 2 secs for DCP rollback sent to CBAS.")
    # Poll (up to 120s) until the CBAS count drops back to at most the
    # pre-persistence-stop count, i.e. the rollback has been applied.
    curr = time.time()
    while items_in_cbas_bucket == -1 or (
            items_in_cbas_bucket != 0
            and items_in_cbas_bucket > items_before_persistence_stop):
        try:
            if curr + 120 < time.time():
                break
            items_in_cbas_bucket, _ = \
                self.cbas_util.get_num_items_in_cbas_dataset(
                    self.cbas_dataset_name)
            self.log.info("Items in CBAS: %s" % items_in_cbas_bucket)
        except:
            self.log.info(
                "Probably rebalance is in progress and the reason for queries being failing."
            )
            pass
    self.assertTrue(items_in_cbas_bucket <= items_before_persistence_stop,
                    "Roll-back did not happen.")
    self.log.info("#######BINGO########\nROLLBACK HAPPENED")
    # Poll (up to 120s) until CB and CBAS item counts converge post-rollback.
    items_in_cb_bucket = 0
    curr = time.time()
    while items_in_cb_bucket != items_in_cbas_bucket \
            or items_in_cb_bucket == 0:
        items_in_cb_bucket = 0
        items_in_cbas_bucket = 0
        if self.where_field and self.where_value:
            try:
                items_in_cb_bucket = RestConnection(
                    self.master).query_tool(
                        'select count(*) from %s where %s = "%s"' %
                        (self.cb_bucket_name, self.where_field,
                         self.where_value))['results'][0]['$1']
            except:
                self.log.info(
                    "Indexer in rollback state. Query failed. Pass and move ahead."
                )
                pass
        else:
            for node in kv_nodes:
                items_in_cb_bucket += self.get_item_count(
                    node, self.cb_bucket_name)
        self.log.info("Items in CB bucket after rollback: %s" %
                      items_in_cb_bucket)
        try:
            items_in_cbas_bucket, _ = \
                self.cbas_util.get_num_items_in_cbas_dataset(
                    self.cbas_dataset_name)
        except:
            # NOTE(review): failures tolerated while services recover.
            pass
        if curr + 120 < time.time():
            break
    # Wait (up to 300s) for any in-flight rebalance to complete.
    str_time = time.time()
    while self.rest._rebalance_progress_status(
    ) == "running" and time.time() < str_time + 300:
        self.sleep(1)
        self.log.info("Waiting for rebalance to complete")
    self.log.info(
        "After Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
        items_in_cb_bucket, items_in_cbas_bucket)
    self.assertTrue(
        items_in_cb_bucket == items_in_cbas_bucket,
        "After Rollback : # Items in CBAS bucket does not match that in the CB bucket"
    )
def test_logging_configurations_are_restored_post_service_restarts(self):
    """Verify CBAS logger configuration (set/delete/update/add) survives a
    service kill, couchbase restart, or node reboot on both CC and NC
    analytics nodes.

    Waits for nodes to bootstrap by polling the logger GET endpoint until
    the returned config is at least as large as the expected dict.
    """
    self.log.info("Add a cbas node")
    result = self.add_node(self.cbas_servers[0], services=["cbas"],
                           rebalance=True)
    self.assertTrue(result, msg="Failed to add CBAS node")
    self.log.info("Delete all loggers")
    self.cbas_util.delete_all_loggers_on_cbas()
    self.log.info("Set the logging level using the json object")
    status, content, response = self.cbas_util.set_log_level_on_cbas(
        CbasLogging.DEFAULT_LOGGER_CONFIG_DICT)
    self.assertTrue(status, msg="Response status incorrect for SET request")
    # Mutate the expected config dict in lockstep with each REST change so
    # later verification loops compare against the intended state.
    self.log.info("Delete specific logger")
    logger_name = self.input.param("logger_name_to_delete",
                                   "com.couchbase.client.core.node")
    status, content, response = \
        self.cbas_util.delete_specific_cbas_log_level(logger_name)
    self.assertTrue(status, msg="Status mismatch for DELETE")
    del CbasLogging.DEFAULT_LOGGER_CONFIG_DICT[logger_name]
    self.log.info("Update specific logger")
    logger_name = self.input.param("logger_name_to_update",
                                   "org.apache.hyracks")
    logger_level_to_update = self.input.param("logger_level_to_update",
                                              "FATAL")
    status, response, content = \
        self.cbas_util.set_specific_log_level_on_cbas(
            logger_name, logger_level_to_update)
    self.assertTrue(status, msg="Status mismatch for SET")
    CbasLogging.DEFAULT_LOGGER_CONFIG_DICT[
        logger_name] = logger_level_to_update
    self.log.info("Add a new logger")
    logger_name = self.input.param("logger_name_to_add",
                                   "org.apache.hyracks123")
    logger_level_to_add = self.input.param("logger_level_to_add", "ALL")
    status, response, content = \
        self.cbas_util.set_specific_log_level_on_cbas(
            logger_name, logger_level_to_add)
    self.assertTrue(status, msg="Status mismatch for SET")
    CbasLogging.DEFAULT_LOGGER_CONFIG_DICT[
        logger_name] = logger_level_to_add
    self.log.info("Verify logging configuration that we set on cbas Node")
    for name, level in CbasLogging.DEFAULT_LOGGER_CONFIG_DICT.items():
        status, content, response = \
            self.cbas_util.get_specific_cbas_log_level(name)
        self.assertTrue(status,
                        msg="Response status incorrect for GET request")
        self.assertEquals(content, level,
                          msg="Logger configuration mismatch for logger " +
                          name)
    self.sleep(
        timeout=10,
        message=
        "Waiting for logger configuration to be copied across cbas nodes")
    self.log.info("Verify logging configuration on other cbas node")
    for name, level in CbasLogging.DEFAULT_LOGGER_CONFIG_DICT.items():
        status, content, response = cbas_utils(
            self.master,
            self.cbas_servers[0]).get_specific_cbas_log_level(name)
        self.assertTrue(status,
                        msg="Response status incorrect for GET request")
        self.assertEquals(content, level,
                          msg="Logger configuration mismatch for logger " +
                          name)
    self.log.info("Read input params")
    process_name = self.input.param('process_name', None)
    service_name = self.input.param('service_name', None)
    restart_couchbase = self.input.param('restart_couchbase', False)
    reboot = self.input.param('reboot', False)
    kill_services = self.input.param('kill_services', False)
    self.log.info("Establish a remote connection")
    shell_cc = RemoteMachineShellConnection(self.cbas_node)
    shell_nc = RemoteMachineShellConnection(self.cbas_servers[0])
    # Apply the requested disruption to both analytics nodes.
    if kill_services:
        self.log.info("Kill the %s service on CC cbas node" % service_name)
        shell_cc.kill_process(process_name, service_name)
        self.log.info("Kill the %s service on other cbas node" %
                      service_name)
        shell_nc.kill_process(process_name, service_name)
    if restart_couchbase:
        self.log.info("Restart couchbase CC node ")
        shell_cc.restart_couchbase()
        self.log.info("Restart couchbase NC node ")
        shell_nc.restart_couchbase()
    if reboot:
        self.log.info("Reboot couchbase CC node")
        NodeHelper.reboot_server(self.cbas_node, self)
        self.log.info("Reboot couchbase NC node")
        NodeHelper.reboot_server(self.cbas_servers[0], self)
    # Poll for up to 1 minute until the logger config is readable again.
    end_time = datetime.datetime.now() + datetime.timedelta(minutes=int(1))
    self.log.info(
        "Wait for nodes to be bootstrapped, neglect the unreachable server exceptions"
    )
    while datetime.datetime.now() < end_time:
        try:
            self.log.info("Get the logging configurations")
            status, content, response = \
                self.cbas_util.get_log_level_on_cbas()
            self.assertTrue(
                status, msg="Response status incorrect for GET request")
            self.log.info("Convert response to a dictionary")
            log_dict = CbasLogging.convert_logger_get_result_to_a_dict(
                content)
            if len(log_dict) >= len(
                    CbasLogging.DEFAULT_LOGGER_CONFIG_DICT):
                break
        except Exception as e:
            # Unreachable-server errors expected during bootstrap; retry.
            pass
    self.log.info("Verify logging configuration post service kill")
    for name, level in CbasLogging.DEFAULT_LOGGER_CONFIG_DICT.items():
        status, content, response = \
            self.cbas_util.get_specific_cbas_log_level(name)
        self.assertTrue(status,
                        msg="Response status incorrect for GET request")
        self.assertEquals(content, level,
                          msg="Logger configuration mismatch for logger " +
                          name)
    self.sleep(
        timeout=10,
        message=
        "Waiting for logger configuration to be copied across cbas nodes")
    self.log.info(
        "Verify logging configuration on other cbas node post service kill"
    )
    for name, level in CbasLogging.DEFAULT_LOGGER_CONFIG_DICT.items():
        status, content, response = cbas_utils(
            self.master,
            self.cbas_servers[0]).get_specific_cbas_log_level(name)
        self.assertTrue(status,
                        msg="Response status incorrect for GET request")
        self.assertEquals(content, level,
                          msg="Logger configuration mismatch for logger " +
                          name)
def test_logging_configurations_are_shared_across_cbas_node(self):
    """Verify logger configuration set on one CBAS node propagates to the
    other analytics node, and that an update made on the other node
    propagates back to the master CBAS node.
    """
    self.log.info("Add a cbas node")
    result = self.add_node(self.cbas_servers[0], services=["cbas"],
                           rebalance=True)
    self.assertTrue(result, msg="Failed to add CBAS node")
    self.log.info("Delete all loggers")
    self.cbas_util.delete_all_loggers_on_cbas()
    self.log.info(
        "Set the logging level using json object from default logger config dictionary on master cbas node"
    )
    status, content, response = self.cbas_util.set_log_level_on_cbas(
        CbasLogging.DEFAULT_LOGGER_CONFIG_DICT)
    self.assertTrue(status, msg="Response status incorrect for SET request")
    self.log.info("Verify logging configuration that we set on cbas Node")
    for name, level in CbasLogging.DEFAULT_LOGGER_CONFIG_DICT.items():
        status, content, response = \
            self.cbas_util.get_specific_cbas_log_level(name)
        self.assertTrue(status,
                        msg="Response status incorrect for GET request")
        self.assertEquals(content, level,
                          msg="Logger configuration mismatch for logger " +
                          name)
    # Allow time for the configuration to replicate to the other node.
    self.sleep(
        timeout=10,
        message=
        "Waiting for logger configuration to be copied across cbas nodes")
    self.log.info("Verify logging configuration on other cbas node")
    for name, level in CbasLogging.DEFAULT_LOGGER_CONFIG_DICT.items():
        status, content, response = cbas_utils(
            self.master,
            self.cbas_servers[0]).get_specific_cbas_log_level(name)
        self.assertTrue(status,
                        msg="Response status incorrect for GET request")
        self.assertEquals(content, level,
                          msg="Logger configuration mismatch for logger " +
                          name)
    # Reverse direction: update on the other node, verify on master.
    self.log.info("Update logging configuration on other cbas node")
    logger_level = self.input.param("logger_level", "FATAL")
    logger_name = self.input.param("logger_name", "org.apache.asterix")
    status, content, response = cbas_utils(
        self.master, self.cbas_servers[0]).set_specific_log_level_on_cbas(
            logger_name, logger_level)
    self.assertTrue(status, msg="Status mismatch for SET")
    self.sleep(
        timeout=10,
        message=
        "Waiting for logger configuration to be copied across cbas nodes")
    self.log.info("Assert log level on master cbas node")
    status, content, response = \
        self.cbas_util.get_specific_cbas_log_level(logger_name)
    self.assertTrue(status, msg="Status mismatch for GET")
    self.assertEquals(content, logger_level,
                      msg="Logger configuration mismatch for " +
                      logger_name)
# NOTE(review): this method name duplicates an earlier definition in this
# file; at class-definition time this later variant (which waits for
# recovery via the analytics ping() function instead of polling the logger
# GET endpoint) silently shadows the earlier one -- confirm which variant
# is intended to run.
def test_logging_configurations_are_restored_post_service_restarts(self):
    """Verify CBAS logger configuration (set/delete/update/add) survives a
    service kill, couchbase restart, or node reboot.

    This variant waits for cluster recovery by issuing the private
    analytics ping() function until it succeeds (up to 180s).
    """
    self.log.info("Add a cbas node")
    result = self.add_node(self.cbas_servers[0], services=["cbas"],
                           rebalance=True)
    self.assertTrue(result, msg="Failed to add CBAS node")
    self.log.info("Delete all loggers")
    self.cbas_util.delete_all_loggers_on_cbas()
    self.log.info("Set the logging level using the json object")
    status, content, response = self.cbas_util.set_log_level_on_cbas(
        CbasLogging.DEFAULT_LOGGER_CONFIG_DICT)
    self.assertTrue(status, msg="Response status incorrect for SET request")
    # Mutate the expected config dict in lockstep with each REST change.
    self.log.info("Delete specific logger")
    logger_name = self.input.param("logger_name_to_delete",
                                   "com.couchbase.client.core.node")
    status, content, response = \
        self.cbas_util.delete_specific_cbas_log_level(logger_name)
    self.assertTrue(status, msg="Status mismatch for DELETE")
    del CbasLogging.DEFAULT_LOGGER_CONFIG_DICT[logger_name]
    self.log.info("Update specific logger")
    logger_name = self.input.param("logger_name_to_update",
                                   "org.apache.hyracks")
    logger_level_to_update = self.input.param("logger_level_to_update",
                                              "FATAL")
    status, response, content = \
        self.cbas_util.set_specific_log_level_on_cbas(
            logger_name, logger_level_to_update)
    self.assertTrue(status, msg="Status mismatch for SET")
    CbasLogging.DEFAULT_LOGGER_CONFIG_DICT[
        logger_name] = logger_level_to_update
    self.log.info("Add a new logger")
    logger_name = self.input.param("logger_name_to_add",
                                   "org.apache.hyracks123")
    logger_level_to_add = self.input.param("logger_level_to_add", "ALL")
    status, response, content = \
        self.cbas_util.set_specific_log_level_on_cbas(
            logger_name, logger_level_to_add)
    self.assertTrue(status, msg="Status mismatch for SET")
    CbasLogging.DEFAULT_LOGGER_CONFIG_DICT[
        logger_name] = logger_level_to_add
    self.log.info("Verify logging configuration that we set on cbas Node")
    for name, level in CbasLogging.DEFAULT_LOGGER_CONFIG_DICT.items():
        status, content, response = \
            self.cbas_util.get_specific_cbas_log_level(name)
        self.assertTrue(status,
                        msg="Response status incorrect for GET request")
        self.assertEquals(content, level,
                          msg="Logger configuration mismatch for logger " +
                          name)
    self.sleep(
        timeout=10,
        message=
        "Waiting for logger configuration to be copied across cbas nodes")
    self.log.info("Verify logging configuration on other cbas node")
    for name, level in CbasLogging.DEFAULT_LOGGER_CONFIG_DICT.items():
        status, content, response = cbas_utils(
            self.master,
            self.cbas_servers[0]).get_specific_cbas_log_level(name)
        self.assertTrue(status,
                        msg="Response status incorrect for GET request")
        self.assertEquals(content, level,
                          msg="Logger configuration mismatch for logger " +
                          name)
    self.log.info("Read input params")
    process_name = self.input.param('process_name', None)
    service_name = self.input.param('service_name', None)
    restart_couchbase = self.input.param('restart_couchbase', False)
    reboot = self.input.param('reboot', False)
    kill_services = self.input.param('kill_services', False)
    self.log.info("Establish a remote connection")
    shell_cc = RemoteMachineShellConnection(self.cbas_node)
    shell_nc = RemoteMachineShellConnection(self.cbas_servers[0])
    # Apply the requested disruption to both analytics nodes.
    if kill_services:
        self.log.info("Kill the %s service on CC cbas node" % service_name)
        shell_cc.kill_process(process_name, service_name)
        self.log.info("Kill the %s service on other cbas node" %
                      service_name)
        shell_nc.kill_process(process_name, service_name)
    if restart_couchbase:
        self.log.info("Restart couchbase CC node ")
        shell_cc.restart_couchbase()
        self.log.info("Restart couchbase NC node ")
        shell_nc.restart_couchbase()
    if reboot:
        self.log.info("Reboot couchbase CC node")
        NodeHelper.reboot_server(self.cbas_node, self)
        self.log.info("Reboot couchbase NC node")
        NodeHelper.reboot_server(self.cbas_servers[0], self)
    self.log.info(
        "Wait for request to complete and cluster to be active: Using private ping() function"
    )
    cluster_recover_start_time = time.time()
    while time.time() < cluster_recover_start_time + 180:
        try:
            status, metrics, _, cbas_result, _ = \
                self.cbas_util.execute_statement_on_cbas_util(
                    "set `import-private-functions` `true`;ping();")
            if status == "success":
                break
        except:
            self.sleep(2, message="Wait for service to up again")
    self.log.info("Verify logging configuration post service kill")
    for name, level in CbasLogging.DEFAULT_LOGGER_CONFIG_DICT.items():
        status, content, response = \
            self.cbas_util.get_specific_cbas_log_level(name)
        self.assertTrue(status,
                        msg="Response status incorrect for GET request")
        self.assertEquals(content, level,
                          msg="Logger configuration mismatch for logger " +
                          name)
    self.sleep(
        timeout=10,
        message=
        "Waiting for logger configuration to be copied across cbas nodes")
    self.log.info(
        "Verify logging configuration on other cbas node post service kill"
    )
    for name, level in CbasLogging.DEFAULT_LOGGER_CONFIG_DICT.items():
        status, content, response = cbas_utils(
            self.master,
            self.cbas_servers[0]).get_specific_cbas_log_level(name)
        self.assertTrue(status,
                        msg="Response status incorrect for GET request")
        self.assertEquals(content, level,
                          msg="Logger configuration mismatch for logger " +
                          name)
def test_stop_network_ingest_data(self):
    """Drop the network (30s) on a CC/NC analytics node or the KV master
    during ingestion, then verify query-handle states and the final
    dataset item count (expected self.num_items * 3).
    """
    self.setup_for_test()
    self.cbas_node_type = self.input.param('cbas_node_type', None)
    # Long-running queries whose handles are checked after the outage.
    query = "select sleep(count(*),50000) from {0};".format(
        self.cbas_dataset_name)
    handles = self.cbas_util._run_concurrent_queries(query, "async", 10)
    self.ingestion_in_progress()
    # Add the code for stop network here:
    if self.cbas_node_type:
        if self.cbas_node_type == "CC":
            node_in_test = self.cbas_node
            # CC will lose network; re-point CBAS connection to an NC node.
            self.cbas_util = cbas_utils(self.master, self.cbas_servers[0])
            self.cbas_util.createConn("default")
        else:
            node_in_test = self.cbas_servers[0]
    # Stop network on KV node to mimic n/w partition on KV
    else:
        node_in_test = self.master
    items_in_cbas_bucket_before, _ = \
        self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)
    self.log.info("Intems before network down: %s" %
                  items_in_cbas_bucket_before)
    RemoteMachineShellConnection(node_in_test).stop_network("30")
    # self.sleep(40, "Wait for network to come up.")
    # Poll up to 60s until the dataset count is queryable again.
    items_in_cbas_bucket = 0
    start_time = time.time()
    while (items_in_cbas_bucket == 0 or
           items_in_cbas_bucket == -1) and \
            time.time() < start_time + 60:
        try:
            items_in_cbas_bucket, _ = \
                self.cbas_util.get_num_items_in_cbas_dataset(
                    self.cbas_dataset_name)
        except:
            # Queries may fail while the network is still down; retry.
            pass
    # items_in_cbas_bucket_after, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name)
    self.log.info("Items after network is up: %s" % items_in_cbas_bucket)
    # start_time = time.time()
    # while items_in_cbas_bucket_after <=0 and time.time()<start_time+60:
    #     items_in_cbas_bucket_after, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name)
    #     self.sleep(1)
    # items_in_cbas_bucket = items_in_cbas_bucket_after
    # Classify whether ingestion was interrupted, restarted, or finished.
    if items_in_cbas_bucket < self.num_items * 3 and \
            items_in_cbas_bucket > self.num_items:
        self.log.info("Data Ingestion Interrupted successfully")
    elif items_in_cbas_bucket < self.num_items:
        self.log.info(
            "Data Ingestion did not interrupted but restarting from 0.")
    else:
        self.log.info(
            "Data Ingestion did not interrupted but complete before service restart."
        )
    # Tally the state of each async query handle issued earlier.
    run_count = 0
    fail_count = 0
    success_count = 0
    aborted_count = 0
    shell = RemoteMachineShellConnection(node_in_test)
    for handle in handles:
        status, hand = self.cbas_util.retrieve_request_status_using_handle(
            node_in_test, handle, shell)
        if status == "running":
            run_count += 1
            self.log.info("query with handle %s is running." % handle)
        elif status == "failed":
            fail_count += 1
            self.log.info("query with handle %s is failed." % handle)
        elif status == "success":
            success_count += 1
            self.log.info("query with handle %s is successful." % handle)
        else:
            aborted_count += 1
            self.log.info("Queued job is deleted: %s" % status)
    self.log.info("After service restart %s queued jobs are Running." %
                  run_count)
    self.log.info("After service restart %s queued jobs are Failed." %
                  fail_count)
    self.log.info("After service restart %s queued jobs are Successful." %
                  success_count)
    self.log.info("After service restart %s queued jobs are Aborted." %
                  aborted_count)
    # Only an NC outage must leave all queries unharmed.
    if self.cbas_node_type == "NC":
        self.assertTrue(fail_count + aborted_count == 0,
                        "Some queries failed/aborted")
    query = "select count(*) from {0};".format(self.cbas_dataset_name)
    self.cbas_util._run_concurrent_queries(query, "immediate", 100)
    if not self.cbas_util.validate_cbas_dataset_items_count(
            self.cbas_dataset_name, self.num_items * 3):
        self.fail(
            "No. of items in CBAS dataset do not match that in the CB bucket"
        )
def test_disk_full_ingest_data(self):
    """Fill the disk on a CC/NC analytics node with dd-generated files
    during ingestion, verify ingestion stalls or completes, then free the
    disk and validate the dataset count against a N1QL count.
    """
    self.cbas_node_type = self.input.param('cbas_node_type', None)
    if self.cbas_node_type == "CC":
        node_in_test = self.cbas_node
        # CC's disk will fill up; re-point CBAS connection to an NC node.
        self.cbas_util = cbas_utils(self.master, self.cbas_servers[0])
    else:
        node_in_test = self.cbas_servers[0]
    remote_client = RemoteMachineShellConnection(node_in_test)
    # Clean up filler files from any previous run.
    output, error = remote_client.execute_command("rm -rf full_disk*",
                                                  use_channel=True)
    remote_client.log_command_output(output, error)
    self.setup_for_test()
    query = "select sleep(count(*),50000) from {0};".format(
        self.cbas_dataset_name)
    handles = self.cbas_util._run_concurrent_queries(query, "async", 10)

    def _get_disk_usage_in_MB(remote_client):
        # Parses the free-space column out of `get_disk_info(in_MB=True)`
        # output -- assumes disk_info[1] is the data partition row; TODO
        # confirm against RemoteMachineShellConnection.get_disk_info.
        disk_info = remote_client.get_disk_info(in_MB=True)
        disk_space = disk_info[1].split()[-3][:-1]
        return disk_space

    # Fill all but ~50 MB of the free space with dd-created zero files,
    # shrinking the chunk size for the final partial chunk.
    du = int(_get_disk_usage_in_MB(remote_client)) - 50
    chunk_size = 1024
    while int(du) > 0:
        output, error = remote_client.execute_command(
            "dd if=/dev/zero of=full_disk{0} bs={1}M count=1".format(
                str(du) + "_MB" + str(time.time()), chunk_size),
            use_channel=True)
        remote_client.log_command_output(output, error)
        du -= 1024
        if du < 1024:
            chunk_size = du
    self.ingestion_in_progress()
    # Wait until two consecutive dataset counts agree, i.e. ingestion has
    # stalled (disk full) or completed.
    items_in_cbas_bucket_before, _ = \
        self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)
    items_in_cbas_bucket_after, _ = \
        self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)
    try:
        while items_in_cbas_bucket_before != items_in_cbas_bucket_after:
            items_in_cbas_bucket_before, _ = \
                self.cbas_util.get_num_items_in_cbas_dataset(
                    self.cbas_dataset_name)
            self.sleep(2)
            items_in_cbas_bucket_after, _ = \
                self.cbas_util.get_num_items_in_cbas_dataset(
                    self.cbas_dataset_name)
    except:
        self.log.info("Ingestion interrupted and server seems to be down")
    if items_in_cbas_bucket_before == self.num_items * 3:
        self.log.info("Data Ingestion did not interrupted but completed.")
    elif items_in_cbas_bucket_before < self.num_items * 3:
        self.log.info("Data Ingestion Interrupted successfully")
    # Free the disk so the analytics service can recover.
    output, error = remote_client.execute_command("rm -rf full_disk*",
                                                  use_channel=True)
    remote_client.log_command_output(output, error)
    remote_client.disconnect()
    self.sleep(
        10,
        "wait for service to come up after disk space is made available.")
    # Tally the state of each async query handle issued earlier.
    run_count = 0
    fail_count = 0
    success_count = 0
    aborted_count = 0
    shell = RemoteMachineShellConnection(node_in_test)
    for handle in handles:
        status, hand = self.cbas_util.retrieve_request_status_using_handle(
            node_in_test, handle, shell)
        if status == "running":
            run_count += 1
            self.log.info("query with handle %s is running." % handle)
        elif status == "failed":
            fail_count += 1
            self.log.info("query with handle %s is failed." % handle)
        elif status == "success":
            success_count += 1
            self.log.info("query with handle %s is successful." % handle)
        else:
            aborted_count += 1
            self.log.info("Queued job is deleted: %s" % status)
    self.log.info("After service restart %s queued jobs are Running." %
                  run_count)
    self.log.info("After service restart %s queued jobs are Failed." %
                  fail_count)
    self.log.info("After service restart %s queued jobs are Successful." %
                  success_count)
    self.log.info("After service restart %s queued jobs are Aborted." %
                  aborted_count)
    if self.cbas_node_type == "NC":
        self.assertTrue(fail_count + aborted_count == 0,
                        "Some queries failed/aborted")
    self.sleep(60)
    query = "select count(*) from {0};".format(self.cbas_dataset_name)
    self.cbas_util._run_concurrent_queries(query, "immediate", 100)
    # Compare the dataset count to the authoritative N1QL bucket count.
    count_n1ql = self.rest.query_tool(
        'select count(*) from `%s`' %
        (self.cb_bucket_name))['results'][0]['$1']
    if not self.cbas_util.validate_cbas_dataset_items_count(
            self.cbas_dataset_name, count_n1ql):
        self.fail(
            "No. of items in CBAS dataset do not match that in the CB bucket"
        )
def test_stop_start_service_ingest_data(self):
    """Gracefully stop/start couchbase on a CC or NC analytics node during
    ingestion, then verify query-handle states and validate the dataset
    count against a N1QL count.
    """
    self.setup_for_test()
    self.cbas_node_type = self.input.param('cbas_node_type', None)
    # Long-running queries whose handles are checked after the restart.
    query = "select sleep(count(*),50000) from {0};".format(
        self.cbas_dataset_name)
    handles = self.cbas_util._run_concurrent_queries(query, "async", 10)
    self.ingestion_in_progress()
    if self.cbas_node_type == "CC":
        node_in_test = self.cbas_node
        # CC will be restarted; re-point the CBAS connection to an NC node.
        self.cbas_util.closeConn()
        self.cbas_util = cbas_utils(self.master, self.cbas_servers[0])
        self.cbas_util.createConn("default")
    else:
        node_in_test = self.cbas_servers[0]
    items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
        self.cbas_dataset_name)
    self.log.info("Items before service restart: %s" %
                  items_in_cbas_bucket)
    self.log.info("Gracefully stopping service on node %s" % node_in_test)
    NodeHelper.stop_couchbase(node_in_test)
    NodeHelper.start_couchbase(node_in_test)
    NodeHelper.wait_service_started(node_in_test)
    # self.sleep(10, "wait for service to come up.")
    #
    # items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name)
    # self.log.info("After graceful STOPPING/STARTING service docs in CBAS bucket : %s"%items_in_cbas_bucket)
    #
    # start_time = time.time()
    # while items_in_cbas_bucket <=0 and time.time()<start_time+60:
    #     items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name)
    #     self.sleep(1)
    # Poll up to 60s until the dataset count is queryable again.
    items_in_cbas_bucket = 0
    start_time = time.time()
    while (items_in_cbas_bucket == 0 or
           items_in_cbas_bucket == -1) and \
            time.time() < start_time + 60:
        try:
            items_in_cbas_bucket, _ = \
                self.cbas_util.get_num_items_in_cbas_dataset(
                    self.cbas_dataset_name)
        except:
            # Queries may fail while the service is restarting; retry.
            pass
    # Classify whether ingestion was interrupted, restarted, or finished.
    if items_in_cbas_bucket < self.num_items * 3 and \
            items_in_cbas_bucket > self.num_items:
        self.log.info("Data Ingestion Interrupted successfully")
    elif items_in_cbas_bucket < self.num_items:
        self.log.info(
            "Data Ingestion did not interrupted but restarting from 0.")
    else:
        self.log.info(
            "Data Ingestion did not interrupted but complete before service restart."
        )
    # Tally the state of each async query handle issued earlier.
    run_count = 0
    fail_count = 0
    success_count = 0
    aborted_count = 0
    shell = RemoteMachineShellConnection(node_in_test)
    for handle in handles:
        status, hand = self.cbas_util.retrieve_request_status_using_handle(
            node_in_test, handle, shell)
        if status == "running":
            run_count += 1
            self.log.info("query with handle %s is running." % handle)
        elif status == "failed":
            fail_count += 1
            self.log.info("query with handle %s is failed." % handle)
        elif status == "success":
            success_count += 1
            self.log.info("query with handle %s is successful." % handle)
        else:
            aborted_count += 1
            self.log.info("Queued job is deleted: %s" % status)
    self.log.info("After service restart %s queued jobs are Running." %
                  run_count)
    self.log.info("After service restart %s queued jobs are Failed." %
                  fail_count)
    self.log.info("After service restart %s queued jobs are Successful." %
                  success_count)
    self.log.info("After service restart %s queued jobs are Aborted." %
                  aborted_count)
    # Only an NC restart must leave all queries unharmed.
    if self.cbas_node_type == "NC":
        self.assertTrue(fail_count + aborted_count == 0,
                        "Some queries failed/aborted")
    query = "select count(*) from {0};".format(self.cbas_dataset_name)
    self.cbas_util._run_concurrent_queries(query, "immediate", 100)
    # Compare the dataset count to the authoritative N1QL bucket count.
    count_n1ql = self.rest.query_tool(
        'select count(*) from `%s`' %
        (self.cb_bucket_name))['results'][0]['$1']
    if not self.cbas_util.validate_cbas_dataset_items_count(
            self.cbas_dataset_name, count_n1ql):
        self.fail(
            "No. of items in CBAS dataset do not match that in the CB bucket"
        )
def test_failover(self):
    """Fail over CC/NC/mixed analytics nodes during ingestion (optionally
    with add-back recovery and/or a stop-and-restart of the rebalance),
    verify replica counts and IN_SYNC state, query-handle states, and the
    final dataset count (expected self.num_items * 2).
    """
    self.setup_for_test(skip_data_loading=True)
    self.rebalance_node = self.input.param('rebalance_node', 'CC')
    self.how_many = self.input.param('how_many', 1)
    self.restart_rebalance = self.input.param('restart_rebalance', False)
    self.replica_change = self.input.param('replica_change', 0)
    self.add_back = self.input.param('add_back', False)
    # Long-running queries whose handles are checked after the failover.
    query = "select sleep(count(*),50000) from {0};".format(
        self.cbas_dataset_name)
    handles = self.cbas_util._run_concurrent_queries(query, "async", 10)
    self.ingestion_in_progress()
    # Pick the nodes to fail over; if the CC is among them, re-point the
    # CBAS connection to a surviving analytics node first.
    if self.rebalance_node == "CC":
        node_in_test = [self.cbas_node]
        otpNodes = [self.otpNodes[0]]
        self.cbas_util.closeConn()
        self.cbas_util = cbas_utils(self.master, self.cbas_servers[0])
        self.cbas_util.createConn("default")
        self.cbas_node = self.cbas_servers[0]
    elif self.rebalance_node == "NC":
        node_in_test = self.cbas_servers[:self.how_many]
        otpNodes = self.nc_otpNodes[:self.how_many]
    else:
        node_in_test = [self.cbas_node] + self.cbas_servers[:self.how_many]
        otpNodes = self.otpNodes[:self.how_many + 1]
        self.cbas_util.closeConn()
        self.cbas_util = cbas_utils(self.master,
                                    self.cbas_servers[self.how_many])
        self.cbas_util.createConn("default")
    replicas_before_rebalance = len(
        self.cbas_util.get_replicas_info(self.shell))
    # Poll up to 60s until the dataset count is queryable.
    items_in_cbas_bucket = 0
    start_time = time.time()
    while (items_in_cbas_bucket == 0 or
           items_in_cbas_bucket == -1) and \
            time.time() < start_time + 60:
        try:
            items_in_cbas_bucket, _ = \
                self.cbas_util.get_num_items_in_cbas_dataset(
                    self.cbas_dataset_name)
        except:
            pass
        self.sleep(1)
    self.log.info("Items before failover node: %s" % items_in_cbas_bucket)
    if self.restart_rebalance:
        # Fail over, start a rebalance, stop it mid-flight, then restart it.
        graceful_failover = self.input.param("graceful_failover", False)
        failover_task = self._cb_cluster.async_failover(
            self.input.servers, node_in_test, graceful_failover)
        failover_task.get_result()
        if self.add_back:
            for otpnode in otpNodes:
                self.rest.set_recovery_type('ns_1@' + otpnode.ip, "full")
                self.rest.add_back_node('ns_1@' + otpnode.ip)
            self.rebalance(wait_for_completion=False)
        else:
            self.rebalance(ejected_nodes=[node.id for node in otpNodes],
                           wait_for_completion=False)
        self.sleep(2)
        if self.rest._rebalance_progress_status() == "running":
            self.assertTrue(self.rest.stop_rebalance(wait_timeout=120),
                            "Failed while stopping rebalance.")
            if self.add_back:
                self.rebalance(wait_for_completion=False)
            else:
                self.rebalance(
                    ejected_nodes=[node.id for node in otpNodes],
                    wait_for_completion=False)
        else:
            self.fail(
                "Rebalance completed before the test could have stopped rebalance."
            )
    else:
        # Straight failover, optionally adding the nodes back with full
        # recovery before rebalancing.
        graceful_failover = self.input.param("graceful_failover", False)
        failover_task = self._cb_cluster.async_failover(
            self.input.servers, node_in_test, graceful_failover)
        failover_task.get_result()
        if self.add_back:
            for otpnode in otpNodes:
                self.rest.set_recovery_type('ns_1@' + otpnode.ip, "full")
                self.rest.add_back_node('ns_1@' + otpnode.ip)
            self.rebalance(wait_for_completion=False)
    # Expected replica count shrinks by the configured delta.
    replicas_before_rebalance -= self.replica_change
    self.sleep(5)
    # Log replica states while the rebalance runs (up to 300s).
    str_time = time.time()
    while self.rest._rebalance_progress_status(
    ) == "running" and time.time() < str_time + 300:
        replicas = self.cbas_util.get_replicas_info(self.shell)
        if replicas:
            for replica in replicas:
                self.log.info("replica state during rebalance: %s" %
                              replica['status'])
        self.sleep(15)
    # Post-rebalance: replica count must match and all must be IN_SYNC.
    replicas = self.cbas_util.get_replicas_info(self.shell)
    replicas_after_rebalance = len(replicas)
    self.assertEqual(
        replicas_after_rebalance, replicas_before_rebalance,
        "%s,%s" % (replicas_after_rebalance, replicas_before_rebalance))
    for replica in replicas:
        self.log.info("replica state during rebalance: %s" %
                      replica['status'])
        self.assertEqual(
            replica['status'], "IN_SYNC",
            "Replica state is incorrect: %s" % replica['status'])
    # Poll up to 60s until the dataset count is queryable again.
    items_in_cbas_bucket = 0
    start_time = time.time()
    while (items_in_cbas_bucket == 0 or
           items_in_cbas_bucket == -1) and \
            time.time() < start_time + 60:
        try:
            items_in_cbas_bucket, _ = \
                self.cbas_util.get_num_items_in_cbas_dataset(
                    self.cbas_dataset_name)
        except:
            pass
        self.sleep(1)
    self.log.info("After rebalance operation docs in CBAS bucket : %s" %
                  items_in_cbas_bucket)
    # Classify whether ingestion was interrupted, restarted, or finished.
    if items_in_cbas_bucket < self.num_items * 2 and \
            items_in_cbas_bucket > self.num_items:
        self.log.info("Data Ingestion Interrupted successfully")
    elif items_in_cbas_bucket < self.num_items:
        self.log.info(
            "Data Ingestion did interrupted and restarting from 0.")
    else:
        self.log.info(
            "Data Ingestion did not interrupted but complete before rebalance operation."
        )
    # Tally the state of each async query handle issued earlier.
    run_count = 0
    fail_count = 0
    success_count = 0
    aborted_count = 0
    shell = RemoteMachineShellConnection(node_in_test[0])
    for handle in handles:
        status, hand = self.cbas_util.retrieve_request_status_using_handle(
            node_in_test, handle, shell)
        if status == "running":
            run_count += 1
            self.log.info("query with handle %s is running." % handle)
        elif status == "failed":
            fail_count += 1
            self.log.info("query with handle %s is failed." % handle)
        elif status == "success":
            success_count += 1
            self.log.info("query with handle %s is successful." % handle)
        else:
            aborted_count += 1
            self.log.info("Queued job is deleted: %s" % status)
    self.log.info("After service restart %s queued jobs are Running." %
                  run_count)
    self.log.info("After service restart %s queued jobs are Failed." %
                  fail_count)
    self.log.info("After service restart %s queued jobs are Successful." %
                  success_count)
    self.log.info("After service restart %s queued jobs are Aborted." %
                  aborted_count)
    # Only an NC failover must leave queries un-aborted.
    if self.rebalance_node == "NC":
        self.assertTrue(aborted_count == 0, "Some queries aborted")
    query = "select count(*) from {0};".format(self.cbas_dataset_name)
    self.cbas_util._run_concurrent_queries(query, "immediate", 100)
    if not self.cbas_util.validate_cbas_dataset_items_count(
            self.cbas_dataset_name, self.num_items * 2):
        self.fail(
            "No. of items in CBAS dataset do not match that in the CB bucket"
        )
    self.ingest_more_data()
def test_cc_swap_rebalance(self):
    """Swap-rebalance a CBAS node while ingestion and async queries run.

    Adds a fresh CBAS node and rebalances one existing node out — the CC
    node (otpNodes[0]) by default, or an NC node (otpNodes[1]) when the
    'swap_nc' test param is set. With 'restart_rebalance' the in-flight
    rebalance is stopped and re-issued mid-way. Afterwards verifies that
    the replica count and IN_SYNC state survived the swap, reports how far
    ingestion progressed, classifies the pre-swap async query handles, and
    finally validates the full dataset item count before ingesting more.
    """
    self.restart_rebalance = self.input.param('restart_rebalance', False)
    self.setup_for_test(skip_data_loading=True)

    # Fire long-running async queries (sleep keeps them alive across the
    # rebalance) and kick off ingestion so the swap happens under load.
    query = "select sleep(count(*),50000) from {0};".format(
        self.cbas_dataset_name)
    handles = self.cbas_util._run_concurrent_queries(query, "async", 10)
    self.ingestion_in_progress()

    replicas_before_rebalance = len(
        self.cbas_util.get_replicas_info(self.shell))

    # Swap: add a new CBAS node, then eject CC or NC. When the CC goes
    # out, re-point the CBAS connection at a surviving server first.
    self.cluster_util.add_node(node=self.cbas_servers[-1], rebalance=False)
    swap_nc = self.input.param('swap_nc', False)
    if not swap_nc:
        out_nodes = [self.otpNodes[0]]
        self.cbas_util.closeConn()
        self.cbas_util = cbas_utils(self.master, self.cbas_servers[0])
        self.cbas_util.createConn("default")
        self.cbas_node = self.cbas_servers[0]
    else:
        out_nodes = [self.otpNodes[1]]

    self.cluster_util.remove_node(out_nodes, wait_for_rebalance=False)
    self.sleep(5, "Wait for sometime after rebalance started.")

    if self.restart_rebalance:
        # Stop the in-flight rebalance and re-issue it to exercise the
        # restart path; fail fast if it already completed.
        if self.rest._rebalance_progress_status() == "running":
            self.assertTrue(self.rest.stop_rebalance(wait_timeout=120),
                            "Failed while stopping rebalance.")
            self.sleep(10)
        else:
            self.fail(
                "Rebalance completed before the test could have stopped rebalance."
            )
        self.rebalance(ejected_nodes=[node.id for node in out_nodes],
                       wait_for_completion=False)
    self.sleep(5)

    # Poll replica states (up to 5 minutes) while the rebalance runs.
    str_time = time.time()
    while self.rest._rebalance_progress_status() == "running" \
            and time.time() < str_time + 300:
        replicas = self.cbas_util.get_replicas_info(self.shell)
        if replicas:
            for replica in replicas:
                self.log.info("replica state during rebalance: %s"
                              % replica['status'])
        self.sleep(20)

    # The swap must neither drop replicas nor leave any out of sync.
    replicas = self.cbas_util.get_replicas_info(self.shell)
    replicas_after_rebalance = len(replicas)
    self.assertEqual(
        replicas_after_rebalance, replicas_before_rebalance,
        "%s,%s" % (replicas_after_rebalance, replicas_before_rebalance))
    for replica in replicas:
        self.log.info("replica state during rebalance: %s"
                      % replica['status'])
        self.assertEqual(
            replica['status'], "IN_SYNC",
            "Replica state is incorrect: %s" % replica['status'])

    # Wait (up to 60s) for the dataset to report a real item count again;
    # queries may fail transiently right after the swap, so keep retrying.
    items_in_cbas_bucket = 0
    start_time = time.time()
    while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) \
            and time.time() < start_time + 60:
        try:
            items_in_cbas_bucket, _ = \
                self.cbas_util.get_num_items_in_cbas_dataset(
                    self.cbas_dataset_name)
        except Exception:
            # Deliberate best-effort: transient failures are expected while
            # the service settles; narrowed from a bare except so that
            # SystemExit/KeyboardInterrupt still propagate.
            pass
        self.sleep(1)
    self.log.info("After rebalance operation docs in CBAS bucket : %s"
                  % items_in_cbas_bucket)
    if self.num_items < items_in_cbas_bucket < self.num_items * 2:
        self.log.info("Data Ingestion Interrupted successfully")
    elif items_in_cbas_bucket < self.num_items:
        self.log.info(
            "Data Ingestion did interrupted and restarting from 0.")
    else:
        self.log.info(
            "Data Ingestion did not interrupted but complete before rebalance operation."
        )

    # Classify the async query handles fired before the swap.
    run_count = 0
    fail_count = 0
    success_count = 0
    aborted_count = 0
    shell = RemoteMachineShellConnection(self.master)
    try:
        for handle in handles:
            status, _ = self.cbas_util.retrieve_request_status_using_handle(
                self.master, handle, shell)
            if status == "running":
                run_count += 1
                self.log.info("query with handle %s is running." % handle)
            elif status == "failed":
                fail_count += 1
                self.log.info("query with handle %s is failed." % handle)
            elif status == "success":
                success_count += 1
                self.log.info("query with handle %s is successful." % handle)
            else:
                aborted_count += 1
                self.log.info("Queued job is deleted: %s" % status)
    finally:
        # Always release the SSH connection (was leaked previously).
        shell.disconnect()
    self.log.info("After service restart %s queued jobs are Running."
                  % run_count)
    self.log.info("After service restart %s queued jobs are Failed."
                  % fail_count)
    self.log.info("After service restart %s queued jobs are Successful."
                  % success_count)
    self.log.info("After service restart %s queued jobs are Aborted."
                  % aborted_count)

    # Sanity: the cluster still serves queries after the swap.
    query = "select count(*) from {0};".format(self.cbas_dataset_name)
    self.cbas_util._run_concurrent_queries(query, "immediate", 100)

    if not self.cbas_util.validate_cbas_dataset_items_count(
            self.cbas_dataset_name, self.num_items * 2):
        self.fail(
            "No. of items in CBAS dataset do not match that in the CB bucket"
        )
    self.ingest_more_data()