def rebalance_swap(servers, how_many, monitor=True):
    """Prepare a swap rebalance: pick `how_many` nodes to eject from the
    cluster and `how_many` free servers to add, then add the new nodes.

    Returns (False, []) on any validation or add-node failure.
    NOTE(review): this chunk appears truncated — no eject/rebalance step or
    success return is visible here; confirm against the full file.
    """
    if how_many < 1:
        log.error("failed to swap rebalance %s servers - invalid count" % how_many)
        return False, []
    rest = RestConnection(servers[0])
    cur_nodes = rest.node_statuses()
    # Py2 map/filter return lists, so slicing below is valid.
    cur_ips = map(lambda node: node.ip, cur_nodes)
    cur_ids = map(lambda node: node.id, cur_nodes)
    # Servers not currently in the cluster are candidates for addition.
    free_servers = filter(lambda server: server.ip not in cur_ips, servers)
    if len(cur_ids) <= how_many or len(free_servers) < how_many:
        log.error("failed to swap rebalance %s servers - not enough servers" % how_many)
        return False, []
    ejections = cur_ids[-how_many:]
    additions = free_servers[:how_many]
    log.info("swap rebalance: cur: %s, eject: %s, add: %s" % (cur_ids, ejections, additions))
    try:
        # Join each addition candidate to the cluster via the first node's REST.
        map(
            lambda server: rest.add_node(
                servers[0].rest_username, servers[0].rest_password, server.ip, server.port
            ),
            additions,
        )
    except (ServerAlreadyJoinedException, ServerSelfJoinException, AddNodeException), e:
        log.error("failed to swap rebalance - addition failed %s: %s" % (additions, e))
        return False, []
def test_oom_increase_mem_quota(self):
    """
    1. Get OOM
    2. Drop Already existing indexes
    3. Verify if state of indexes is changed to ready

    Drives the indexer into OOM, then raises the indexer memory quota in
    200 MB steps (up to 5 times) until the indexer leaves the OOM state,
    and finally verifies index counts and queries still work.
    :return:
    """
    self.assertTrue(self._push_indexer_off_the_cliff(), "OOM Can't be achieved")
    indexer_memQuota = self.get_indexer_mem_quota()
    log.info("Current Indexer Memory Quota is {}".format(indexer_memQuota))
    for cnt in range(5):
        # Bump quota by 200 MB and give the indexer time to recover.
        indexer_memQuota += 200
        log.info("Increasing Indexer Memory Quota to {0}".format(indexer_memQuota))
        rest = RestConnection(self.oomServer)
        rest.set_indexer_memoryQuota(indexMemoryQuota=indexer_memQuota)
        self.sleep(120)
        indexer_oom = self._validate_indexer_status_oom()
        if not indexer_oom:
            log.info("Indexer out of OOM...")
            break
    self.assertFalse(self._validate_indexer_status_oom(), "Indexer still in OOM")
    self.sleep(120)
    self._verify_bucket_count_with_index_count(self.load_query_definitions)
    self.multi_query_using_index(buckets=self.buckets,
                                 query_definitions=self.load_query_definitions)
def setUp(self):
    """Set up the OOM test fixture: configure the indexer memory quota,
    build the templated query, and create the initial secondary indexes
    on the index node before shrinking the quota to 300 MB."""
    super(SecondaryIndexMemdbOomTests, self).setUp()
    self.indexMemQuota = self.input.param("indexMemQuota", 256)
    query_template = QUERY_TEMPLATE
    self.query_template = query_template.format("job_title")
    self.doc_ops = self.input.param("doc_ops", True)
    self.initial_index_number = self.input.param("initial_index_number", 2)
    self.oomServer = self.get_nodes_from_services_map(service_type="index")
    self.whereCondition = self.input.param("whereCondition", " job_title != \"Sales\" ")
    self.query_template += " WHERE {0}".format(self.whereCondition)
    rest = RestConnection(self.oomServer)
    # Only raise the quota when the caller asked for more than the default.
    if self.indexMemQuota > 256:
        log.info("Setting indexer memory quota to {0} MB...".format(self.indexMemQuota))
        rest.set_indexer_memoryQuota(indexMemoryQuota=self.indexMemQuota)
        self.sleep(30)
    self.deploy_node_info = ["{0}:{1}".format(self.oomServer.ip, self.oomServer.port)]
    self.load_query_definitions = []
    for x in range(self.initial_index_number):
        index_name = "index_name_" + str(x)
        query_definition = QueryDefinition(index_name=index_name, index_fields=["job_title"],
                                           query_template=self.query_template, groups=["simple"])
        self.load_query_definitions.append(query_definition)
    self._initialize_multi_create_index(buckets=self.buckets,
                                        query_definitions=self.load_query_definitions,
                                        deploy_node_info=self.deploy_node_info)
    # Shrink the quota so the tests can push the indexer towards OOM.
    log.info("Setting indexer memory quota to 300 MB...")
    rest.set_indexer_memoryQuota(indexMemoryQuota=300)
    self.sleep(30)
def pick_node(master):
    """Pick a cluster node other than the master.

    When nodes are on distinct IPs, the first node whose IP differs from
    the master's is returned.  When all nodes share one IP (e.g. cluster_run
    on localhost), ports are compared instead.  May return the last node
    examined if no non-master node is found.
    """
    rest = RestConnection(master)
    nodes = rest.node_statuses()
    node_picked = None
    nodes_on_same_ip = True
    firstIp = nodes[0].ip
    # Detect whether every node shares the first node's IP.
    for node in nodes:
        if node.ip != firstIp:
            nodes_on_same_ip = False
            break
    for node in nodes:
        node_picked = node
        if not nodes_on_same_ip:
            if node_picked.ip != master.ip:
                log.info("Picked node ... {0}:{1}".format(node_picked.ip, node_picked.port))
                break
        else:
            # temp fix - port numbers of master(machine ip and localhost: 9000 match
            if int(node_picked.port) == int(master.port):
                log.info(
                    "Not picking the master node {0}:{1}.. try again...".format(node_picked.ip, node_picked.port)
                )
            else:
                log.info("Picked node {0}:{1}".format(node_picked.ip, node_picked.port))
                break
    return node_picked
def test_change_mem_quota_when_index_building(self):
    """Create indexes on every bucket, then lower the indexer memory quota
    (700 MB -> 500 MB) while builds are in flight and verify the build
    tasks still complete."""
    rest = RestConnection(self.oomServer)
    log.info("Setting indexer memory quota to 700 MB...")
    rest.set_indexer_memoryQuota(indexMemoryQuota=700)
    self.sleep(30)
    query_definitions = []
    for x in range(3):
        index_name = "index_" + str(x)
        query_definition = QueryDefinition(index_name=index_name, index_fields=["job_title"],
                                           query_template=self.query_template, groups=["simple"])
        query_definitions.append(query_definition)
    create_tasks = []
    build_tasks = []
    # bucket -> list of index names created on it (used for deferred builds).
    index_info = {}
    for bucket in self.buckets:
        if not bucket in index_info.keys():
            index_info[bucket] = []
        for query_definition in query_definitions:
            index_info[bucket].append(query_definition.index_name)
            task = self.async_create_index(bucket.name, query_definition)
            create_tasks.append(task)
    for task in create_tasks:
        task.result()
    if self.defer_build:
        log.info("Building Indexes...")
        for key, val in index_info.iteritems():
            task = self.async_build_index(bucket=key, index_list=val)
            build_tasks.append(task)
    self.sleep(10)
    # Drop the quota mid-build; builds should survive the change.
    log.info("Setting indexer memory quota to 500 MB...")
    rest.set_indexer_memoryQuota(indexMemoryQuota=500)
    self.sleep(30)
    for task in build_tasks:
        task.result()
def test_zone_enable_after_upgrade_from_ce_to_ee(self): params = {} params['product'] = self.product params['version'] = self.version params['vbuckets'] = [self.vbuckets] params['type'] = self.type """ install couchbasse server community edition to run the test """ InstallerJob().parallel_install(self.servers[:3], params) params["type"] = "enterprise" zone_name = "AAABBBCCCaakkkkmmm345672" serverInfo = self.servers[0] ini_servers = self.servers[:self.nodes_init] rest = RestConnection(serverInfo) self.user = serverInfo.rest_username self.password = serverInfo.rest_password if len(ini_servers) > 1: self.cluster.rebalance([ini_servers[0]], ini_servers[1:], []) rest = RestConnection(self.master) self._bucket_creation() """ verify all nodes in cluster in CE """ if rest.is_enterprise_edition(): raise Exception("This test needs couchbase server community edition to run") self._load_all_buckets(self.servers[0], self.gen_load, "create", 0) try: self.log.info("create zone {0}".format(zone_name)) result = rest.add_zone(zone_name) if result: raise Exception("Zone feature should not be available in CE version") except Exception,e : if "Failed" in e: pass
def _verify_zone(self, name):
    """Assert that a zone named `name` (whitespace-stripped) exists in the
    cluster; raise an Exception if it does not."""
    zone_name = name.strip()
    master_rest = RestConnection(self.servers[0])
    # Guard clause: bail out immediately when the zone is missing.
    if not master_rest.is_zone_exist(zone_name):
        raise Exception("There is not zone with name: %s in cluster" % name)
    self.log.info("verified! zone '{0}' is existed".format(zone_name))
def rebalance_in_out_at_once_with_max_buckets_number(self):
    """Create the maximum allowed number of buckets (per maxBucketCount),
    load them, then rebalance nodes in and out simultaneously while a
    mutation load runs, and verify cluster stats afterwards."""
    servs_init = self.servers[:self.nodes_init]
    servs_in = [self.servers[i + self.nodes_init] for i in range(self.nodes_in)]
    # Outgoing nodes are taken from the tail of the initial set.
    servs_out = [self.servers[self.nodes_init - i - 1] for i in range(self.nodes_out)]
    rest = RestConnection(self.master)
    self._wait_for_stats_all_buckets(servs_init)
    self.log.info("current nodes : {0}".format([node.id for node in rest.node_statuses()]))
    self.log.info("adding nodes {0} to cluster".format(servs_in))
    self.log.info("removing nodes {0} from cluster".format(servs_out))
    result_nodes = set(servs_init + servs_in) - set(servs_out)
    rest = RestConnection(self.master)
    bucket_num = rest.get_internalSettings("maxBucketCount")
    # Py2 integer division: quota is split evenly across all buckets.
    self.bucket_size = self.quota / bucket_num
    self.log.info('total %s buckets will be created with size %s MB' % (bucket_num, self.bucket_size))
    self.cluster.create_default_bucket(self.master, self.bucket_size, self.num_replicas)
    self.buckets.append(Bucket(name="default", authType="sasl", saslPassword="",
                               num_replicas=self.num_replicas, bucket_size=self.bucket_size))
    # Remaining buckets are split roughly half sasl / half standard.
    self._create_sasl_buckets(self.master, (bucket_num - 1) / 2)
    self._create_standard_buckets(self.master, bucket_num - 1 - (bucket_num - 1) / 2)
    gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, gen, "create", 0)
    self._wait_for_stats_all_buckets(servs_init)
    # Rebalance in/out at once with a concurrent update workload.
    rebalance = self.cluster.async_rebalance(servs_init, servs_in, servs_out)
    self._async_load_all_buckets(self.master, gen, "update", 0)
    rebalance.result()
    self.verify_cluster_stats(result_nodes)
def test_delete_empty_defautl_zone(self):
    """Create a new zone, move the first node out of the default zone into
    it, delete the now-empty default zone, then rename the new zone back
    to the default zone's name.

    NOTE(review): method name typo ("defautl") kept — renaming would break
    existing test-conf discovery.
    """
    zone_name = "test1"
    default_zone = "Group 1"
    moved_node = []
    serverInfo = self.servers[0]
    moved_node.append(serverInfo.ip)
    rest = RestConnection(serverInfo)
    try:
        self.log.info("create zone {0}".format(zone_name))
        rest.add_zone(zone_name)
        if rest.is_zone_exist(zone_name):
            self.log.info("Move node {0} from zone {1} to zone {2}" \
                          .format(moved_node, default_zone, zone_name))
            status = rest.shuffle_nodes_in_zones(moved_node, default_zone, zone_name)
            if status:
                # Default zone is empty now, so deletion should succeed.
                rest.delete_zone(default_zone)
            else:
                self.fail("Failed to move node {0} from zone {1} to zone {2}" \
                          .format(moved_node, default_zone, zone_name))
            if not rest.is_zone_exist(default_zone):
                self.log.info("successful delete default zone")
            else:
                raise Exception("Failed to delete default zone")
        # Restore the default zone name for subsequent tests.
        rest.rename_zone(zone_name, default_zone)
    except Exception,e :
        print e
def test_fileRotate20MB(self):
    """Generate audit events until audit.log reaches ~20 MB (or a 10-hour
    cap expires) and assert that the archive file is rolled over."""
    auditIns = audit(host=self.master)
    firstEventTime = self.getTimeStampForFile(auditIns)
    tempEventCounter = 0
    rest = RestConnection(self.master)
    shell = RemoteMachineShellConnection(self.master)
    filePath = auditIns.pathLogFile + auditIns.AUDITLOGFILENAME
    number = int (shell.get_data_file_size(filePath))
    hostname = shell.execute_command("hostname")
    # Archive name convention: <hostname>-<first-event-timestamp>-audit.log
    archiveFile = hostname[0][0] + '-' + firstEventTime + "-audit.log"
    result = shell.file_exists(auditIns.pathLogFile, archiveFile)
    tempTime = 0
    starttime = time.time()
    # Loop until: file >= ~20 MB threshold, 36000 s elapsed, or archive exists.
    while ((number < 21089520) and (tempTime < 36000) and (result == False)):
        # Each login attempt produces an audit event; batch 9 per poll.
        for i in range(1, 10):
            status, content = rest.validateLogin("Administrator", "password", True, getContent=True)
            tempEventCounter += 1
        number = int (shell.get_data_file_size(filePath))
        currTime = time.time()
        tempTime = int (currTime - starttime)
        result = shell.file_exists(auditIns.pathLogFile, archiveFile)
    self.sleep(30)
    result = shell.file_exists(auditIns.pathLogFile, archiveFile)
    shell.disconnect()
    self.log.info ("--------Total Event Created ---- {0}".format(tempEventCounter))
    self.assertTrue(result, "Archive Audit.log is not created on reaching 20MB threshhold")
def rebalance_in_out_at_once_persistence_stopped(self):
    """Stop disk persistence on some initial nodes, load extra items that
    stay only in memory, rebalance in/out at once, then verify no items
    were lost and that ep_queue_size reflects the unpersisted items."""
    num_nodes_with_stopped_persistence = self.input.param("num_nodes_with_stopped_persistence", 1)
    servs_init = self.servers[:self.nodes_init]
    servs_in = [self.servers[i + self.nodes_init] for i in range(self.nodes_in)]
    servs_out = [self.servers[self.nodes_init - i - 1] for i in range(self.nodes_out)]
    rest = RestConnection(self.master)
    self._wait_for_stats_all_buckets(servs_init)
    # Halt persistence via cbepctl on the selected subset of initial nodes.
    for server in servs_init[:min(num_nodes_with_stopped_persistence, self.nodes_init)]:
        shell = RemoteMachineShellConnection(server)
        for bucket in self.buckets:
            shell.execute_cbepctl(bucket, "stop", "", "", "")
    self.sleep(5)
    self.num_items_without_persistence = self.input.param("num_items_without_persistence", 100000)
    # Extra keys start past the already-loaded range so they are all new.
    gen_extra = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items / 2\
                              , end=self.num_items / 2 + self.num_items_without_persistence)
    self.log.info("current nodes : {0}".format([node.id for node in rest.node_statuses()]))
    self.log.info("adding nodes {0} to cluster".format(servs_in))
    self.log.info("removing nodes {0} from cluster".format(servs_out))
    tasks = self._async_load_all_buckets(self.master, gen_extra, "create", 0, batch_size=1000)
    result_nodes = set(servs_init + servs_in) - set(servs_out)
    # wait timeout in 60 min because MB-7386 rebalance stuck
    self.cluster.rebalance(servs_init[:self.nodes_init], servs_in, servs_out,
                           timeout=self.wait_timeout * 60)
    for task in tasks:
        task.result()
    # Surviving initial nodes should still hold >90% of the unpersisted items in queue.
    self._wait_for_stats_all_buckets(servs_init[:self.nodes_init - self.nodes_out], \
                                     ep_queue_size=self.num_items_without_persistence * 0.9,
                                     ep_queue_size_cond='>')
    self._wait_for_stats_all_buckets(servs_in)
    self._verify_all_buckets(self.master, timeout=None)
    self._verify_stats_all_buckets(result_nodes)
    #verify that curr_items_tot corresponds to sum of curr_items from all nodes
    verified = True
    for bucket in self.buckets:
        verified &= RebalanceHelper.wait_till_total_numbers_match(self.master, bucket)
    self.assertTrue(verified,
                    "Lost items!!! Replication was completed but sum(curr_items) don't match the curr_items_total")
def test_rotateIntervalCluster(self):
    """Set an audit-log rotate interval, wait past it, then verify on every
    node in the cluster that both the rolled archive file and a fresh
    audit.log exist.  The original interval is restored afterwards."""
    intervalSec = self.input.param("intervalSec", None)
    nodes_init = self.input.param("nodes_init", 2)
    auditIns = audit(host=self.master)
    auditIns.setAuditEnable('true')
    originalInt = auditIns.getAuditRotateInterval()
    auditIns.setAuditRotateInterval(intervalSec)
    firstEventTime = []
    try:
        # Record the current first-event timestamp per node; it names the archive.
        for i in range(len(self.servers[:nodes_init])):
            auditTemp = audit(host=self.servers[i])
            firstEventTime.append(self.getTimeStampForFile(auditTemp))
        self.sleep(intervalSec + 20, 'Sleep for log roll over to happen')
        for i in range(len(self.servers[:nodes_init])):
            shell = RemoteMachineShellConnection(self.servers[i])
            rest = RestConnection(self.servers[i])
            # A login generates a fresh audit event after rollover.
            status, content = rest.validateLogin(self.master.rest_username, self.master.rest_password,
                                                 True, getContent=True)
            self.sleep(120, "sleeping for log file creation")
            try:
                hostname = shell.execute_command("hostname")
                self.log.info ("print firstEventTime {0}".format(firstEventTime[i]))
                archiveFile = hostname[0][0] + '-' + firstEventTime[i] + "-audit.log"
                self.log.info ("Archive File Name is {0}".format(archiveFile))
                result = shell.file_exists(auditIns.pathLogFile, archiveFile)
                self.assertTrue(result, "Archive Audit.log is not created on time interval")
                self.log.info ("Validation of archive File created is True, Audit archive File is created {0}".format(archiveFile))
                result = shell.file_exists(auditIns.pathLogFile, auditIns.AUDITLOGFILENAME)
                self.assertTrue(result, "Audit.log is not created as per the roll over time specified")
            finally:
                shell.disconnect()
    finally:
        auditIns.setAuditRotateInterval(originalInt)
def test_folderMisMatchCluster(self):
    """Point the audit log path at a newly created directory, then create a
    bucket on every node and verify the bucket-creation audit event is
    still recorded (checkConfig)."""
    auditIns = audit(host=self.master)
    # BUG FIX: was assigned to 'orginalPath' but read as 'originalPath'
    # on the next line, raising NameError before the test body ran.
    originalPath = auditIns.getAuditLogPath()
    newPath = originalPath + 'testFolderMisMatch'
    shell = RemoteMachineShellConnection(self.servers[0])
    try:
        shell.create_directory(newPath)
        command = 'chown couchbase:couchbase ' + newPath
        shell.execute_command(command)
    finally:
        shell.disconnect()
    # BUG FIX: was 'setsetAuditLogPath' (doubled prefix) — the audit helper
    # exposes setAuditLogPath.  TODO(review): confirm against the audit class.
    auditIns.setAuditLogPath(newPath)
    # NOTE(review): 'source' and 'user' were undefined here (NameError).
    # These defaults mirror the sibling audit tests — confirm they match.
    source = 'ns_server'
    user = self.master.rest_username
    for server in self.servers:
        # BUG FIX: was RestConnection(sever) — undefined name 'sever'.
        rest = RestConnection(server)
        #Create an Event for Bucket Creation
        expectedResults = {'name':'TestBucket ' + server.ip, 'ram_quota':536870912, 'num_replicas':1,
                           'replica_index':False, 'eviction_policy':'value_only', 'type':'membase', \
                           'auth_type':'sasl', "autocompaction":'false', "purge_interval":"undefined", \
                           "flush_enabled":False, "num_threads":3, "source":source, \
                           "user":user, "ip":self.ipAddress, "port":57457, 'sessionid':''}
        rest.create_bucket(expectedResults['name'], expectedResults['ram_quota'] / 1048576,
                           expectedResults['auth_type'], 'password', expectedResults['num_replicas'], \
                           '11211', 'membase', 0, expectedResults['num_threads'],
                           expectedResults['flush_enabled'], 'valueOnly')
        #Check on Events
        try:
            self.checkConfig(self.eventID, self.servers[0], expectedResults)
        except:
            self.log.info ("Issue reading the file at Node {0}".format(server.ip))
def test_invalidLogPathCluster(self):
    """Attempt to set the audit log path to a non-existent directory and
    verify the REST call is rejected with the expected error message."""
    audit_helper = audit(host=self.master)
    bogus_path = audit_helper.getAuditLogPath() + 'test'
    rest_conn = RestConnection(self.master)
    status, content = rest_conn.setAuditSettings(logPath=bogus_path)
    # The server must refuse the path and report a per-field error.
    self.assertFalse(status, "Audit is able to set invalid path")
    self.assertEqual(content['errors']['logPath'],
                     'The value must be a valid directory',
                     'No error or error changed')
def test_rotateInterval(self):
    """Set an audit rotate interval on the master, wait past it, and verify
    both the timestamped archive file and a fresh audit.log exist.  The
    original interval is restored afterwards."""
    intervalSec = self.input.param("intervalSec", None)
    auditIns = audit(host=self.master)
    rest = RestConnection(self.master)
    originalInt = auditIns.getAuditRotateInterval()
    try:
        # The first-event timestamp names the rolled archive file.
        firstEventTime = self.getTimeStampForFile(auditIns)
        self.log.info ("first time evetn is {0}".format(firstEventTime))
        auditIns.setAuditRotateInterval(intervalSec)
        self.sleep(intervalSec + 20, 'Sleep for log roll over to happen')
        # A login generates a fresh audit event after the rollover.
        status, content = rest.validateLogin(self.master.rest_username, self.master.rest_password,
                                             True, getContent=True)
        self.sleep(120)
        shell = RemoteMachineShellConnection(self.master)
        try:
            hostname = shell.execute_command("hostname")
            archiveFile = hostname[0][0] + '-' + firstEventTime + "-audit.log"
            self.log.info ("Archive File Name is {0}".format(archiveFile))
            result = shell.file_exists(auditIns.pathLogFile, archiveFile)
            self.assertTrue(result, "Archive Audit.log is not created on time interval")
            self.log.info ("Validation of archive File created is True, Audit archive File is created {0}".format(archiveFile))
            result = shell.file_exists(auditIns.pathLogFile, auditIns.AUDITLOGFILENAME)
            self.assertTrue(result, "Audit.log is not created when memcached server is killed")
        finally:
            shell.disconnect()
    finally:
        auditIns.setAuditRotateInterval(originalInt)
def test_root_crt_rotate_cluster(self):
    """Rotate the root CA certificate on a live cluster while documents are
    being written, and verify SSL connections work before and after."""
    rest = RestConnection(self.master)
    x509main(self.master).setup_master()
    x509main().setup_cluster_nodes_ssl(self.servers)
    rest.create_bucket(bucket='default', ramQuotaMB=100)
    self.sleep(30)
    servers_in = self.servers[1:]
    self.cluster.rebalance(self.servers, servers_in, [])
    for server in self.servers:
        result = self._sdk_connection(host_ip=server.ip)
        # NOTE(review): below _sdk_connection is unpacked as (result, cb);
        # here the whole return value is asserted — confirm return shape.
        self.assertTrue(result,"Can create a ssl connection with correct certificate")
    result,cb = self._sdk_connection(host_ip=self.master.ip)
    # Keep a write workload running while the root cert is regenerated.
    create_docs = Thread(name='create_docs', target=self.createBulkDocuments, args=(cb,))
    create_docs.start()
    x509main(self.master)._delete_inbox_folder()
    x509main(self.master)._generate_cert(self.servers,root_cn="CB\ Authority")
    x509main(self.master).setup_master()
    x509main().setup_cluster_nodes_ssl(self.servers,reload_cert=True)
    create_docs.join()
    # SSL connections must still succeed with the rotated certificate.
    for server in self.servers:
        result = self._sdk_connection(host_ip=server.ip)
        self.assertTrue(result,"Can create a ssl connection with correct certificate")
def _create_buckets(self, nodes):
    """Create the configured default, sasl and standard buckets on the
    first node, sizing each from the computed quota unless an explicit
    per-kind quota override is set."""
    master_node = nodes[0]
    num_buckets = 0
    if self._default_bucket:
        num_buckets += 1
    num_buckets += self._sasl_buckets + self._standard_buckets
    # Even split of the memory quota across all requested buckets.
    bucket_size = self._get_bucket_size(master_node, nodes, self._mem_quota_int, num_buckets)
    rest = RestConnection(master_node)
    master_id = rest.get_nodes_self().id
    if self._default_bucket:
        if self._default_quota != 0:
            # Explicit default-bucket quota overrides the computed size.
            bucket_size = self._default_quota
        rest = RestConnection(nodes[0])
        rest.create_bucket(
            bucket=self.default_bucket_name,
            ramQuotaMB=bucket_size,
            replicaNumber=self._num_replicas,
            proxyPort=11211,
            authType="none",
            saslPassword=None,
        )
        self._buckets.append(self.default_bucket_name)
    if self._sasl_buckets > 0:
        if self._sasl_quota != 0:
            bucket_size = self._sasl_quota
        self._create_sasl_buckets(master_node, master_id, bucket_size, password="******")
    if self._standard_buckets > 0:
        if self._standard_quota != 0:
            bucket_size = self._standard_quota
        self._create_standard_buckets(master_node, master_id, bucket_size)
def test_compare_views_all_nodes_x_docs(self):
    """Create a spatial view over N docs, query it on every cluster node
    (not just the master) and verify each node returns the same keys."""
    num_docs = self.helper.input.param("num-docs", 100)
    self.log.info("description : creates view on {0} documents, queries "
                  "all nodes (not only the master node) and compares "
                  "if the results are all the same"\
                  .format(num_docs))
    design_name = "dev_test_compare_views_{0}_docs".format(num_docs)
    # Random prefix keeps this run's keys distinct from earlier runs.
    prefix = str(uuid.uuid4())[:7]
    inserted_keys = self._setup_index(design_name, num_docs, prefix)
    nodes = self.helper.rest.get_nodes()
    params = {"connection_timeout": 60000, "full_set": True}
    # Query every single node and verify
    for n in nodes:
        n_rest = RestConnection({
            "ip": n.ip,
            "port": n.port,
            "username": self.helper.master.rest_username,
            "password": self.helper.master.rest_password})
        results = n_rest.spatial_results(self.helper.bucket, design_name, params, None)
        result_keys = self.helper.get_keys(results)
        self.helper.verify_result(inserted_keys, result_keys)
def test_get_cluster_ca_self_signed(self):
    """Regenerate the cluster certificate and verify the reported cluster
    CA certificate is of the self-signed ("generated") type."""
    master_rest = RestConnection(self.master)
    master_rest.regenerate_cluster_certificate()
    status, content, header = x509main(self.master)._get_cluster_ca_cert()
    ca_info = json.loads(content)
    content = ca_info
    self.assertTrue(status,"Issue while Cluster CA Cert")
    # A regenerated (non-uploaded) cert must be reported as "generated".
    self.assertEqual(content['cert']['type'],"generated","Type of certificate is mismatch")
def items_verification(test, master):
    """Assert, for every bucket, that curr_items_tot matches the sum of
    per-node curr_items within the timeout window."""
    rest = RestConnection(master)
    # Verify items count across all node
    timeout = 600
    all_buckets = rest.get_buckets()
    for bucket in all_buckets:
        matched = RebalanceHelper.wait_till_total_numbers_match(
            master, bucket.name, timeout_in_seconds=timeout)
        test.assertTrue(matched, "Lost items!!.. failing test in {0} secs".format(timeout))
def setUp(self):
    """XDCR fixture: wait for every node to leave warmup, drop all existing
    buckets, create default buckets on source/destination, and log in."""
    super(XDCRTests, self).setUp()
    self.bucket = Bucket()
    self._initialize_nodes()
    self.master = self.servers[0]
    for server in self.servers:
        rest=RestConnection(server)
        cluster_status = rest.cluster_status()
        self.log.info("Initial status of {0} cluster is {1}".format(server.ip,
                                                                    cluster_status['nodes'][0]['status']))
        # Poll until the node reports a non-warmup state.
        while cluster_status['nodes'][0]['status'] == 'warmup':
            self.log.info("Waiting for cluster to become healthy")
            self.sleep(5)
            cluster_status = rest.cluster_status()
        self.log.info("current status of {0} is {1}".format(server.ip,
                                                            cluster_status['nodes'][0]['status']))
    # Delete all buckets before creating new buckets
    self.log.info("Deleting all existing buckets")
    BucketOperationHelper.delete_all_buckets_or_assert(self.servers, self)
    self.log.info("Creating new buckets")
    src_bucket = self.input.param('src_bucket', self.bucket)
    dest_bucket = self.input.param('dest_bucket', self.bucket)
    if src_bucket:
        RestConnection(self.servers[0]).create_bucket(bucket='default', ramQuotaMB=500)
    if dest_bucket:
        RestConnection(self.servers[1]).create_bucket(bucket='default', ramQuotaMB=500)
    helper = BaseHelper(self)
    helper.login()
def verifyLdapSettings(self, server, admins, ro_admins, default, enabled):
    """Compare the server's LDAP auth settings against the expected admin
    and read-only-admin lists.

    `admins`/`ro_admins` are comma-separated strings or None; `default`
    ("admins"/"roadmins") marks which group was configured as wildcard
    ("asterisk") when enabled is "1".  Returns True when all match.
    """
    rest = RestConnection(server)
    settings = rest.ldapRestOperationGetResponse()
    if admins is None:
        admins = []
    else:
        admins = admins.split(",")
    if ro_admins is None:
        ro_admins = []
    else:
        ro_admins = ro_admins.split(",")
    # Disabled LDAP means both lists must be empty.
    if str(enabled) == "0":
        admins = []
        ro_admins = []
    if default == "admins" and str(enabled) == "1":
        if settings["admins"] != "asterisk":
            log.info("Admins don't match (%s vs asterisk)", settings["admins"])
            return False
    elif not self._list_compare(settings["admins"], admins):
        log.info("Admins don't match (%s vs %s)", settings["admins"], admins)
        return False
    if default == "roadmins" and str(enabled) == "1":
        if settings["roAdmins"] != "asterisk":
            log.info("Read only admins don't match (%s vs asterisk)", settings["roAdmins"])
            return False
    elif not self._list_compare(settings["roAdmins"], ro_admins):
        log.info("Read only admins don't match (%s vs %s)", settings["roAdmins"], ro_admins)
        return False
    return True
def verifyIndexSettings(self, server, max_rollbacks, stable_snap_interval, mem_snap_interval,
                        storage_mode, threads, log_level):
    """Compare the global index settings on `server` against the expected
    values; falsy expected values are skipped.  Returns True when all
    provided values match."""
    rest = RestConnection(server)
    settings = rest.get_global_index_settings()
    # CLI aliases -> REST storageMode values.
    if storage_mode == "default":
        storage_mode = "forestdb"
    elif storage_mode == "memopt":
        storage_mode = "memory_optimized"
    if max_rollbacks and str(settings["maxRollbackPoints"]) != str(max_rollbacks):
        log.info("Max rollbacks does not match (%s vs. %s)",
                 str(settings["maxRollbackPoints"]), str(max_rollbacks))
        return False
    if stable_snap_interval and str(settings["stableSnapshotInterval"]) != str(stable_snap_interval):
        log.info("Stable snapshot interval does not match (%s vs. %s)",
                 str(settings["stableSnapshotInterval"]), str(stable_snap_interval))
        return False
    if mem_snap_interval and str(settings["memorySnapshotInterval"]) != str(mem_snap_interval):
        log.info("Memory snapshot interval does not match (%s vs. %s)",
                 str(settings["memorySnapshotInterval"]), str(mem_snap_interval))
        return False
    if storage_mode and str(settings["storageMode"]) != str(storage_mode):
        log.info("Storage mode does not match (%s vs. %s)",
                 str(settings["storageMode"]), str(storage_mode))
        return False
    if threads and str(settings["indexerThreads"]) != str(threads):
        log.info("Threads does not match (%s vs. %s)",
                 str(settings["indexerThreads"]), str(threads))
        return False
    if log_level and str(settings["logLevel"]) != str(log_level):
        log.info("Log level does not match (%s vs. %s)",
                 str(settings["logLevel"]), str(log_level))
        return False
    return True
def common_test_body(self, replica, load_ratio, timeout=10):
    """Churn the cluster for `timeout` minutes: repeatedly rebalance a
    random number of free nodes in, and — when at least 3/4 of the servers
    are in — rebalance a random number out."""
    log = logger.Logger.get_logger()
    start_time = time.time()
    log.info("replica : {0}".format(replica))
    log.info("load_ratio : {0}".format(load_ratio))
    master = self._servers[0]
    log.info('picking server : {0} as the master'.format(master))
    rest = RestConnection(master)
    while time.time() < (start_time + 60 * timeout):
        #rebalance out step nodes
        #let's add some items ?
        nodes = rest.node_statuses()
        # delta = number of configured servers not yet in the cluster.
        delta = len(self._servers) - len(nodes)
        if delta > 0:
            if delta > 1:
                how_many_add = Random().randint(1, delta)
            else:
                how_many_add = 1
            self.log.info("going to add {0} nodes".format(how_many_add))
            self.rebalance_in(how_many=how_many_add)
        else:
            self.log.info("all nodes already joined the cluster")
            time.sleep(30 * 60)
        #dont rebalance out if there are not too many nodes
        if len(nodes) >= (3.0 / 4.0 * len(self._servers)):
            nodes = rest.node_statuses()
            how_many_out = Random().randint(1, len(nodes) - 1)
            self.log.info("going to remove {0} nodes".format(how_many_out))
            self.rebalance_out(how_many=how_many_out)
def verifyServices(self, server, expected_services):
    """Verifies that the services on a given node match the expected service

    Options:
    server - A TestInputServer object of the server to connect to
    expected_services - A comma separated list of services

    Returns a boolean corresponding to whether or not the expected
    services are available on the server.
    """
    rest = RestConnection(server)
    hostname = "%s:%s" % (server.ip, server.port)
    # Map CLI service aliases to their REST names.
    expected_services = expected_services.replace("data", "kv")
    expected_services = expected_services.replace("query", "n1ql")
    expected_services = expected_services.split(",")
    nodes_services = rest.get_nodes_services()
    for node, services in nodes_services.iteritems():
        if node.encode('ascii') == hostname:
            # Same length + full membership == exact (unordered) match.
            if len(services) != len(expected_services):
                log.info("Services on %s do not match expected services (%s vs. %s)",
                         hostname, services, expected_services)
                return False
            for service in services:
                if service.encode("ascii") not in expected_services:
                    log.info("Services on %s do not match expected services (%s vs. %s)",
                             hostname, services, expected_services)
                    return False
            return True
    log.info("Services on %s not found, the server may not exist", hostname)
    return False
def create_bucket(serverInfo, name='default', replica=1, port=11210, test_case=None,
                  bucket_ram=-1, password=None):
    """Create a bucket via REST and wait for it to appear.

    bucket_ram < 0 means "use 2/3 of the node's memory quota".  Returns
    True when the bucket shows up, False otherwise (failing `test_case`
    if one was supplied).
    """
    log = logger.Logger.get_logger()
    rest = RestConnection(serverInfo)
    if bucket_ram < 0:
        info = rest.get_nodes_self()
        # Py2 integer arithmetic: two thirds of the node quota.
        bucket_ram = info.memoryQuota * 2 / 3
    # NOTE(review): authType mapping looks inverted (no password -> "sasl",
    # password -> "none") — confirm against RestConnection.create_bucket
    # before changing; behavior preserved here.
    if password == None:
        authType = "sasl"
    else:
        authType = "none"
    rest.create_bucket(bucket=name, ramQuotaMB=bucket_ram, replicaNumber=replica,
                       proxyPort=port, authType=authType, saslPassword=password)
    msg = 'create_bucket succeeded but bucket "{0}" does not exist'
    bucket_created = BucketOperationHelper.wait_for_bucket_creation(name, rest)
    if not bucket_created:
        log.error(msg)
        if test_case:
            test_case.fail(msg=msg.format(name))
    return bucket_created
def backup(self):
    """Periodically back up the 'default' bucket from every cluster node to
    /tmp until a stop message arrives on self.queue."""
    while True:
        try:
            # Non-blocking check for a shutdown message; anything queued stops the loop.
            x = self.queue.get_nowait()
            self.log.info("get_nowait : {0}".format(x))
            break
        #things are notmal just do another back aafter
        #waiting for self.interval
        except Exception:
            master = self.servers[0]
            rest = RestConnection(master)
            nodes = rest.node_statuses()
            # NOTE: 'map' shadows the builtin; it is a node -> server dict here.
            map = self.node_server_map(nodes, self.servers)
            self.log.info("cluster has {0} nodes".format(len(nodes)))
            for node in nodes:
                try:
                    # re-seed PyCrypto RNG after fork (required when run in subprocesses)
                    from Crypto.Random import atfork
                    atfork()
                    BackupHelper(map[node]).backup('default', "/tmp")
                    # NOTE(review): duplicate backup call — looks unintentional; confirm.
                    BackupHelper(map[node]).backup('default', "/tmp")
                except Exception as ex:
                    print ex
            self.log.info("backed up the data into ")
            time.sleep(self.interval)
def verifyRecoveryType(self, server, recovery_servers, recovery_type):
    """Check that every server in the comma-separated `recovery_servers`
    list is present in the server-group info with the given recoveryType.
    An empty/None list trivially passes."""
    rest = RestConnection(server)
    settings = rest.get_all_zones_info()
    if not settings or "groups" not in settings:
        log.info("Group settings payload appears to be invalid")
        return False
    if not recovery_servers:
        return True
    num_found = 0
    recovery_servers = recovery_servers.split(",")
    for group in settings["groups"]:
        for node in group["nodes"]:
            for rs in recovery_servers:
                if node["hostname"] == rs:
                    if node["recoveryType"] != recovery_type:
                        log.info("Node %s doesn't contain recovery type %s ", rs, recovery_type)
                        return False
                    else:
                        num_found = num_found + 1
    # Every listed server must have been seen with the right recovery type.
    if num_found == len(recovery_servers):
        return True
    log.info("Node `%s` not found in nodes list", ",".join(recovery_servers))
    return False
def create_primary_index_for_3_0_and_greater(self):
    """Ensure every bucket has a #primary index (creating it with the
    configured index type) on clusters running 3.x/4.x/5.x, optionally
    dropping an existing primary index first.

    NOTE(review): the version check only matches 3/4/5 prefixes — newer
    major versions would be skipped; confirm intent before extending.
    """
    self.log.info("CREATE PRIMARY INDEX using %s" % self.primary_indx_type)
    rest = RestConnection(self.master)
    versions = rest.get_nodes_versions()
    if versions[0].startswith("4") or versions[0].startswith("3") or versions[0].startswith("5"):
        for bucket in self.buckets:
            if self.primary_indx_drop:
                self.log.info("Dropping primary index for %s using %s ..." % (bucket.name,self.primary_indx_type))
                self.query = "DROP PRIMARY INDEX ON %s USING %s" % (bucket.name,self.primary_indx_type)
                #self.run_cbq_query()
                self.sleep(3, 'Sleep for some time after index drop')
            # Check whether a #primary index already exists for this bucket.
            self.query = 'select * from system:indexes where name="#primary" and keyspace_id = "%s"' % bucket.name
            res = self.run_cbq_query()
            self.sleep(10)
            if self.monitoring:
                self.query = "delete from system:completed_requests"
                self.run_cbq_query()
            if not self.skip_primary_index:
                if (res['metrics']['resultCount'] == 0):
                    self.query = "CREATE PRIMARY INDEX ON %s USING %s" % (bucket.name, self.primary_indx_type)
                    self.log.info("Creating primary index for %s ..." % bucket.name)
                    try:
                        self.run_cbq_query()
                        self.primary_index_created = True
                        # GSI builds are async; wait until the index is online.
                        if self.primary_indx_type.lower() == 'gsi':
                            self._wait_for_index_online(bucket, '#primary')
                    except Exception, ex:
                        self.log.info(str(ex))
def setUp(self):
    """RBAC fixture: create the test bucket(s) and provision users through
    the configured auth backend (ldap / pam / builtin)."""
    super(RbacTestMemcached, self).setUp()
    rest = RestConnection(self.master)
    self.auth_type = self.input.param('auth_type','builtin')
    self.user_id = self.input.param("user_id",None)
    self.user_role = self.input.param("user_role",None)
    self.bucket_name = self.input.param("bucket_name",None)
    rest.create_bucket(bucket=self.bucket_name, ramQuotaMB=100,lww=True)
    self.role_map = self.input.param("role_map",None)
    self.incorrect_bucket = self.input.param("incorrect_bucket",False)
    self.new_role = self.input.param("new_role",None)
    self.new_role_map = self.input.param("new_role_map",None)
    self.no_bucket_access = self.input.param("no_bucket_access",False)
    self.no_access_bucket_name = self.input.param("no_access_bucket_name","noaccess")
    self.all_buckets = self.input.param("all_buckets",None)
    self.ldap_users = rbacmain().returnUserList(self.user_id)
    if self.no_bucket_access:
        # Secondary bucket the test users must NOT be able to reach.
        rest.create_bucket(bucket=self.no_access_bucket_name, ramQuotaMB=100, lww=True)
    if self.auth_type == 'ldap':
        # Remove the builtin user so it cannot mask LDAP authentication.
        rbacmain(self.master, 'builtin')._delete_user('cbadminbucket')
    if self.auth_type == 'ldap':
        rbacmain().setup_auth_mechanism(self.servers,'ldap',rest)
        for user in self.ldap_users:
            testuser = [{'id': user[0], 'name': user[0], 'password': user[1]}]
            RbacBase().create_user_source(testuser, 'ldap', self.master)
            self.sleep(10)
    elif self.auth_type == "pam":
        rbacmain().setup_auth_mechanism(self.servers,'pam', rest)
        # Re-create OS-level users: delete any stale ones, then add fresh.
        rbacmain().add_remove_local_user(self.servers, self.ldap_users, 'deluser')
        rbacmain().add_remove_local_user(self.servers, self.ldap_users,'adduser')
    elif self.auth_type == "builtin":
        for user in self.ldap_users:
            testuser = [{'id': user[0], 'name': user[0], 'password': user[1]}]
            RbacBase().create_user_source(testuser, 'builtin', self.master)
            self.sleep(10)
def wait_for_vbuckets_ready_state(node, bucket, timeout_in_seconds=300, log_msg='', admin_user='******', admin_pass='******'):
    """Poll every memcached node until all vbuckets of *bucket* report a
    ready state, or *timeout_in_seconds* elapses.

    :param node: any cluster node (used to build the REST connection)
    :param bucket: bucket name to check
    :param timeout_in_seconds: overall polling deadline
    :param log_msg: caller-supplied prefix for error log lines
    :param admin_user/admin_pass: credentials for post-5.0 (RBAC) auth
    :return: True when every vbucket became ready within the timeout.
    """
    log = logger.Logger.get_logger()
    start_time = time.time()
    end_time = start_time + timeout_in_seconds
    ready_vbuckets = {}
    rest = RestConnection(node)
    servers = rest.get_nodes()
    RestHelper(rest).vbucket_map_ready(bucket, 60)
    vbucket_count = len(rest.get_vbuckets(bucket))
    vbuckets = rest.get_vbuckets(bucket)
    obj = VBucketAwareMemcached(rest, bucket)
    memcacheds, vbucket_map, vbucket_map_replica = obj.request_map(rest, bucket)
    # Create dictionary with key:"ip:port" and value: a list of vbuckets
    server_dict = defaultdict(list)
    for everyID in range(0, vbucket_count):
        memcached_ip_port = str(vbucket_map[everyID])
        server_dict[memcached_ip_port].append(everyID)
    while time.time() < end_time and len(ready_vbuckets) < vbucket_count:
        for every_ip_port in server_dict:
            # Retrieve memcached ip and port
            ip = every_ip_port.rsplit(":", 1)[0]
            port = every_ip_port.rsplit(":", 1)[1]
            client = MemcachedClient(ip, int(port), timeout=30)
            client.vbucket_count = len(vbuckets)
            bucket_info = rest.get_bucket(bucket)
            cluster_compatibility = rest.check_cluster_compatibility("5.0")
            if cluster_compatibility is None:
                pre_spock = True
            else:
                pre_spock = not cluster_compatibility
            if pre_spock:
                # Pre-5.0 clusters authenticate per-bucket with SASL
                log.info("Atleast 1 of the server is on pre-spock "
                         "version. Using the old ssl auth to connect to "
                         "bucket.")
                client.sasl_auth_plain(
                    bucket_info.name.encode('ascii'),
                    bucket_info.saslPassword.encode('ascii'))
            else:
                client.sasl_auth_plain(admin_user, admin_pass)
            bucket = bucket.encode('ascii')
            client.bucket_select(bucket)
            for i in server_dict[every_ip_port]:
                try:
                    (a, b, c) = client.get_vbucket_state(i)
                except mc_bin_client.MemcachedError as e:
                    ex_msg = str(e)
                    # BUG FIX: the "Not my vbucket" check and truncation must
                    # apply to the exception text (which embeds the entire
                    # vBucketMap), not to the caller-supplied log_msg.
                    if "Not my vbucket" in ex_msg:
                        ex_msg = ex_msg[:ex_msg.find("vBucketMap") + 12] + "..."
                    if e.status == memcacheConstants.ERR_NOT_MY_VBUCKET:
                        # May receive this while waiting for vbuckets,
                        # continue and retry...
                        continue
                    log.error("%s: %s" % (log_msg, ex_msg))
                    continue
                except exceptions.EOFError:
                    # The client was disconnected for some reason. This can
                    # happen just after the bucket REST API is returned (before
                    # the buckets are created in each of the memcached processes.)
                    # See here for some details: http://review.couchbase.org/#/c/49781/
                    # Longer term when we don't disconnect clients in this state we
                    # should probably remove this code.
                    log.error("got disconnected from the server, reconnecting")
                    client.reconnect()
                    client.sasl_auth_plain(
                        bucket_info.name.encode('ascii'),
                        bucket_info.saslPassword.encode('ascii'))
                    continue
                # State bytes \x01 / \x02 indicate the vbucket is ready
                if c.find("\x01") > 0 or c.find("\x02") > 0:
                    ready_vbuckets[i] = True
                elif i in ready_vbuckets:
                    log.warning(
                        "vbucket state changed from active to {0}".format(c))
                    del ready_vbuckets[i]
            client.close()
    return len(ready_vbuckets) == vbucket_count
def _log_start(self):
    """Best-effort: record a "test started" marker in the server's client-error log."""
    try:
        start_marker = "{0} : {1} started ".format(
            datetime.datetime.now(), self._testMethodName)
        RestConnection(self.servers[0]).log_client_error(start_marker)
    except:
        # The marker is purely informational; never let it fail the test.
        pass
def getOtpNodeIds(master):
    """Return the otpNode id of every node in the cluster *master* belongs to."""
    return [status.id for status in RestConnection(master).node_statuses()]
def rebalance_in(servers, how_many, do_shuffle=True, monitor=True, do_check=True):
    """Add up to *how_many* of servers[1:] to the cluster rooted at
    servers[0] and start a rebalance.

    :param do_shuffle: randomize which candidate servers are picked
    :param monitor: when True, block until the rebalance finishes
    :param do_check: raise if fewer than *how_many* free servers are found
    :return: (success, servers_rebalanced) tuple
    """
    servers_rebalanced = []
    log = logger.Logger.get_logger()
    rest = RestConnection(servers[0])
    nodes = rest.node_statuses()
    #are all ips the same
    nodes_on_same_ip = True
    firstIp = nodes[0].ip
    if len(nodes) == 1:
        nodes_on_same_ip = False
    else:
        for node in nodes:
            if node.ip != firstIp:
                nodes_on_same_ip = False
                break
    nodeIps = ["{0}:{1}".format(node.ip, node.port) for node in nodes]
    log.info("current nodes : {0}".format(nodeIps))
    toBeAdded = []
    master = servers[0]
    selection = servers[1:]
    if do_shuffle:
        shuffle(selection)
    for server in selection:
        if nodes_on_same_ip:
            # Single-host (cluster_run style) setup: nodes differ by port only
            if not "{0}:{1}".format(firstIp, server.port) in nodeIps:
                toBeAdded.append(server)
                servers_rebalanced.append(server)
                log.info("choosing {0}:{1}".format(server.ip, server.port))
        elif not "{0}:{1}".format(server.ip, server.port) in nodeIps:
            toBeAdded.append(server)
            servers_rebalanced.append(server)
            log.info("choosing {0}:{1}".format(server.ip, server.port))
        if len(toBeAdded) == int(how_many):
            break
    if do_check and len(toBeAdded) < how_many:
        raise Exception(
            "unable to find {0} nodes to rebalance_in".format(how_many))
    for server in toBeAdded:
        otpNode = rest.add_node(master.rest_username, master.rest_password,
                                server.ip, server.port)
    otpNodes = [node.id for node in rest.node_statuses()]
    started = rest.rebalance(otpNodes, [])
    msg = "rebalance operation started ? {0}"
    log.info(msg.format(started))
    if monitor is not True:
        # Fire-and-forget mode: report success of the start call only
        return True, servers_rebalanced
    if started:
        try:
            result = rest.monitorRebalance()
        except RebalanceFailedException as e:
            log.error("rebalance failed: {0}".format(e))
            return False, servers_rebalanced
        msg = "successfully rebalanced in selected nodes from the cluster ? {0}"
        log.info(msg.format(result))
        return result, servers_rebalanced
    return False, servers_rebalanced
def verify_items_count(master, bucket, num_attempt=3, timeout=2):
    """Verify that summed per-node curr_items times (replica_factor + 1)
    matches the bucket's curr_items_tot reported by the master.

    :param master: cluster node used for all REST calls
    :param bucket: Bucket object or bucket name
    :param num_attempt/timeout: passed through to rest.get_bucket
    :return: True when no items are missing (delta == 0), else False
    :raises StatsUnavailableException: if any node's stats are unavailable
    """
    #get the #of buckets from rest
    rest = RestConnection(master)
    if isinstance(bucket, Bucket):
        bucket = bucket.name
    bucket_info = rest.get_bucket(bucket, num_attempt, timeout)
    replica_factor = bucket_info.numReplicas
    vbucket_active_sum = 0
    vbucket_replica_sum = 0
    vbucket_pending_sum = 0
    all_server_stats = []
    stats_received = True
    nodes = rest.get_nodes()
    for server in nodes:
        #get the stats
        server_stats = rest.get_bucket_stats_for_node(bucket, server)
        if not server_stats:
            log.info("unable to get stats from {0}:{1}".format(
                server.ip, server.port))
            stats_received = False
        all_server_stats.append((server, server_stats))
    if not stats_received:
        raise StatsUnavailableException()
    # Renamed from `sum` to avoid shadowing the builtin
    items_total = 0
    for server, single_stats in all_server_stats:
        if not single_stats or "curr_items" not in single_stats:
            continue
        items_total += single_stats["curr_items"]
        log.info("curr_items from {0}:{1} : {2}".format(server.ip, server.port, \
            single_stats["curr_items"]))
        if 'vb_pending_num' in single_stats:
            vbucket_pending_sum += single_stats['vb_pending_num']
            log.info(
                "vb_pending_num from {0}:{1} : {2}".format(server.ip, server.port, \
                    single_stats["vb_pending_num"]))
        if 'vb_active_num' in single_stats:
            vbucket_active_sum += single_stats['vb_active_num']
            log.info(
                "vb_active_num from {0}:{1} : {2}".format(server.ip, server.port, \
                    single_stats["vb_active_num"]))
        if 'vb_replica_num' in single_stats:
            vbucket_replica_sum += single_stats['vb_replica_num']
            log.info(
                "vb_replica_num from {0}:{1} : {2}".format(server.ip, server.port, \
                    single_stats["vb_replica_num"]))
    msg = "summation of vb_active_num : {0} vb_pending_num : {1} vb_replica_num : {2}"
    log.info(
        msg.format(vbucket_active_sum, vbucket_pending_sum, vbucket_replica_sum))
    msg = 'sum : {0} and sum * (replica_factor + 1) ({1}) : {2}'
    log.info(
        msg.format(items_total, replica_factor + 1, (items_total * (replica_factor + 1))))
    master_stats = rest.get_bucket_stats(bucket)
    if "curr_items_tot" in master_stats:
        log.info('curr_items_tot from master: {0}'.format(
            master_stats["curr_items_tot"]))
    else:
        # BUG FIX: format field was "{O}" (letter O), which raised
        # KeyError: 'O' instead of producing the intended message.
        raise Exception(
            "bucket {0} stats doesnt contain 'curr_items_tot':".format(
                bucket))
    if replica_factor >= len(nodes):
        # Not enough nodes to hold all replicas; scale by node count instead
        log.warn("the number of nodes is less than replica requires")
        delta = items_total * (len(nodes)) - master_stats["curr_items_tot"]
    else:
        delta = items_total * (replica_factor + 1) - master_stats["curr_items_tot"]
    delta = abs(delta)
    if delta > 0:
        if items_total == 0:
            missing_percentage = 0
        else:
            missing_percentage = delta * 1.0 / (items_total * (replica_factor + 1))
        log.info("Nodes stats are: {0}".format([node.ip for node in nodes]))
    else:
        missing_percentage = 1
    log.info("delta : {0} missing_percentage : {1} replica_factor : {2}".format(delta, \
        missing_percentage, replica_factor))
    # If no items missing then, return True
    if not delta:
        return True
    return False
def wait_for_mc_stats_all_nodes(master, bucket, stat_key, stat_value,
                                timeout_in_seconds=120, verbose=True):
    """Poll memcached stats on every active node and wait until the sum of
    *stat_key* across nodes equals *stat_value*.

    The timeout clock only runs while the summed value is NOT changing; any
    change resets it. Returns True when the target value was reached.
    """
    log.info("waiting for bucket {0} stat : {1} to match {2} on {3}".format(bucket, stat_key, \
        stat_value, master.ip))
    time_to_timeout = 0
    previous_stat_value = -1
    curr_stat_value = -1
    verified = False
    all_stats = {}
    while not verified:
        rest = RestConnection(master)
        nodes = rest.node_statuses()
        for node in nodes:
            _server = {
                "ip": node.ip,
                "port": node.port,
                "username": master.rest_username,
                "password": master.rest_password
            }
            #failed over node is part of node_statuses but since its failed over memcached connections
            #to this node will fail
            node_self = RestConnection(_server).get_nodes_self()
            if node_self.clusterMembership == 'active':
                mc = MemcachedClientHelper.direct_client(_server, bucket)
                n_stats = mc.stats("")
                mc.close()
                all_stats[node.id] = n_stats
        # Sum the requested stat across all nodes that reported it.
        # -1 acts as the "nothing collected yet" sentinel.
        actual_stat_value = -1
        for k in all_stats:
            if all_stats[k] and stat_key in all_stats[k]:
                if actual_stat_value == -1:
                    log.info(all_stats[k][stat_key])
                    actual_stat_value = int(all_stats[k][stat_key])
                else:
                    actual_stat_value += int(all_stats[k][stat_key])
        if actual_stat_value == stat_value:
            log.info("{0} : {1}".format(stat_key, actual_stat_value))
            verified = True
            break
        else:
            if verbose:
                log.info("{0} : {1}".format(stat_key, actual_stat_value))
            curr_stat_value = actual_stat_value
            # values are changing so clear any timeout
            if curr_stat_value != previous_stat_value:
                time_to_timeout = 0
            else:
                # value is stuck: arm the deadline, then give up once passed
                if time_to_timeout == 0:
                    time_to_timeout = time.time() + timeout_in_seconds
                if time_to_timeout < time.time():
                    log.info(
                        "no change in {0} stat after {1} seconds (value = {2})"
                        .format(stat_key, timeout_in_seconds, curr_stat_value))
                    break
            previous_stat_value = curr_stat_value
            # Poll fast when quiet, slower when logging each sample
            if not verbose:
                time.sleep(0.1)
            else:
                time.sleep(2)
    return verified
def test_rollback(self):
    """XDCR rollback scenario: force a DCP rollback on both clusters by
    stopping persistence, loading data, killing memcached on the masters
    and failing over their replicas; then verify replication recovers and
    goxdcr logged the rollback on both sides.
    """
    bucket = self.src_cluster.get_buckets()[0]
    src_nodes = self.src_cluster.get_nodes()
    dest_nodes = self.dest_cluster.get_nodes()
    nodes = src_nodes + dest_nodes
    # Stop Persistence on Node A & Node B
    for node in nodes:
        mem_client = MemcachedClientHelper.direct_client(node, bucket)
        mem_client.stop_persistence()
    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
        + '/goxdcr.log*'
    self.setup_xdcr()
    # Pause replication while loading so mutations sit only in memory
    self.src_cluster.pause_all_replications()
    self.dest_cluster.pause_all_replications()
    gen = BlobGenerator("C1-", "C1-", self._value_size, end=self._num_items)
    self.src_cluster.load_all_buckets_from_generator(gen)
    gen = BlobGenerator("C2-", "C2-", self._value_size, end=self._num_items)
    self.dest_cluster.load_all_buckets_from_generator(gen)
    self.src_cluster.resume_all_replications()
    self.dest_cluster.resume_all_replications()
    # Perform mutations on the bucket
    self.async_perform_update_delete()
    rest1 = RestConnection(self.src_cluster.get_master_node())
    rest2 = RestConnection(self.dest_cluster.get_master_node())
    # Fetch count of docs in src and dest cluster
    _count1 = rest1.fetch_bucket_stats(
        bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    _count2 = rest2.fetch_bucket_stats(
        bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    self.log.info(
        "Before rollback src cluster count = {0} dest cluster count = {1}".
        format(_count1, _count2))
    # Kill memcached on Node A so that Node B becomes master
    shell = RemoteMachineShellConnection(
        self.src_cluster.get_master_node())
    shell.kill_memcached()
    shell = RemoteMachineShellConnection(
        self.dest_cluster.get_master_node())
    shell.kill_memcached()
    # Start persistence on Node B
    mem_client = MemcachedClientHelper.direct_client(src_nodes[1], bucket)
    mem_client.start_persistence()
    mem_client = MemcachedClientHelper.direct_client(dest_nodes[1], bucket)
    mem_client.start_persistence()
    # Failover Node B
    failover_task = self.src_cluster.async_failover()
    failover_task.result()
    failover_task = self.dest_cluster.async_failover()
    failover_task.result()
    # Wait for Failover & rollback to complete
    self.sleep(60)
    # Fetch count of docs in src and dest cluster
    _count1 = rest1.fetch_bucket_stats(
        bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    _count2 = rest2.fetch_bucket_stats(
        bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    self.log.info(
        "After rollback src cluster count = {0} dest cluster count = {1}".
        format(_count1, _count2))
    # Replication must drain in both directions after the rollback
    self.assertTrue(
        self.src_cluster.wait_for_outbound_mutations(),
        "Mutations in source cluster not replicated to target after rollback"
    )
    self.assertTrue(
        self.dest_cluster.wait_for_outbound_mutations(),
        "Mutations in target cluster not replicated to source after rollback"
    )
    # goxdcr must have observed the DCP rollback on both clusters
    count = NodeHelper.check_goxdcr_log(
        src_nodes[0], "Received rollback from DCP stream", goxdcr_log)
    self.assertGreater(count, 0, "rollback did not happen as expected")
    self.log.info("rollback happened as expected")
    count = NodeHelper.check_goxdcr_log(
        dest_nodes[0], "Received rollback from DCP stream", goxdcr_log)
    self.assertGreater(count, 0, "rollback did not happen as expected")
    self.log.info("rollback happened as expected")
def xdcr_link_cluster(src_master, dest_master, dest_cluster_name):
    """Register *dest_master*'s cluster as an XDCR remote cluster (named
    *dest_cluster_name*) on the cluster owned by *src_master*."""
    src_rest = RestConnection(src_master)
    src_rest.add_remote_cluster(
        dest_master.ip,
        dest_master.port,
        dest_master.rest_username,
        dest_master.rest_password,
        dest_cluster_name)
class FailoverTests(FailoverBaseTest): def setUp(self): super(FailoverTests, self).setUp() self.server_map = self.get_server_map() def tearDown(self): super(FailoverTests, self).tearDown() def test_failover_firewall(self): self.common_test_body('firewall') def test_failover_normal(self): self.common_test_body('normal') def test_failover_stop_server(self): self.common_test_body('stop_server') def test_failover_then_add_back(self): self.add_back_flag = True self.common_test_body('normal') def test_failover_then_add_back_and_rebalance_in(self): self.add_back_flag = True self.common_test_body('normal', rebalance_type="in") def test_failover_then_add_back_and_rebalance_out(self): self.add_back_flag = True self.common_test_body('normal', rebalance_type="out") def test_failover_then_add_back_and_swap_rebalance(self): self.add_back_flag = True self.common_test_body('normal', rebalance_type="swap") def common_test_body(self, failover_reason, rebalance_type=None): """ Main Test body which contains the flow of the failover basic steps 1. Starts Operations if programmed into the test case(before/after) 2. Start View and Index Building operations 3. Failover K out of N nodes (failover can be HARD/GRACEFUL) 4.1 Rebalance the cluster is failover of K nodeStatuses 4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance 5. 
Verify all expected operations completed by checking stats, replicaiton, views, data correctness """ # Pick the reference node for communication # We pick a node in the cluster which will NOT be failed over self.filter_list = [] if self.failoverMaster: self.master = self.cluster.servers[1] else: self.master = self.cluster.master self.log.info( " Picking node {0} as reference node for test case".format( self.master.ip)) self.print_test_params(failover_reason) self.rest = RestConnection(self.master) self.nodes = self.rest.node_statuses() # Set the data path for the cluster self.data_path = self.rest.get_data_path() # Variable to decide the durability outcome durability_will_fail = False # Variable to track the number of nodes failed num_nodes_failed = 1 # Check if the test case has to be run for 3.0.0 versions = self.rest.get_nodes_versions() self.version_greater_than_2_5 = True for version in versions: if "3" > version: self.version_greater_than_2_5 = False # Do not run this this test if graceful category is being used if not self.version_greater_than_2_5 \ and (self.graceful or self.recoveryType is not None): self.log.error( "Can't apply graceful failover to nodes with version < 3.*") self.log.error("Please check configuration params: SKIPPING TEST") return # Find nodes that will under go failover if self.failoverMaster: self.chosen = RebalanceHelper.pick_nodes( self.master, howmany=1, target_node=self.servers[0]) else: self.chosen = RebalanceHelper.pick_nodes( self.master, howmany=self.num_failed_nodes) # Perform operations - Create/Update/Delete # self.withMutationOps = True => Run Operations in parallel to failover # self.withMutationOps = False => Run Operations Before failover self.load_initial_data() if not self.withMutationOps: self.run_mutation_operations() # Perform View Creation Tasks and # check for completion if required before failover if self.withViewsOps: self.run_view_creation_operations(self.servers) if not self.createIndexesDuringFailover: 
self.query_and_monitor_view_tasks(self.servers) # Take snap-shot of data set used for validation record_static_data_set = {} prev_vbucket_stats = {} prev_failover_stats = {} if not self.withMutationOps: record_static_data_set = self.bucket_util.get_data_set_all( self.cluster.servers, self.buckets, path=None) # Capture vbucket and failover stats if test version >= 2.5.* if self.version_greater_than_2_5 and self.upr_check: prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos( self.servers, self.buckets) prev_failover_stats = self.bucket_util.get_failovers_logs( self.servers, self.buckets) # Perform Operations related to failover if self.withMutationOps or self.withViewsOps or self.compact: self.run_failover_operations_with_ops(self.chosen, failover_reason) else: self.run_failover_operations(self.chosen, failover_reason) target_bucket = self.bucket_util.buckets[0] # Update new_replica value, if provided in the conf if self.new_replica: self.num_replicas = self.new_replica bucket_helper = BucketHelper(self.master) bucket_helper.change_bucket_props(target_bucket.name, replicaNumber=self.num_replicas) # Decide whether the durability is going to fail or not if self.num_failed_nodes >= 1 and self.num_replicas > 1: durability_will_fail = True # Construct target vbucket list from the nodes # which are going to be failed over vbucket_list = list() for target_node in self.chosen: for server in self.servers: if server.ip == target_node.ip: # Comment out the break once vbucket_list method is fixed break shell_conn = RemoteMachineShellConnection(server) cb_stats = Cbstats(shell_conn) vbuckets = cb_stats.vbucket_list(target_bucket.name, self.target_vbucket_type) shell_conn.disconnect() vbucket_list += vbuckets # Code to generate doc_loaders that will work on vbucket_type # based on targeted nodes. 
This will perform CRUD only on # vbuckets which will be affected by the failover self.gen_create = doc_generator(self.key, self.num_items, self.num_items * 1.5, target_vbucket=vbucket_list) self.gen_update = doc_generator(self.key, self.num_items / 2, self.num_items, target_vbucket=vbucket_list) self.gen_delete = doc_generator(self.key, self.num_items / 4, self.num_items / 2 - 1, target_vbucket=vbucket_list) self.afterfailover_gen_create = doc_generator( self.key, self.num_items * 1.6, self.num_items * 2, target_vbucket=vbucket_list) self.afterfailover_gen_update = doc_generator( self.key, 1, self.num_items / 4, target_vbucket=vbucket_list) self.afterfailover_gen_delete = doc_generator( self.key, self.num_items * 0.5, self.num_items * 0.75, target_vbucket=vbucket_list) # Perform Add Back Operation with Rebalance # or only Rebalance with verifications if not self.gracefulFailoverFail and self.runRebalanceAfterFailover: if self.failover_onebyone: # Reset it back to False durability_will_fail = False for node_chosen in self.chosen: if num_nodes_failed > 1: durability_will_fail = True if self.add_back_flag: # In add-back case, durability should never fail, since # the num_nodes in the cluster will remain the same self.run_add_back_operation_and_verify( [node_chosen], prev_vbucket_stats, record_static_data_set, prev_failover_stats, rebalance_type=rebalance_type) else: self.run_rebalance_after_failover_and_verify( [node_chosen], prev_vbucket_stats, record_static_data_set, prev_failover_stats, durability_will_fail=durability_will_fail) num_nodes_failed += 1 else: if self.add_back_flag: self.run_add_back_operation_and_verify( self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats, durability_will_fail=durability_will_fail, rebalance_type=rebalance_type) else: self.run_rebalance_after_failover_and_verify( self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats, durability_will_fail=durability_will_fail) else: return # Will 
verify_unacked_bytes only if the durability is not going to fail if self.during_ops is None and not durability_will_fail: self.bucket_util.verify_unacked_bytes_all_buckets( filter_list=self.filter_list) def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats, durability_will_fail=False): """ Method to run rebalance after failover and verify """ # Need a delay > min because MB-7168 _servers_ = self.filter_servers(self.servers, chosen) self.bucket_util._wait_for_stats_all_buckets( check_ep_items_remaining=True) self.sleep(5, "after failover before invoking rebalance...") # Rebalance after Failover operation self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen]) # Perform Compaction if self.compact: for bucket in self.buckets: self.cluster.compact_bucket(self.master, bucket) # Perform View Validation if Supported nodes = self.filter_servers(self.servers, chosen) if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run operations if required during rebalance after failover if self.withMutationOps: self.run_mutation_operations_after_failover() # Kill or restart operations if self.killNodes or self.stopNodes or self.firewallOnNodes: self.victim_node_operations(node=chosen[0]) self.sleep(60) self.log.info(" Start Rebalance Again !") self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen]) # Rebalance Monitoring msg = "rebalance failed while removing failover nodes {0}" \ .format([node.id for node in chosen]) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) # Drain Queue and make sure intra-cluster replication is complete self.log.info("Begin VERIFICATION for Rebalance after Failover Only") self.bucket_util.verify_cluster_stats(self.num_items, self.master, check_bucket_stats=True, check_ep_items_remaining=True) # Verify all data set with meta data if failover happens after 
failover if not self.withMutationOps: self.sleep(60) self.bucket_util.data_analysis_all(record_static_data_set, _servers_, self.buckets, path=None, addedItems=None) # Check Cluster Stats and Data as well if max_verify > 0 # Check Failover logs :: Not sure about this logic, # currently not checking, will update code once confirmed # Currently, only for checking case where we have graceful failover if self.version_greater_than_2_5 and self.graceful and self.upr_check: new_failover_stats = self.bucket_util.compare_failovers_logs( prev_failover_stats, _servers_, self.buckets) new_vbucket_stats = self.bucket_util.compare_vbucket_seqnos( prev_vbucket_stats, _servers_, self.buckets) self.bucket_util.compare_vbucketseq_failoverlogs( new_vbucket_stats, new_failover_stats) # Verify Active and Replica Bucket Count if self.num_replicas > 0: nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master) self.bucket_util.vb_distribution_analysis( servers=nodes, buckets=self.buckets, num_replicas=self.num_replicas, total_vbuckets=self.total_vbuckets, std=20.0) self.log.info("End VERIFICATION for Rebalance after Failover Only") def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats, durability_will_fail=False, rebalance_type=None): """ Method to run add-back operation with recovery type = (delta/full). 
It also verifies if the operations are correct with data verificaiton steps """ _servers_ = self.filter_servers(self.servers, chosen) self.bucket_util._wait_for_stats_all_buckets( check_ep_items_remaining=True) recoveryTypeMap = self.define_maps_during_failover(self.recoveryType) fileMapsForVerification = self.create_file(chosen, self.buckets, self.server_map) index = 0 for node in chosen: self.rest.add_back_node(node.id) self.sleep(5) if self.recoveryType: # define precondition for recovery type self.rest.set_recovery_type( otpNode=node.id, recoveryType=self.recoveryType[index]) index += 1 self.sleep(20, "After failover before invoking rebalance...") self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[], deltaRecoveryBuckets=self.deltaRecoveryBuckets) # Perform Compaction if self.compact: for bucket in self.buckets: self.cluster.compact_bucket(self.master, bucket) # Perform View Validation if Supported nodes = self.filter_servers(self.servers, chosen) if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run operations if required during rebalance after failover if self.withMutationOps: self.run_mutation_operations_after_failover() # Kill or restart operations if self.killNodes or self.stopNodes or self.firewallOnNodes: self.victim_node_operations(node=chosen[0]) self.sleep(60) self.log.info(" Start Rebalance Again !") self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[], deltaRecoveryBuckets=self.deltaRecoveryBuckets) if rebalance_type == "in": rebalance = self.task.rebalance(self.servers[:self.nodes_init], [self.servers[self.nodes_init]], []) if rebalance_type == "out": rebalance = self.task.rebalance( self.servers[:self.nodes_init], [], [self.servers[self.nodes_init - 1]]) if rebalance_type == "swap": self.rest.add_node(self.master.rest_username, self.master.rest_password, self.servers[self.nodes_init].ip, self.servers[self.nodes_init].port, services=["kv"]) rebalance = self.task.rebalance( 
self.servers[:self.nodes_init], [], [self.servers[self.nodes_init - 1]]) # Check if node has to be killed or restarted during rebalance # Monitor Rebalance msg = "rebalance failed while removing failover nodes {0}" \ .format(chosen) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) # Drain ep_queue & make sure that intra-cluster replication is complete self.bucket_util._wait_for_stats_all_buckets( check_ep_items_remaining=True) self.log.info("Begin VERIFICATION for Add-back and rebalance") # Verify Stats of cluster and Data is max_verify > 0 self.bucket_util.verify_cluster_stats(self.servers, self.master, check_bucket_stats=True, check_ep_items_remaining=True) # Verify recovery Type succeeded if we added-back nodes self.verify_for_recovery_type(chosen, self.server_map, self.buckets, recoveryTypeMap, fileMapsForVerification, self.deltaRecoveryBuckets) # Comparison of all data if required if not self.withMutationOps: self.sleep(60) self.bucket_util.data_analysis_all(record_static_data_set, self.servers, self.buckets, path=None, addedItems=None) # Verify if vbucket sequence numbers and failover logs are as expected # We will check only for version > 2.5.* & if the failover is graceful if self.version_greater_than_2_5 and self.graceful and self.upr_check: new_vbucket_stats = self.bucket_util.compare_vbucket_seqnos( prev_vbucket_stats, self.servers, self.buckets, perNode=False) new_failover_stats = self.bucket_util.compare_failovers_logs( prev_failover_stats, self.servers, self.buckets) self.bucket_util.compare_vbucketseq_failoverlogs( new_vbucket_stats, new_failover_stats) # Verify Active and Replica Bucket Count if self.num_replicas > 0: nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master) self.bucket_util.vb_distribution_analysis( servers=nodes, buckets=self.buckets, num_replicas=self.num_replicas, total_vbuckets=self.total_vbuckets, std=20.0) self.log.info("End VERIFICATION for Add-back and rebalance") def print_test_params(self, 
failover_reason): """ Method to print test parameters """ self.log.info("num_replicas : {0}".format(self.num_replicas)) self.log.info("recoveryType : {0}".format(self.recoveryType)) self.log.info("failover_reason : {0}".format(failover_reason)) self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes)) self.log.info('picking server : {0} as the master'.format(self.master)) def run_failover_operations(self, chosen, failover_reason): """ Method to run fail over operations used in the test scenario based on failover reason """ # Perform Operations related to failover graceful_count = 0 graceful_failover = True failed_over = True for node in chosen: unreachable = False if failover_reason == 'stop_server': unreachable = True self.cluster_util.stop_server(node) self.log.info("10 secs delay for membase-server to shutdown") # wait for 5 minutes until node is down self.assertTrue( RestHelper(self.rest).wait_for_node_status( node, "unhealthy", self.wait_timeout * 10), msg="node status is healthy even after waiting for 5 mins") elif failover_reason == "firewall": unreachable = True self.filter_list.append(node.ip) server = [srv for srv in self.servers if node.ip == srv.ip][0] RemoteUtilHelper.enable_firewall( server, bidirectional=self.bidirectional) status = RestHelper(self.rest).wait_for_node_status( node, "unhealthy", self.wait_timeout * 10) if status: self.log.info( "node {0}:{1} is 'unhealthy' as expected".format( node.ip, node.port)) else: # verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) info = shell.extract_remote_info() if info.type.lower() == "windows": o, r = shell.execute_command( "netsh advfirewall show allprofiles") shell.log_command_output(o, r) else: o, r = shell.execute_command( "/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() self.rest.print_UI_logs() api = self.rest.baseUrl + 'nodeStatuses' status, content, header = 
self.rest._http_request(api) json_parsed = json.loads(content) self.log.info("nodeStatuses: {0}".format(json_parsed)) self.fail( "node status is not unhealthy even after waiting for 5 minutes" ) # verify the failover type if self.check_verify_failover_type: graceful_count, graceful_failover = self.verify_failover_type( node, graceful_count, self.num_replicas, unreachable) # define precondition check for failover success_failed_over = self.rest.fail_over( node.id, graceful=(self.graceful and graceful_failover)) if self.graceful and graceful_failover: if self.stopGracefulFailover or self.killNodes \ or self.stopNodes or self.firewallOnNodes: self.victim_node_operations(node) # Start Graceful Again self.log.info(" Start Graceful Failover Again !") self.sleep(60) success_failed_over = self.rest.fail_over( node.id, graceful=(self.graceful and graceful_failover)) msg = "graceful failover failed for nodes {0}" \ .format(node.id) self.assertTrue( self.rest.monitorRebalance(stop_if_loop=True), msg=msg) else: msg = "rebalance failed while removing failover nodes {0}"\ .format(node.id) self.assertTrue( self.rest.monitorRebalance(stop_if_loop=True), msg=msg) failed_over = failed_over and success_failed_over # Check for negative cases if self.graceful and (failover_reason in ['stop_server', 'firewall']): if failed_over: # MB-10479 self.rest.print_UI_logs() self.assertFalse( failed_over, "Graceful Failover was started for unhealthy node!!!") return elif self.gracefulFailoverFail and not failed_over: """ Check if the fail_over fails as expected """ self.assertFalse( failed_over, "Graceful failover should fail due to not enough replicas") return # Check if failover happened as expected or re-try one more time if not failed_over: self.log.info( "unable to failover the node the first time. try again in 60 seconds.." 
                )  # try again in 75 seconds
                self.sleep(75)
                failed_over = self.rest.fail_over(
                    node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and (failover_reason not in ['stop_server',
                                                          'firewall']):
                # Graceful failover triggers an internal rebalance; wait for it
                reached = RestHelper(self.rest).rebalance_reached()
                self.assertTrue(
                    reached,
                    "rebalance failed for Graceful Failover, stuck or did not completed")
            # Verify Active and Replica Bucket Count
            if self.num_replicas > 0:
                nodes = self.filter_servers(self.servers, chosen)
                self.bucket_util.vb_distribution_analysis(
                    servers=nodes, buckets=self.buckets, std=20.0,
                    num_replicas=self.num_replicas,
                    total_vbuckets=self.total_vbuckets, type="failover",
                    graceful=(self.graceful and graceful_failover))

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario
        based on failover reason.

        For each chosen node: make the node unreachable if the failover
        reason requires it (stop server / firewall), fail it over, then run
        the configured parallel operations (compaction / views / mutations)
        and wait for the resulting rebalance to finish.
        """
        # Perform Operations relalted to failover
        # NOTE(review): `failed_over` is never updated or checked below
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.cluster_util.stop_server(node)
                self.log.info(
                    "10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(
                    RestHelper(self.rest).wait_for_node_status(
                        node, "unhealthy", 300),
                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(
                    server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", 300)
                if status:
                    self.log.info(
                        "node {0}:{1} is 'unhealthy' as expected".format(
                            node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command(
                                    "netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command(
                                    "/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail(
                        "node status is not unhealthy even after waiting for 5 minutes")
            nodes = self.filter_servers(self.servers, chosen)
            # NOTE(review): the failover result is captured but never asserted
            success_failed_over = self.rest.fail_over(
                node.id, graceful=(self.graceful))
            # failed_over = self.task.async_failover([self.master], failover_nodes=chosen, graceful=self.graceful)
            # Perform Compaction
            compact_tasks = []
            if self.compact:
                for bucket in self.buckets:
                    compact_tasks.append(
                        self.task.async_compact_bucket(self.master, bucket))
            # Run View Operations
            if self.withViewsOps:
                self.query_and_monitor_view_tasks(nodes)
            # Run mutation operations
            if self.withMutationOps:
                self.run_mutation_operations()
            # failed_over.result()
            msg = "rebalance failed while removing failover nodes {0}" \
                .format(node.id)
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                            msg=msg)
            for task in compact_tasks:
                task.result()
        # msg = "rebalance failed while removing failover nodes {0}".format(node.id)
        # self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

    def load_all_buckets(self, gen, op):
        """Run doc-loader `op` with generator `gen` on every bucket and
        block until every load task completes."""
        tasks = []
        for bucket in self.bucket_util.buckets:
            tasks.append(
                self.task.async_load_gen_docs(
                    self.cluster, bucket, gen, op, 0,
                    batch_size=20,
                    process_concurrency=1,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    #durability_timeout=self.durability_timeout
                ))
        for task in tasks:
            self.task.jython_task_manager.get_task_result(task)

    def load_initial_data(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        self.load_all_buckets(self.gen_initial_create, "create")
        self.bucket_util._wait_for_stats_all_buckets(
            check_ep_items_remaining=True)
        # self.bucket_util._verify_stats_all_buckets(self.servers, timeout=120)

    def run_mutation_operations(self):
        """Run the create/update/delete loads selected by self.doc_ops."""
        if ("create" in self.doc_ops):
            self.load_all_buckets(self.gen_create, "create")
        if ("update" in self.doc_ops):
            self.load_all_buckets(self.gen_update, "update")
        if ("delete" in self.doc_ops):
            self.load_all_buckets(self.gen_delete, "delete")

    def run_mutation_operations_after_failover(self):
        """Same as run_mutation_operations but with the post-failover
        document generators."""
        if ("create" in self.doc_ops):
            self.load_all_buckets(self.afterfailover_gen_create, "create")
        if ("update" in self.doc_ops):
            self.load_all_buckets(self.afterfailover_gen_update, "update")
        if ("delete" in self.doc_ops):
            self.load_all_buckets(self.afterfailover_gen_delete, "delete")

    def define_maps_during_failover(self, recoveryType=[]):
        """Build a node-ip -> recovery-type map for self.chosen.

        NOTE(review): `recoveryType=[]` is a mutable default argument; it is
        only read here, so it is harmless, but it should be `None`-guarded.
        """
        recoveryTypeMap = {}
        index = 0
        for server in self.chosen:
            if recoveryType:
                recoveryTypeMap[server.ip] = recoveryType[index]
            index += 1
        return recoveryTypeMap

    def filter_servers(self, original_servers, filter_servers):
        """ Filter servers that have not failed over.

        Returns a deep copy of `original_servers` with the failed-over
        entries removed.  Side effect: each removed server is appended to
        self._cleanup_nodes.
        """
        _servers_ = copy.deepcopy(original_servers)
        for failed in filter_servers:
            for server in _servers_:
                if server.ip == failed.ip:
                    _servers_.remove(server)
                    self._cleanup_nodes.append(server)
        return _servers_

    def verify_for_recovery_type(self, chosen=[], serverMap={}, buckets=[],
                                 recoveryTypeMap={}, fileMap={},
                                 deltaRecoveryBuckets=[]):
        """ Verify recovery type is delta or full.

        A marker file planted by create_file() survives delta recovery but
        not full recovery; its presence/absence is checked per node/bucket.
        NOTE(review): mutable default arguments; all are read-only here.
        NOTE(review): returns early (skipping the assertion) on Windows.
        """
        summary = ""
        logic = True
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            os_type = shell.extract_remote_info()
            if os_type.type.lower() == 'windows':
                return
            for bucket in buckets:
                path = fileMap[server.ip][bucket.name]
                exists = shell.file_exists(path, "check.txt")
                if deltaRecoveryBuckets != None:
                    if recoveryTypeMap[server.ip] == "delta" and (
                            bucket.name in deltaRecoveryBuckets) and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(
                            server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "delta" and (
                            bucket.name not in deltaRecoveryBuckets) and exists:
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(
                            server.ip, bucket.name)
                        logic = False
                else:
                    if recoveryTypeMap[server.ip] == "delta" and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(
                            server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "full" and exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(
                            server.ip, bucket.name)
            shell.disconnect()
        self.assertTrue(logic, summary)

    def run_view_creation_operations(self, servers):
        """ Run view Creation and indexing building tasks on servers """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        num_tries = self.input.param("num_tries", 10)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = {}
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"
        views = []
        tasks = []
        for bucket in self.buckets:
            temp = self.bucket_util.make_default_views(self.default_view,
                                                       self.default_view_name,
                                                       num_views,
                                                       is_dev_ddoc,
                                                       different_map=False)
            temp_tasks = self.bucket_util.async_create_views(
                self.master, ddoc_name, temp, bucket)
            views += temp
            tasks += temp_tasks
        # NOTE(review): `timeout` is computed but task.result() below uses
        # a fixed wait_timeout * 20 instead
        timeout = max(
            self.wait_timeout * 4,
            len(self.buckets) * self.wait_timeout * self.num_items / 50000)
        for task in tasks:
            task.result(self.wait_timeout * 20)

    def query_and_monitor_view_tasks(self, servers):
        """ Monitor Query Tasks for their completion """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        self.verify_query_task()
        active_tasks = self.cluster_util.async_monitor_active_task(
            servers, "indexer", "_design/" + prefix + ddoc_name,
            wait_task=False)
        for active_task in active_tasks:
            result = self.task.jython_task_manager.get_task_result(active_task)
            self.assertTrue(result)

    def verify_query_task(self):
        """ Verify Query Results """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = dict()
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"
        expected_rows = None
        timeout = None
        if self.active_resident_threshold == 0:
            timeout = 2400
        if self.max_verify:
            expected_rows = self.max_verify
            query["limit"] = expected_rows
        query["stale"] = "false"
        for bucket in self.buckets:
            self.bucket_util.perform_verify_queries(
                num_views, prefix, ddoc_name, self.default_view_name, query,
                bucket=bucket, wait_time=timeout,
                expected_rows=expected_rows)

    def create_file(self, chosen, buckets, serverMap):
        """ Created files in data paths for checking if delta/full recovery
        occured.

        Returns a map {node ip: {bucket name: data dir path}} used later by
        verify_for_recovery_type().
        NOTE(review): `type` and `map` shadow builtins.
        """
        fileMap = {}
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            type = shell.extract_remote_info().distribution_type
            map = dict()
            for bucket in buckets:
                if type.lower() == 'windows':
                    self.data_path = 'c:/Program\ Files/Couchbase/Server/var/lib/couchbase/data'
                bucket_data_path = self.data_path + "/" + bucket.name + "/" + "check.txt"
                full_path = self.data_path + "/" + bucket.name + "/"
                map[bucket.name] = full_path
                shell.create_file(bucket_data_path, "check")
            fileMap[server.ip] = map
            shell.disconnect()
        return fileMap

    def verify_failover_type(self, chosen=None, graceful_count=0,
                             replica_count=0, unreachable=False):
        """Check each node's advertised failover type (graceful vs hard)
        against what the replica count / reachability should allow.

        Returns (updated graceful_count, gracefulFailoverPossible for the
        chosen node).
        """
        logic = True
        summary = ""
        nodes = self.rest.node_statuses()
        node_count = len(nodes)
        change_graceful_count = graceful_count
        graceful_failover = True
        if unreachable:
            node_count -= 1
        else:
            change_graceful_count += 1
        if replica_count != 0:
            for node in nodes:
                if unreachable and node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for unreachable node {0} Expected :: Hard, Actual :: Graceful".format(
                            node.ip)
                elif node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    # graceful is only possible while enough nodes remain
                    # to host all replicas
                    if replica_count > graceful_count and (
                            node_count - 1) + graceful_count >= replica_count:
                        if not node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for node {0} Expected :: Graceful, Actual :: Hard".format(
                                node.ip)
                    else:
                        if node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for {0} Expected :: Hard, Actual :: Graceful".format(
                                node.ip)
        else:
            # no replicas: graceful failover is never possible
            for node in nodes:
                if node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for node {0} Expected :: Hard, Actual :: Graceful".format(
                            node.ip)
        self.assertTrue(logic, summary)
        return change_graceful_count, graceful_failover

    def victim_node_operations(self, node=None):
        """Apply the configured disruption(s) to the victim node(s):
        stop graceful failover, kill memcached, restart the server, or
        toggle the firewall."""
        if self.stopGracefulFailover:
            self.log.info(" Stopping Graceful Failover ")
            stopped = self.rest.stop_rebalance(
                wait_timeout=self.wait_timeout / 3)
            self.assertTrue(stopped, msg="unable to stop rebalance")
        if self.killNodes:
            self.log.info(" Killing Memcached ")
            kill_nodes = self.cluster_util.get_victim_nodes(
                self.servers, self.master, node, self.victim_type,
                self.victim_count)
            for kill_node in kill_nodes:
                self.cluster_util.kill_server_memcached(kill_node)
        if self.stopNodes:
            self.log.info(" Stopping Node")
            stop_nodes = self.cluster_util.get_victim_nodes(
                self.servers, self.master, node, self.victim_type,
                self.victim_count)
            for stop_node in stop_nodes:
                self.cluster_util.stop_server(stop_node)
            self.sleep(10)
            for start_node in stop_nodes:
                self.cluster_util.start_server(start_node)
        if self.firewallOnNodes:
            self.log.info(" Enabling Firewall for Node ")
            stop_nodes = self.cluster_util.get_victim_nodes(
                self.servers, self.master, node, self.victim_type,
                self.victim_count)
            for stop_node in stop_nodes:
                self.cluster_util.start_firewall_on_node(stop_node)
            self.sleep(30)
            self.log.info(" Disable Firewall for Node ")
            for start_node in stop_nodes:
                self.cluster_util.stop_firewall_on_node(start_node)
        # assumed to sit at method level (after all victim ops) -- the
        # collapsed source is ambiguous here; TODO confirm
        self.sleep(60)

    def get_server_map(self):
        """ Map of ips and server information """
        server_map = dict()
        for server in self.cluster.servers:
            server_map[server.ip] = server
        return server_map
def create_rest(server_ip=cfg.COUCHBASE_IP, port=cfg.COUCHBASE_PORT,
                username=cfg.COUCHBASE_USER, password=cfg.COUCHBASE_PWD):
    """Build a RestConnection for the given node.

    All arguments default to the values configured in ``cfg``.
    """
    server = create_server_obj(server_ip, port, username, password)
    return RestConnection(server)
def cleanup_cluster(servers, wait_for_rebalance=True, master=None):
    """Remove every node except ``master`` from its cluster.

    Steps:
      1. Eject auxiliary nodes listening on ports 9091-9990.
      2. Rebalance the remaining extra nodes out of the cluster.
      3. Verify each removed node actually left (its /pools list becomes
         empty) and force-eject any straggler.

    :param servers: server objects that may belong to the cluster
    :param wait_for_rebalance: block until the removal rebalance completes
    :param master: node used for REST calls; defaults to servers[0]
    :raises Exception: when some ejected nodes could not be cleaned
                       (suppressed when the alt_addr test param is set)
    """
    log = logger.Logger.get_logger()
    if master is None:
        master = servers[0]
    rest = RestConnection(master)
    helper = RestHelper(rest)
    helper.is_ns_server_running(
        timeout_in_seconds=testconstants.NS_SERVER_TIMEOUT)
    nodes = rest.node_statuses()
    master_id = rest.get_nodes_self().id
    # BUGFIX: iterate over a copy -- removing from `nodes` while iterating
    # it skips the element that follows each removed entry.
    for node in list(nodes):
        if int(node.port) in xrange(9091, 9991):
            rest.eject_node(node)
            nodes.remove(node)
    if len(nodes) > 1:
        log.info("rebalancing all nodes in order to remove nodes")
        rest.log_client_error("Starting rebalance from test, ejected nodes %s" % \
            [node.id for node in nodes if node.id != master_id])
        helper.remove_nodes(
            knownNodes=[node.id for node in nodes],
            ejectedNodes=[node.id for node in nodes if node.id != master_id],
            wait_for_rebalance=wait_for_rebalance)
    success_cleaned = []
    alt_addr = TestInputSingleton.input.param("alt_addr", False)
    ejected_nodes = [node for node in nodes if node.id != master_id]
    for removed in ejected_nodes:
        removed.rest_password = servers[0].rest_password
        removed.rest_username = servers[0].rest_username
        try:
            rest = _rest_for_ejected_node(servers, removed, alt_addr)
        except Exception as ex:
            log.error("can't create rest connection after rebalance out for "
                      "ejected nodes, will retry after 10 seconds according "
                      "to MB-8430: {0} ".format(ex))
            time.sleep(10)
            rest = RestConnection(removed)
        # Poll up to 30s for the node to report an empty pools list,
        # i.e. it no longer belongs to any cluster.
        start = time.time()
        while time.time() - start < 30:
            if len(rest.get_pools_info()["pools"]) == 0:
                success_cleaned.append(removed)
                break
            else:
                time.sleep(0.1)
        if time.time() - start > 10:
            log.error("'pools' on node {0}:{1} - {2}".format(
                removed.ip, removed.port, rest.get_pools_info()["pools"]))
    not_cleaned = set(ejected_nodes) - set(success_cleaned)
    for node in not_cleaned:
        # BUGFIX: log/eject the node that actually failed; the original
        # referenced the stale `removed` loop variable here.
        log.error("node {0}:{1} was not cleaned after removing from cluster".
                  format(node.ip, node.port))
        try:
            rest = _rest_for_ejected_node(servers, node, alt_addr)
            if not alt_addr:
                # force eject is skipped in alternate-address setups,
                # matching the original behaviour
                rest.force_eject_node()
        except Exception as ex:
            log.error("force_eject_node {0}:{1} failed: {2}".format(
                node.ip, node.port, ex))
    if len(not_cleaned) != 0 and not alt_addr:
        raise Exception("not all ejected nodes were cleaned successfully")
    log.info("removed all the nodes from cluster associated with {0} ? {1}".format(servers[0], \
        [(node.id, node.port) for node in nodes if (node.id != master_id)]))


def _rest_for_ejected_node(servers, node, alt_addr):
    """Return a RestConnection for an ejected node.

    With alt_addr, map the node's ip back to the server whose internal
    ip matches, falling back to a direct connection when no server matches.
    """
    if alt_addr:
        for server in servers:
            shell = RemoteMachineShellConnection(server)
            internal_ips = shell.get_ip_address()
            internal_ips = [ip for ip in internal_ips if ip != "127.0.0.1"]
            shell.disconnect()
            # BUGFIX: get_ip_address() returns a list; the original compared
            # the whole list to the ip string, which never matched.
            if node.ip in internal_ips:
                return RestConnection(server)
    return RestConnection(node)
    def common_test_body(self, failover_reason, rebalance_type=None):
        """
        Main Test body which contains the flow of the failover basic steps
        1. Starts Operations if programmed into the test case(before/after)
        2. Start View and Index Building operations
        3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
        4.1 Rebalance the cluster is failover of K nodeStatuses
        4.2 Run Add-Back operation with recoveryType = (full/delta)
            with rebalance
        5. Verify all expected operations completed by checking
           stats, replicaiton, views, data correctness

        :param failover_reason: one of 'stop_server'/'firewall'/other,
            controls how nodes are made unreachable before failover
        :param rebalance_type: forwarded to the add-back verification
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.cluster.servers[1]
        else:
            self.master = self.cluster.master
        self.log.info(
            " Picking node {0} as reference node for test case".format(
                self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()
        # Variable to decide the durability outcome
        durability_will_fail = False
        # Variable to track the number of nodes failed
        num_nodes_failed = 1
        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False
        # Do not run this this test if graceful category is being used
        if not self.version_greater_than_2_5 \
                and (self.graceful or self.recoveryType is not None):
            self.log.error(
                "Can't apply graceful failover to nodes with version < 3.*")
            self.log.error("Please check configuration params: SKIPPING TEST")
            return
        # Find nodes that will under go failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=1, target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=self.num_failed_nodes)
        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform View Creation Tasks and
        # check for completion if required before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)
        # Take snap-shot of data set used for validation
        record_static_data_set = {}
        prev_vbucket_stats = {}
        prev_failover_stats = {}
        if not self.withMutationOps:
            record_static_data_set = self.bucket_util.get_data_set_all(
                self.cluster.servers, self.buckets, path=None)
        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
                self.servers, self.buckets)
            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.servers, self.buckets)
        # Perform Operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)
        target_bucket = self.bucket_util.buckets[0]
        # Update new_replica value, if provided in the conf
        if self.new_replica:
            self.num_replicas = self.new_replica
            bucket_helper = BucketHelper(self.master)
            bucket_helper.change_bucket_props(
                target_bucket.name, replicaNumber=self.num_replicas)
        # Decide whether the durability is going to fail or not
        if self.num_failed_nodes >= 1 and self.num_replicas > 1:
            durability_will_fail = True
        # Construct target vbucket list from the nodes
        # which are going to be failed over
        vbucket_list = list()
        for target_node in self.chosen:
            for server in self.servers:
                if server.ip == target_node.ip:
                    # Comment out the break once vbucket_list method is fixed
                    # (the statements below are deliberately unreachable
                    # until then, leaving vbucket_list empty)
                    break
                    shell_conn = RemoteMachineShellConnection(server)
                    cb_stats = Cbstats(shell_conn)
                    vbuckets = cb_stats.vbucket_list(
                        target_bucket.name, self.target_vbucket_type)
                    shell_conn.disconnect()
                    vbucket_list += vbuckets
        # Code to generate doc_loaders that will work on vbucket_type
        # based on targeted nodes. This will perform CRUD only on
        # vbuckets which will be affected by the failover
        self.gen_create = doc_generator(self.key, self.num_items,
                                        self.num_items * 1.5,
                                        target_vbucket=vbucket_list)
        self.gen_update = doc_generator(self.key, self.num_items / 2,
                                        self.num_items,
                                        target_vbucket=vbucket_list)
        self.gen_delete = doc_generator(self.key, self.num_items / 4,
                                        self.num_items / 2 - 1,
                                        target_vbucket=vbucket_list)
        self.afterfailover_gen_create = doc_generator(
            self.key, self.num_items * 1.6, self.num_items * 2,
            target_vbucket=vbucket_list)
        self.afterfailover_gen_update = doc_generator(
            self.key, 1, self.num_items / 4, target_vbucket=vbucket_list)
        self.afterfailover_gen_delete = doc_generator(
            self.key, self.num_items * 0.5, self.num_items * 0.75,
            target_vbucket=vbucket_list)
        # Perform Add Back Operation with Rebalance
        # or only Rebalance with verifications
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.failover_onebyone:
                # Reset it back to False
                durability_will_fail = False
                for node_chosen in self.chosen:
                    if num_nodes_failed > 1:
                        durability_will_fail = True
                    if self.add_back_flag:
                        # In add-back case, durability should never fail, since
                        # the num_nodes in the cluster will remain the same
                        self.run_add_back_operation_and_verify(
                            [node_chosen], prev_vbucket_stats,
                            record_static_data_set, prev_failover_stats,
                            rebalance_type=rebalance_type)
                    else:
                        self.run_rebalance_after_failover_and_verify(
                            [node_chosen], prev_vbucket_stats,
                            record_static_data_set, prev_failover_stats,
                            durability_will_fail=durability_will_fail)
                    num_nodes_failed += 1
            else:
                if self.add_back_flag:
                    self.run_add_back_operation_and_verify(
                        self.chosen, prev_vbucket_stats,
                        record_static_data_set, prev_failover_stats,
                        durability_will_fail=durability_will_fail,
                        rebalance_type=rebalance_type)
                else:
                    self.run_rebalance_after_failover_and_verify(
                        self.chosen, prev_vbucket_stats,
                        record_static_data_set, prev_failover_stats,
                        durability_will_fail=durability_will_fail)
        else:
            return
        # Will verify_unacked_bytes only if the durability is not going to fail
        if self.during_ops is None and not durability_will_fail:
            self.bucket_util.verify_unacked_bytes_all_buckets(
                filter_list=self.filter_list)
    def common_test_body(self, failover_reason):
        """
        Main Test body which contains the flow of the failover basic steps
        1. Starts Operations if programmed into the test case (before/after)
        2. Start View and Index Building operations
        3. Failover K out of N nodes (failover can be HARDFAILOVER/GRACEFUL)
        4.1 Rebalance the cluster is failover of K nodeStatuses
        4.2 Run Add-Back operation with recoveryType = (full/delta)
            with rebalance
        5. Verify all expected operations completed by checking
           stats, replicaiton, views, data correctness

        :param failover_reason: how nodes are made unreachable before
            failover ('stop_server', 'firewall', ...)
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.cluster.servers[1]
        else:
            self.master = self.cluster.master
        self.log.info(
            " Picking node {0} as reference node for test case".format(
                self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()
        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False
        # Do not run this this test if graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or
                                                  (self.recoveryType != None)):
            self.log.error(
                "Graceful failover can't be applied to nodes with version less then 3.*"
            )
            self.log.error(
                "Please check configuration parameters: SKIPPING TEST.")
            return
        # Find nodes that will under go failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=1, target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=self.num_failed_nodes)
        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform View Creation Tasks and check for completion if required
        # before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)
        # Take snap-shot of data set used for validaiton
        record_static_data_set = {}
        prev_vbucket_stats = {}
        prev_failover_stats = {}
        if not self.withMutationOps:
            record_static_data_set = self.bucket_util.get_data_set_all(
                self.cluster.servers, self.buckets, path=None)
        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
                self.servers, self.buckets)
            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.servers, self.buckets)
        # Perform Operations relalted to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)
        # Perform Add Back Operation with Rebalance Or only Rebalance
        # with Verificaitons
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen,
                                                       prev_vbucket_stats,
                                                       record_static_data_set,
                                                       prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(
                    self.chosen, prev_vbucket_stats, record_static_data_set,
                    prev_failover_stats)
        else:
            return
        if self.during_ops == None:
            self.bucket_util.verify_unacked_bytes_all_buckets(
                filter_list=self.filter_list, master_node=self.master)
def set_vbuckets(master, vbuckets):
    """Set the cluster-wide default vbucket count via ns_config.

    Executes an ``rpc:eval_everywhere`` diag/eval snippet so the new
    default is applied on every node; returns the REST call's
    (status, content) pair.
    """
    eval_stmt = ("rpc:eval_everywhere(ns_config, set, "
                 "[couchbase_num_vbuckets_default, {0}]).").format(vbuckets)
    return RestConnection(master).diag_eval(eval_stmt)
    def test_xdcr(self):
        """End-to-end XDCR smoke test driven by cbworkloadgen.

        Builds a source and a destination cluster, sets up (optionally
        bidirectional) replication, loads items through cbworkloadgen over
        ssh, waits for replication to catch up and asserts item counts on
        both sides.  Only implemented for centos/ubuntu.
        """
        _rep_type = self.input.param("replication_type", "capi")  # capi or xmem
        # NOTE(review): the default is the *string* "false", which is truthy;
        # the `if _bixdcr:` checks below rely on the caller passing a real
        # falsy value to disable bidirectional mode -- TODO confirm
        _bixdcr = self.input.param("bidirectional", "false")
        _clusters_dic = self.input.clusters
        _src_nodes = copy.copy(_clusters_dic[0])
        _src_master = _src_nodes[0]
        _dest_nodes = copy.copy(_clusters_dic[1])
        _dest_master = _dest_nodes[0]
        # Build source cluster
        self.init_rebalance_cluster_create_testbucket(_src_master, _src_nodes)
        # Build destination cluster
        self.init_rebalance_cluster_create_testbucket(_dest_master,
                                                      _dest_nodes)
        # Setting up XDCR
        self.setup_xdcr_start_replication(_src_master, _dest_master, _rep_type,
                                          _bixdcr)
        shell1 = RemoteMachineShellConnection(_src_master)
        shell2 = RemoteMachineShellConnection(_dest_master)
        src_item_count = 0
        dest_item_count = 0
        if self._os == "centos" or self._os == "ubuntu":
            self.log.info("Load {0} through cbworkloadgen at src..".format(
                self.num_items))
            _1 = "cd /home/{0}/opt/couchbase &&".format(
                _src_master.ssh_username)
            _2 = " ./bin/cbworkloadgen -n localhost:8091 --prefix=s_"
            _3 = " -r .8 -i {0} -s 256 -b testbucket -t 1".format(
                self.num_items)
            _4 = " -u {0} -p {1}".format(_src_master.rest_username,
                                         _src_master.rest_password)
            command_to_load = _1 + _2 + _3 + _4
            o, e = shell1.execute_non_sudo_command(command_to_load)
            shell1.log_command_output(o, e)
            time.sleep(20)
            rest = RestConnection(_src_master)
            src_item_count = rest.fetch_bucket_stats(
                bucket="testbucket")["op"]["samples"]["curr_items"][-1]
            if _bixdcr:
                # Load the destination side too (prefix d_ keeps key spaces
                # disjoint).  NOTE(review): the log message says "at src"
                # but this loads the destination.
                self.log.info("Load {0} through cbworkloadgen at src..".format(
                    self.num_items))
                _1 = "cd /home/{0}/opt/couchbase &&".format(
                    _dest_master.ssh_username)
                _2 = " ./bin/cbworkloadgen -n localhost:8091 --prefix=d_"
                _3 = " -r .8 -i {0} -s 256 -b testbucket -t 1".format(
                    self.num_items)
                _4 = " -u {0} -p {1}".format(_dest_master.rest_username,
                                             _dest_master.rest_password)
                command_to_load = _1 + _2 + _3 + _4
                o, e = shell2.execute_non_sudo_command(command_to_load)
                shell2.log_command_output(o, e)
                time.sleep(20)
                rest = RestConnection(_dest_master)
                dest_item_count = rest.fetch_bucket_stats(
                    bucket="testbucket")["op"]["samples"]["curr_items"][-1]
            self.wait_for_replication_to_catchup(_src_master, _dest_master,
                                                 1200, "destination")
            if _bixdcr:
                self.wait_for_replication_to_catchup(_dest_master, _src_master,
                                                     1200, "source")
            self.log.info("XDC REPLICATION caught up")
            rest1 = RestConnection(_src_master)
            rest2 = RestConnection(_dest_master)
            curr_count_on_src = rest1.fetch_bucket_stats(
                bucket="testbucket")["op"]["samples"]["curr_items"][-1]
            curr_count_on_dest = rest2.fetch_bucket_stats(
                bucket="testbucket")["op"]["samples"]["curr_items"][-1]
            # After catch-up both sides must hold the union of both loads
            assert curr_count_on_src == (
                src_item_count +
                dest_item_count), "ItemCount on source not what's expected"
            assert curr_count_on_dest == (
                src_item_count + dest_item_count
            ), "ItemCount on destination not what's expected"
        elif self._os == "windows":
            # TODO: Windows support
            self.log.info("Yet to add support for windows!")
            pass
        shell1.disconnect()
        shell2.disconnect()
count = 0 prefix = str(uuid.uuid4())[:6] count = 10 * 1000 * 1000 * 1000 size = 512 expiry = 0 if "count" in params: count = int(params["count"]) print count if "size" in params: size = int(params["size"]) if "prefix" in params: prefix = params["prefix"] #if "expiry" in params: # expiry = int(params["expiry"]) payload = MemcachedClientHelper.create_value('*', size) rest = RestConnection(server) buckets = rest.get_buckets() for bucket in buckets: smart = VBucketAwareMemcached(rest, bucket.name) mc = MemcachedClientHelper.proxy_client(server, bucket.name) i = 0 while i < count: try: key = "{0}-{1}".format(prefix, i) #mc = smart.memcached(key) #do an expiry every 10 times mc.set(key, 0, 0, payload) #mc.get(key) i += 1 except Exception as ex: #print ex
def _get_cluster_ca_cert(self): rest = RestConnection(self.host) api = rest.baseUrl + "pools/default/certificate?extended=true" status, content, header = rest._http_request(api, 'GET') return status, content, header
    def common_test_body(self, keys_count, failover_reason):
        """Load data, fail over `num_replicas` nodes for `failover_reason`,
        rebalance (or add the nodes back), then verify cluster stats.

        :param keys_count: number of keys expected (logged only here)
        :param failover_reason: 'stop_server' or 'firewall'
        """
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(self.num_replicas))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))
        self._load_all_buckets(self.master, self.gen_create, "create", 0,
                               batch_size=10000, pause_secs=5,
                               timeout_secs=180)
        self._wait_for_stats_all_buckets(self.servers)
        _servers_ = self.servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()
        RebalanceHelper.wait_for_replication(self.servers, self.cluster)
        chosen = RebalanceHelper.pick_nodes(self.master,
                                            howmany=self.num_replicas)
        for node in chosen:
            # let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info(
                    "10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(
                    RestHelper(rest).wait_for_node_status(
                        node, "unhealthy", 300),
                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(
                    server, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(
                    node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(
                        node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command(
                                    "netsh advfirewall show allprofiles")
                            else:
                                o, r = shell.execute_command(
                                    "/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    for i in rest.get_logs():
                        self.log.error(i)
                    api = rest.baseUrl + 'nodeStatuses'
                    status, content, header = rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail(
                        "node status is not unhealthy even after waiting for 5 minutes")
            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info(
                    "unable to failover the node the first time. try again in 60 seconds..")
                # try again in 75 seconds
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(
                failed_over,
                "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)
        if self.add_back_flag:
            # Re-add every failed node, then rebalance with no ejections
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info(
                "10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(
                chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            # Need a delay > min because MB-7168
            log.info(
                "60 seconds sleep after failover before invoking rebalance...")
            time.sleep(60)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(
                chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
            # Drop the rebalanced-out servers from the working set and
            # remember them for teardown cleanup
            for failed in chosen:
                for server in _servers_:
                    if server.ip == failed.ip:
                        _servers_.remove(server)
                        self._cleanup_nodes.append(server)
        log.info("Begin VERIFICATION ...")
        RebalanceHelper.wait_for_replication(_servers_, self.cluster)
        self.verify_cluster_stats(_servers_, self.master)
def test_ingestion_after_kv_rollback_cbas_disconnected(self):
    """Verify CBAS catches up after a KV rollback that happens while the
    CBAS bucket is disconnected.

    1. Stop persistence on the KV master, delete half the docs (the
       deletes stay un-persisted).
    2. Disconnect the CBAS bucket, kill memcached so the un-persisted
       deletes are rolled back on the KV side.
    3. Reconnect the CBAS bucket and assert the dataset count first
       rolls back (grows past the pre-rollback count) and finally
       converges with the CB bucket.
    """
    self.setup_for_test()

    # Stop persistence so the following deletes are never flushed to
    # disk and will be rolled back when memcached dies.
    self.log.info("Stopping persistence on NodeA")
    mem_client = MemcachedClientHelper.direct_client(
        self.cluster.master, self.cb_bucket_name)
    mem_client.stop_persistence()

    # Perform Create, Update, Delete ops in the CB bucket.
    # BUGFIX: integer division -- on Python 3, num_items / 2 is a float
    # and is not a valid doc-op range bound.
    self.log.info("Performing Mutations")
    self.perform_doc_ops_in_all_cb_buckets("delete", 0, self.num_items // 2)

    # Snapshot the item counts on both sides before the rollback.
    kv_nodes = self.get_kv_nodes(self.servers, self.cluster.master)
    items_in_cb_bucket = 0
    for node in kv_nodes:
        items_in_cb_bucket += self.get_item_count(node, self.cb_bucket_name)
    items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
        self.cbas_dataset_name)
    items_before_rollback = items_in_cbas_bucket
    self.log.info(
        "Before Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
        items_in_cb_bucket, items_in_cbas_bucket)
    self.assertTrue(
        items_in_cb_bucket == items_in_cbas_bucket,
        "Before Rollback : # Items in CBAS bucket does not match that in the CB bucket"
    )

    self.cbas_util.disconnect_from_bucket(self.cbas_bucket_name)

    # Kill memcached on Node A so that Node B becomes master and the
    # un-persisted deletes are rolled back.
    self.log.info("Kill Memcached process on NodeA")
    shell = RemoteMachineShellConnection(self.cluster.master)
    shell.kill_memcached()

    if self.input.param('kill_cbas', False):
        shell = RemoteMachineShellConnection(self.cbas_node)
        shell.kill_process("/opt/couchbase/lib/cbas/runtime/bin/java", "java")
        shell.kill_process("/opt/couchbase/bin/cbas", "cbas")

    # Reconnect the CBAS bucket, retrying for up to ~2 minutes while
    # the analytics service recovers.
    tries = 60
    result = False
    while tries > 0 and not result:
        try:
            result = self.cbas_util.connect_to_bucket(self.cbas_bucket_name)
        except Exception:
            # Best effort: CBAS may still be restarting.
            pass
        # BUGFIX: decrement outside the try block; previously a raising
        # connect_to_bucket() skipped the decrement, so the loop never
        # honoured the retry budget and could spin indefinitely.
        tries -= 1
        self.sleep(2)
    self.assertTrue(
        result,
        "CBAS connect bucket failed after memcached killed on KV node.")

    # Wait (<= 120s) for the dataset count to move past the pre-rollback
    # value, which signals the rollback was ingested.
    curr = time.time()
    while items_in_cbas_bucket != 0 and items_in_cbas_bucket <= items_before_rollback:
        items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)
        if curr + 120 < time.time():
            break
    self.assertTrue(items_in_cbas_bucket > items_before_rollback,
                    "Roll-back did not happen.")
    self.log.info("#######BINGO########\nROLLBACK HAPPENED")

    # Poll (<= 120s) until CB and CBAS item counts converge.
    curr = time.time()
    while items_in_cb_bucket != items_in_cbas_bucket:
        items_in_cb_bucket = 0
        items_in_cbas_bucket = 0
        if self.where_field and self.where_value:
            try:
                items_in_cb_bucket = RestConnection(
                    self.cluster.master).query_tool(
                        'select count(*) from %s where %s = "%s"' %
                        (self.cb_bucket_name, self.where_field,
                         self.where_value))['results'][0]['$1']
            except Exception:
                # Best effort: the indexer may still be rolling back.
                self.log.info(
                    "Indexer in rollback state. Query failed. Pass and move ahead."
                )
        else:
            for node in kv_nodes:
                items_in_cb_bucket += self.get_item_count(
                    node, self.cb_bucket_name)
        self.log.info("Items in CB bucket after rollback: %s"
                      % items_in_cb_bucket)
        items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)
        if curr + 120 < time.time():
            break

    # Final verification: counts must match after rollback + catch-up.
    items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
        self.cbas_dataset_name)
    self.log.info(
        "After Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
        items_in_cb_bucket, items_in_cbas_bucket)
    self.assertTrue(
        items_in_cb_bucket == items_in_cbas_bucket,
        "After Rollback : # Items in CBAS bucket does not match that in the CB bucket"
    )
def test_bucket_backup_restore(self):
    """End-to-end cbbackup/cbrestore sanity check on a non-root install.

    Loads docs into 'testbucket' with cbworkloadgen, backs the bucket up
    with cbbackup, deletes and recreates it via couchbase-cli, restores
    it with cbrestore, and asserts the item count matches the pre-backup
    count.  All CLI tools run from the per-user install under
    /home/<user>/opt/couchbase.  Implemented for centos/ubuntu only;
    windows support is a TODO.
    """
    shell = RemoteMachineShellConnection(self.master)
    self.init_rebalance_cluster_create_testbucket(self.master, self.servers)
    if self._os == "centos" or self._os == "ubuntu":
        # --- Load documents via cbworkloadgen ------------------------
        self.log.info("Load {0} through cbworkloadgen ..".format(
            self.num_items))
        _1 = "cd /home/{0}/opt/couchbase &&".format(
            self.master.ssh_username)
        _2 = " ./bin/cbworkloadgen -n localhost:8091"
        _3 = " -r .8 -i {0} -s 256 -b testbucket -t 1".format(
            self.num_items)
        _4 = " -u {0} -p {1}".format(self.master.rest_username,
                                     self.master.rest_password)
        command_to_load = _1 + _2 + _3 + _4
        o, e = shell.execute_non_sudo_command(command_to_load)
        shell.log_command_output(o, e)
        # NOTE(review): fixed sleep to let the load settle before
        # sampling stats -- confirm 20s is enough for large num_items.
        time.sleep(20)
        rest = RestConnection(self.master)
        # Item count before backup; compared against the post-restore
        # count at the end of the test.
        ini_item_count = rest.fetch_bucket_stats(
            bucket="testbucket")["op"]["samples"]["curr_items"][-1]
        # --- Backup the bucket ---------------------------------------
        self.log.info("Backing up bucket 'testbucket' ..")
        _1 = "cd /home/{0}/opt/couchbase &&".format(
            self.master.ssh_username)
        _2 = " ./bin/cbbackup http://localhost:8091"
        _3 = " /home/{0}/backup".format(self.master.ssh_username)
        _4 = " -u {0} -p {1}".format(self.master.rest_username,
                                     self.master.rest_password)
        command_to_backup = _1 + _2 + _3 + _4
        o, e = shell.execute_non_sudo_command(command_to_backup)
        shell.log_command_output(o, e)
        time.sleep(10)
        # --- Delete the bucket ---------------------------------------
        self.log.info("Deleting bucket ..")
        _1 = "cd /home/{0}/opt/couchbase &&".format(
            self.master.ssh_username)
        _2 = " ./bin/couchbase-cli bucket-delete -c localhost:8091"
        _3 = " --bucket=testbucket"
        _4 = " -u {0} -p {1}".format(self.master.rest_username,
                                     self.master.rest_password)
        command_to_delete_bucket = _1 + _2 + _3 + _4
        o, e = shell.execute_non_sudo_command(command_to_delete_bucket)
        shell.log_command_output(o, e)
        time.sleep(20)
        # Replicas only make sense with at least two nodes.
        if len(self.servers) < 2:
            rep_count = 0
        else:
            rep_count = 1
        # --- Recreate the bucket -------------------------------------
        self.log.info("Recreating bucket ..")
        _1 = "cd /home/{0}/opt/couchbase &&".format(
            self.master.ssh_username)
        _2 = " ./bin/couchbase-cli bucket-create -c localhost:8091"
        _3 = " --bucket=testbucket --bucket-type=couchbase --bucket-port=11211"
        _4 = " --bucket-ramsize=500 --bucket-replica={0} --wait".format(
            rep_count)
        _5 = " -u {0} -p {1}".format(self.master.rest_username,
                                     self.master.rest_password)
        command_to_create_bucket = _1 + _2 + _3 + _4 + _5
        o, e = shell.execute_non_sudo_command(command_to_create_bucket)
        shell.log_command_output(o, e)
        time.sleep(20)
        # --- Restore from the backup ---------------------------------
        self.log.info("Restoring bucket 'testbucket' ..")
        _1 = "cd /home/{0}/opt/couchbase &&".format(
            self.master.ssh_username)
        _2 = " ./bin/cbrestore /home/{0}/backup http://localhost:8091".format(
            self.master.ssh_username)
        _3 = " -b testbucket -B testbucket"
        _4 = " -u {0} -p {1}".format(self.master.rest_username,
                                     self.master.rest_password)
        command_to_restore = _1 + _2 + _3 + _4
        o, e = shell.execute_non_sudo_command(command_to_restore)
        shell.log_command_output(o, e)
        time.sleep(10)
        rest = RestConnection(self.master)
        # Item count after restore.
        fin_item_count = rest.fetch_bucket_stats(
            bucket="testbucket")["op"]["samples"]["curr_items"][-1]
        # Clean up the on-disk backup directory.
        self.log.info("Removing backed-up folder ..")
        command_to_remove_folder = "rm -rf /home/{0}/backup".format(
            self.master.ssh_username)
        o, e = shell.execute_non_sudo_command(command_to_remove_folder)
        shell.log_command_output(o, e)
        # Verification: restore must reproduce the original item count.
        if (fin_item_count == ini_item_count):
            self.log.info(
                "Item count before and after deleting with backup/restore matched, {0}={1}"
                .format(fin_item_count, ini_item_count))
        else:
            self.fail(
                "Item count didnt match - backup/restore, {0}!={1}".format(
                    fin_item_count, ini_item_count))
        # --- Final cleanup: drop the test bucket ---------------------
        self.log.info("Deleting testbucket ..")
        _1 = "cd /home/{0}/opt/couchbase &&".format(
            self.master.ssh_username)
        _2 = " ./bin/couchbase-cli bucket-delete -c localhost:8091"
        _3 = " --bucket=testbucket"
        _4 = " -u {0} -p {1}".format(self.master.rest_username,
                                     self.master.rest_password)
        command_to_delete_bucket = _1 + _2 + _3 + _4
        o, e = shell.execute_non_sudo_command(command_to_delete_bucket)
        shell.log_command_output(o, e)
        time.sleep(10)
    elif self._os == "windows":
        # TODO: Windows support
        self.log.info("Yet to add support for windows!")
        pass
    shell.disconnect()
def _cluster_setup(self): replicas = self.input.param("replicas", 1) keys_count = self.input.param("keys-count", 0) num_buckets = self.input.param("num-buckets", 1) bucket_name = "default" master = self.servers[0] credentials = self.input.membase_settings rest = RestConnection(self.master) info = rest.get_nodes_self() rest.init_cluster(username=self.master.rest_username, password=self.master.rest_password) rest.init_cluster_memoryQuota(memoryQuota=info.mcdMemoryReserved) rest.reset_autofailover() ClusterOperationHelper.add_and_rebalance(self.servers, True) if num_buckets == 1: bucket_ram = info.memoryQuota * 2 / 3 rest.create_bucket(bucket=bucket_name, ramQuotaMB=bucket_ram, replicaNumber=replicas, proxyPort=info.moxi) else: created = BucketOperationHelper.create_multiple_buckets( self.master, replicas, howmany=num_buckets) self.assertTrue(created, "unable to create multiple buckets") buckets = rest.get_buckets() for bucket in buckets: ready = BucketOperationHelper.wait_for_memcached( self.master, bucket.name) self.assertTrue(ready, msg="wait_for_memcached failed") for bucket in buckets: inserted_keys_cnt = self.load_data(self.master, bucket.name, keys_count) log.info('inserted {0} keys'.format(inserted_keys_cnt))
def test_rebalance_kv_rollback_create_ops(self):
    """Verify CBAS rollback handling when a KV rollback occurs while a
    CBAS node is being rebalanced out.

    1. Stop persistence on the KV master and create 50% extra docs
       (un-persisted).
    2. Start removing a CBAS node (the CC or a non-CC node) without
       waiting for the rebalance.
    3. Kill memcached so the un-persisted creates are rolled back, and
       assert the CBAS count drops back to the pre-stop value and then
       converges with the CB bucket once the rebalance completes.
    """
    self.setup_for_test()
    items_before_persistence_stop = self.cbas_util.get_num_items_in_cbas_dataset(
        self.cbas_dataset_name)[0]
    self.log.info("Items in CBAS before persistence stop: %s"
                  % items_before_persistence_stop)

    # Stop persistence so the creates below stay un-persisted.
    self.log.info("Stopping persistence on NodeA")
    mem_client = MemcachedClientHelper.direct_client(
        self.cluster.master, self.cb_bucket_name)
    mem_client.stop_persistence()

    # Create an extra 50% docs on top of the existing ones.
    # BUGFIX: integer division -- on Python 3, num_items * 3 / 2 is a
    # float and is not a valid doc-op range bound.
    self.log.info("Performing Mutations")
    self.perform_doc_ops_in_all_cb_buckets("create", self.num_items,
                                           self.num_items * 3 // 2)

    kv_nodes = self.get_kv_nodes(self.servers, self.cluster.master)
    items_in_cb_bucket = 0
    if self.where_field and self.where_value:
        items_in_cb_bucket = RestConnection(
            self.cluster.master).query_tool(
                'select count(*) from %s where %s = "%s"' %
                (self.cb_bucket_name, self.where_field,
                 self.where_value))['results'][0]['$1']
    else:
        for node in kv_nodes:
            items_in_cb_bucket += self.get_item_count(
                node, self.cb_bucket_name)

    # Validate no. of items in CBAS dataset.
    self.assertTrue(
        self.cbas_util.validate_cbas_dataset_items_count(
            self.cbas_dataset_name, items_in_cb_bucket, 0),
        "No. of items in CBAS dataset do not match that in the CB bucket")

    # Count no. of items in CB & CBAS buckets before the rollback.
    items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
        self.cbas_dataset_name)
    self.log.info(
        "Before Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
        items_in_cb_bucket, items_in_cbas_bucket)
    self.assertTrue(
        items_in_cb_bucket == items_in_cbas_bucket,
        "Before Rollback : # Items in CBAS bucket does not match that in the CB bucket"
    )

    # Start removing a CBAS node without waiting, so the rebalance
    # overlaps the rollback below.  When the CC itself is removed,
    # re-point cbas_util at a surviving CBAS node.
    if self.CC:
        self.cluster_util.remove_node([self.otpNodes[0]],
                                      wait_for_rebalance=False)
        self.cbas_util.closeConn()
        self.cbas_util = CbasUtil(self.cluster.master,
                                  self.cluster.cbas_nodes[0])
        self.cbas_util.createConn("default")
    else:
        self.cluster_util.remove_node([self.otpNodes[1]],
                                      wait_for_rebalance=False)

    # Kill memcached on Node A so that Node B becomes master and the
    # un-persisted creates are rolled back.
    self.log.info("Kill Memcached process on NodeA")
    shell = RemoteMachineShellConnection(self.cluster.master)
    shell.kill_memcached()
    self.sleep(2, "Wait for 2 secs for DCP rollback sent to CBAS.")

    # Wait (<= 120s) for the CBAS count to drop back to (at most) the
    # pre-persistence-stop value.
    curr = time.time()
    while items_in_cbas_bucket == -1 or (
            items_in_cbas_bucket != 0
            and items_in_cbas_bucket > items_before_persistence_stop):
        try:
            if curr + 120 < time.time():
                break
            items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
                self.cbas_dataset_name)
            self.log.info("Items in CBAS: %s" % items_in_cbas_bucket)
        except Exception:
            # Best effort: queries can fail mid-rebalance.
            self.log.info(
                "Probably rebalance is in progress and the reason for queries being failing."
            )
    self.assertTrue(items_in_cbas_bucket <= items_before_persistence_stop,
                    "Roll-back did not happen.")
    self.log.info("#######BINGO########\nROLLBACK HAPPENED")

    # Poll (<= 120s) until CB and CBAS item counts converge (and are
    # non-zero).
    items_in_cb_bucket = 0
    curr = time.time()
    while items_in_cb_bucket != items_in_cbas_bucket or items_in_cb_bucket == 0:
        items_in_cb_bucket = 0
        items_in_cbas_bucket = 0
        if self.where_field and self.where_value:
            try:
                items_in_cb_bucket = RestConnection(
                    self.cluster.master).query_tool(
                        'select count(*) from %s where %s = "%s"' %
                        (self.cb_bucket_name, self.where_field,
                         self.where_value))['results'][0]['$1']
            except Exception:
                # Best effort: the indexer may still be rolling back.
                self.log.info(
                    "Indexer in rollback state. Query failed. Pass and move ahead."
                )
        else:
            for node in kv_nodes:
                items_in_cb_bucket += self.get_item_count(
                    node, self.cb_bucket_name)
        self.log.info("Items in CB bucket after rollback: %s"
                      % items_in_cb_bucket)
        try:
            items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
                self.cbas_dataset_name)
        except Exception:
            # Best effort: CBAS may be unreachable mid-rebalance.
            pass
        if curr + 120 < time.time():
            break

    # Let the in-flight rebalance finish (<= 300s) before asserting.
    str_time = time.time()
    while self.rest._rebalance_progress_status() == "running" \
            and time.time() < str_time + 300:
        self.sleep(1)
        self.log.info("Waiting for rebalance to complete")

    self.log.info(
        "After Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
        items_in_cb_bucket, items_in_cbas_bucket)
    self.assertTrue(
        items_in_cb_bucket == items_in_cbas_bucket,
        "After Rollback : # Items in CBAS bucket does not match that in the CB bucket"
    )
def _cluster_setup(self): keys_count = self.input.param("keys-count", 0) num_buckets = self.input.param("num-buckets", 1) bucketType = self.input.param("bucketType", "ephemeral") evictionPolicy = self.input.param("evictionPolicy", "noEviction") # fullEviction # master = self.servers[0] # credentials = self.input.membase_settings rest = RestConnection(self.master) info = rest.get_nodes_self() rest.init_cluster(username=self.master.rest_username, password=self.master.rest_password) memory = min(info.mcdMemoryReserved, self.input.param("kv_memory", 1000)) rest.init_cluster_memoryQuota(memoryQuota=memory) rest.reset_autoreprovision() self._add_and_rebalance(self.servers, True) if num_buckets == 1: bucket_name = "default" bucket_ram = info.memoryQuota * 2 // 3 rest.create_bucket(bucket=bucket_name, ramQuotaMB=bucket_ram, replicaNumber=self.replicas, proxyPort=info.moxi, bucketType=bucketType, evictionPolicy=evictionPolicy) else: created = BucketOperationHelper.create_multiple_buckets( self.master, self.replicas, howmany=num_buckets, bucketType=bucketType, evictionPolicy=evictionPolicy) self.assertTrue(created, "unable to create multiple buckets") buckets = rest.get_buckets() for bucket in buckets: ready = BucketOperationHelper.wait_for_memcached(self.master, bucket.name) self.assertTrue(ready, msg="wait_for_memcached failed") for bucket in buckets: distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05} inserted_count, rejected_count = self.load_bucket_and_return_the_keys(servers=[self.master], name=bucket.name, # ram_load_ratio=0.02, value_size_distribution=distribution, write_only=True, moxi=True, number_of_threads=2, number_of_items=keys_count) self.loaded_items[bucket.name] = inserted_count
def test_ingestion_after_kv_rollback_delete_ops(self):
    """Verify CBAS ingests a KV rollback of delete operations.

    1. Stop persistence on the KV master and delete half the docs
       (un-persisted).
    2. Kill memcached so the deletes are rolled back on the KV side.
    3. Assert the CBAS dataset count grows back past the pre-rollback
       count and finally converges with the CB bucket.
    """
    self.setup_for_test()

    # Stop persistence so the deletes below stay un-persisted.
    self.log.info("Stopping persistence on NodeA")
    mem_client = MemcachedClientHelper.direct_client(
        self.cluster.master, self.cb_bucket_name)
    mem_client.stop_persistence()

    # Perform Create, Update, Delete ops in the CB bucket.
    # BUGFIX: integer division -- on Python 3, num_items / 2 is a float
    # and is not a valid doc-op range bound.
    self.log.info("Performing Mutations")
    self.perform_doc_ops_in_all_cb_buckets("delete", 0, self.num_items // 2)

    kv_nodes = self.get_kv_nodes(self.servers, self.cluster.master)
    items_in_cb_bucket = 0
    if self.where_field and self.where_value:
        items_in_cb_bucket = RestConnection(
            self.cluster.master).query_tool(
                'select count(*) from %s where %s = "%s"' %
                (self.cb_bucket_name, self.where_field,
                 self.where_value))['results'][0]['$1']
    else:
        for node in kv_nodes:
            items_in_cb_bucket += self.get_item_count(
                node, self.cb_bucket_name)

    # Validate no. of items in CBAS dataset.
    self.assertTrue(
        self.cbas_util.validate_cbas_dataset_items_count(
            self.cbas_dataset_name, items_in_cb_bucket, 0),
        "No. of items in CBAS dataset do not match that in the CB bucket")

    # Count no. of items in CB & CBAS buckets before the rollback.
    items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
        self.cbas_dataset_name)
    items_before_rollback = items_in_cbas_bucket
    self.log.info(
        "Before Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
        items_in_cb_bucket, items_in_cbas_bucket)
    self.assertTrue(
        items_in_cb_bucket == items_in_cbas_bucket,
        "Before Rollback : # Items in CBAS bucket does not match that in the CB bucket"
    )

    # Kill memcached on Node A so that Node B becomes master and the
    # un-persisted deletes are rolled back.
    self.log.info("Kill Memcached process on NodeA")
    shell = RemoteMachineShellConnection(self.cluster.master)
    shell.kill_memcached()
    self.sleep(2, "Wait for 2 secs for DCP rollback sent to CBAS.")

    # Wait (<= 120s) for the dataset count to move past the pre-rollback
    # value, which signals the rollback was ingested.
    curr = time.time()
    while items_in_cbas_bucket != 0 and items_in_cbas_bucket <= items_before_rollback:
        items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)
        if curr + 120 < time.time():
            break
    self.assertTrue(items_in_cbas_bucket > items_before_rollback,
                    "Roll-back did not happen.")
    self.log.info("#######BINGO########\nROLLBACK HAPPENED")

    # Poll (<= 120s) until CB and CBAS item counts converge.
    items_in_cb_bucket = 0
    curr = time.time()
    while items_in_cb_bucket != items_in_cbas_bucket:
        items_in_cb_bucket = 0
        items_in_cbas_bucket = 0
        if self.where_field and self.where_value:
            try:
                items_in_cb_bucket = RestConnection(
                    self.cluster.master).query_tool(
                        'select count(*) from %s where %s = "%s"' %
                        (self.cb_bucket_name, self.where_field,
                         self.where_value))['results'][0]['$1']
            except Exception:
                # Best effort: the indexer may still be rolling back.
                self.log.info(
                    "Indexer in rollback state. Query failed. Pass and move ahead."
                )
        else:
            for node in kv_nodes:
                items_in_cb_bucket += self.get_item_count(
                    node, self.cb_bucket_name)
        self.log.info("Items in CB bucket after rollback: %s"
                      % items_in_cb_bucket)
        items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)
        if curr + 120 < time.time():
            break

    self.log.info(
        "After Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
        items_in_cb_bucket, items_in_cbas_bucket)
    self.assertTrue(
        items_in_cb_bucket == items_in_cbas_bucket,
        "After Rollback : # Items in CBAS bucket does not match that in the CB bucket"
    )
class AutoReprovisionTests(unittest.TestCase): def setUp(self): self.input = TestInputSingleton.input self.case_number = self.input.param("case_number", 0) self.use_master = self.input.param("use_master", False) self.skip_services = self.input.param("skip_services", True) self.replicas = self.input.param("replicas", 1) self.servers = self.input.servers self.log = logger.Logger().get_logger() self.master = self.servers[0] self.rest = RestConnection(self.master) self.timeout = 60 self.loaded_items = dict() AutoReprovisionBaseTest.common_setup(self.input, self) self._cluster_setup() if self.use_master: self.server_fail = self.servers[0] self.master = self.servers[1] else: self.server_fail = self.servers[1] for server in self.servers: remote = RemoteMachineShellConnection(server) output, error = remote.enable_diag_eval_on_non_local_hosts() def tearDown(self): AutoReprovisionBaseTest.common_tearDown(self.servers, self) def sleep(self, timeout=1, message=""): self.log.info("sleep for {0} secs. 
{1} ...".format(timeout, message)) time.sleep(timeout) def test_default_values(self): # read settings and verify settings = self.rest.get_autoreprovision_settings() self.assertEqual(settings.enabled, True) self.assertEqual(settings.max_nodes, 1) self.assertEqual(settings.count, 0) def test_enable(self): status = self.rest.update_autoreprovision_settings(True, 2) if not status: self.fail('failed to change autoreprovision_settings!') # read settings and verify settings = self.rest.get_autoreprovision_settings() self.assertEqual(settings.enabled, True) self.assertEqual(settings.max_nodes, 2) self.assertEqual(settings.count, 0) def test_disable(self): status = self.rest.update_autoreprovision_settings(False, 2) if not status: self.fail('failed to change autoreprovision_settings!') # read settings and verify settings = self.rest.get_autoreprovision_settings() self.assertEqual(settings.enabled, False) self.assertEqual(settings.max_nodes, 1) self.assertEqual(settings.count, 0) def test_valid_max_nodes(self): max_nodes = [1, 2, 5, 10, 100] # 0? 
for max_node in max_nodes: status = self.rest.update_autoreprovision_settings(True, max_node) if not status: self.fail('failed to change autoreprovision_settings!') # read settings and verify settings = self.rest.get_autoreprovision_settings() self.assertEqual(settings.max_nodes, max_node) def test_node_firewall_enabled(self): timeout = self.timeout // 2 status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) RemoteUtilHelper.enable_firewall(self.server_fail) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) self.sleep(5) shell = RemoteMachineShellConnection(self.server_fail) shell.disable_firewall() AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) buckets = self.rest.get_buckets() for bucket in buckets: self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name]) def test_node_stop(self): timeout = self.timeout // 2 status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) self._stop_couchbase(self.server_fail) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) self.sleep(30) self._start_couchbase(self.server_fail) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) self.sleep(10) helper = RestHelper(self.rest) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.assertFalse(helper.is_cluster_rebalanced(), "cluster is not balanced") self.rest.rebalance(otpNodes=[node.id for node in 
self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) buckets = self.rest.get_buckets() for bucket in buckets: self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name]) def test_node_cb_restart(self): timeout = self.timeout // 2 status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) shell = RemoteMachineShellConnection(self.server_fail) shell.restart_couchbase() AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) self.sleep(5) helper = RestHelper(self.rest) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.assertFalse(helper.is_cluster_rebalanced(), "cluster is not balanced") self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) buckets = self.rest.get_buckets() for bucket in buckets: self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name]) def test_node_reboot(self): wait_timeout = 120 timeout = self.timeout // 2 status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) shell = RemoteMachineShellConnection(self.server_fail) if shell.extract_remote_info().type.lower() == 'windows': o, r = shell.execute_command("shutdown -r -f -t 0") elif shell.extract_remote_info().type.lower() == 'linux': o, r = shell.execute_command("reboot") shell.log_command_output(o, r) time.sleep(wait_timeout * 5) # disable firewall on the node shell = RemoteMachineShellConnection(self.server_fail) shell.disable_firewall() AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0, timeout + 
AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) helper = RestHelper(self.rest) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced") self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) buckets = self.rest.get_buckets() for bucket in buckets: self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name]) def test_firewall_node_when_autoreprovisioning(self): wait_timeout = 120 before = self.input.param("before", True) timeout = self.timeout // 2 status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) shell = RemoteMachineShellConnection(self.server_fail) if shell.extract_remote_info().type.lower() == 'windows': o, r = shell.execute_command("shutdown -r -f -t 0") elif shell.extract_remote_info().type.lower() == 'linux': o, r = shell.execute_command("reboot") shell.log_command_output(o, r) if shell.extract_remote_info().type.lower() == 'windows': time.sleep(wait_timeout * 5) else: time.sleep(wait_timeout) # disable firewall on the node shell = RemoteMachineShellConnection(self.server_fail) shell.disable_firewall() AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) helper = RestHelper(self.rest) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced") # self.sleep(5) if before: RemoteUtilHelper.enable_firewall(self.servers[2]) self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) if not before: RemoteUtilHelper.enable_firewall(self.servers[2]) # self.sleep(5) try: self.rest.monitorRebalance() self.fail("Rebalance failed expected") except RebalanceFailedException: 
self.log.info("Rebalance failed but it's expected") shell = RemoteMachineShellConnection(self.servers[2]) shell.disable_firewall() self.sleep(5) self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) buckets = self.rest.get_buckets() for bucket in buckets: self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name]) def test_node_memcached_failure(self): timeout = self.timeout // 2 status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) self._pause_couchbase(self.server_fail) self.sleep(5) AutoReprovisionBaseTest.wait_for_warmup_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) RemoteUtilHelper.common_basic_setup([self.server_fail]) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) helper = RestHelper(self.rest) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced") buckets = self.rest.get_buckets() for bucket in buckets: self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name]) def test_two_failed_nodes(self): timeout = self.timeout // 2 server_fail1 = self.servers[1] server_fail2 = self.servers[2] status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) self.log.info("stopping the first server") self._stop_couchbase(server_fail1) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) self.log.info("stopping the second server") self._stop_couchbase(server_fail2) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 2, timeout + 
AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) helper = RestHelper(self.rest) self.assertFalse(helper.is_cluster_healthy(), "cluster status is healthy") self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced") self._start_couchbase(server_fail1) self._start_couchbase(server_fail1) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) self._start_couchbase(server_fail2) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) self.sleep(20) helper = RestHelper(self.rest) self.assertTrue(helper.is_cluster_healthy(), "cluster status is healthy") self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced") self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) def test_reset_count(self): timeout = self.timeout // 2 server_fail1 = self.servers[1] server_fail2 = self.servers[2] status = self.rest.update_autoreprovision_settings(True, 2) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) self.log.info("stopping the first server") self._stop_couchbase(server_fail1) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) self.log.info("resetting the autoreprovision count") if not self.rest.reset_autoreprovision(): self.fail('failed to reset autoreprovision count!') self.log.info("stopping the second server") self._stop_couchbase(server_fail2) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 2, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) settings = self.rest.get_autoreprovision_settings() self.assertEqual(settings.enabled, True) self.assertEqual(settings.max_nodes, 2) self.assertEqual(settings.count, 0) self._start_couchbase(server_fail2) 
self._start_couchbase(server_fail1) self.sleep(30) settings = self.rest.get_autoreprovision_settings() self.assertEqual(settings.enabled, True) self.assertEqual(settings.max_nodes, 2) self.assertEqual(settings.count, 2) self.log.info("resetting the autoreprovision count") if not self.rest.reset_autoreprovision(): self.fail('failed to reset autoreprovision count!') settings = self.rest.get_autoreprovision_settings() self.assertEqual(settings.enabled, True) self.assertEqual(settings.max_nodes, 2) self.assertEqual(settings.count, 0) helper = RestHelper(self.rest) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced") self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) def test_invalid_max_nodes(self): max_nodes = [-30, -2, 0.5, 'df', True] # 3000000000 ? for max_node in max_nodes: status = self.rest.update_autoreprovision_settings(True, max_node) if status: self.fail('autoreprovision_settings have been changed incorrectly!') # read settings and verify settings = self.rest.get_autoreprovision_settings() self.assertEqual(settings.max_nodes, 1) def test_node_memcached_failure_in_series(self): timeout = self.timeout // 2 status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) data_lost = False for i in reversed(range(len(self.servers))): print(self.servers[i]) operation = random.choice(['stop', 'memcached_failure', 'restart', 'failover', 'reboot']) shell = RemoteMachineShellConnection(self.servers[i]) print("operation", operation) if i == 0: self.master = self.servers[1] if operation == 'stop': self._stop_couchbase(self.servers[i]) elif operation == 'memcached_failure': self._pause_couchbase(self.servers[i]) elif operation == 'restart': shell.restart_couchbase() elif operation == 'failover': 
RemoteUtilHelper.enable_firewall(self.servers[i]) elif operation == 'reboot': if shell.extract_remote_info().type.lower() == 'windows': o, r = shell.execute_command("shutdown -r -f -t 0") self.sleep(200) elif shell.extract_remote_info().type.lower() == 'linux': o, r = shell.execute_command("reboot") shell.log_command_output(o, r) self.sleep(60) self.sleep(40) if operation == 'memcached_failure': AutoReprovisionBaseTest.wait_for_warmup_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) if operation != 'restart' and operation != 'memcached_failure' and operation != 'reboot': AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) if operation != 'restart': RemoteUtilHelper.common_basic_setup([self.servers[i]]) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) helper = RestHelper(RestConnection(self.master)) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.sleep(40) if operation == 'memcached_failure' or operation == 'failover': self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced") else: if 'kv' in self.servers[i].services and self.replicas > 0: self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced") self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) else: self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced") buckets = self.rest.get_buckets() if self.replicas == 0 and (operation == 'restart' or operation == 'reboot'): data_lost = True for bucket in buckets: if not data_lost: self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name]) def test_ui_logs(self): timeout = self.timeout // 2 server_fail1 = self.servers[1] server_fail2 = self.servers[2] status = 
self.rest.update_autoreprovision_settings(True, 2) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) logs = self.rest.get_logs(5) self.assertTrue('Enabled auto-reprovision config with max_nodes set to 2' in [l['text'] for l in logs]) self.log.info("stopping the first server") self._stop_couchbase(server_fail1) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) self.log.info("resetting the autoreprovision count") if not self.rest.reset_autoreprovision(): self.fail('failed to reset autoreprovision count!') logs = self.rest.get_logs(5) self.assertTrue('auto-reprovision count reset from 0' in [l['text'] for l in logs]) self.log.info("stopping the second server") self._stop_couchbase(server_fail2) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 2, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) settings = self.rest.get_autoreprovision_settings() self.assertEqual(settings.enabled, True) self.assertEqual(settings.max_nodes, 2) self.assertEqual(settings.count, 0) self._start_couchbase(server_fail2) self._start_couchbase(server_fail1) self.sleep(30) settings = self.rest.get_autoreprovision_settings() self.assertEqual(settings.enabled, True) self.assertEqual(settings.max_nodes, 2) self.assertEqual(settings.count, 2) logs = self.rest.get_logs(5) self.assertTrue('auto-reprovision is disabled as maximum number of nodes (2) ' 'that can be auto-reprovisioned has been reached.' 
in [l['text'] for l in logs]) self.log.info("resetting the autoreprovision count") if not self.rest.reset_autoreprovision(): self.fail('failed to reset autoreprovision count!') settings = self.rest.get_autoreprovision_settings() self.assertEqual(settings.enabled, True) self.assertEqual(settings.max_nodes, 2) self.assertEqual(settings.count, 0) logs = self.rest.get_logs(5) self.assertTrue('auto-reprovision count reset from 2' in [l['text'] for l in logs]) helper = RestHelper(self.rest) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced") self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) logs = self.rest.get_logs(5) # https://issues.couchbase.com/browse/MB-24520 self.assertFalse('Reset auto-failover count' in [l['text'] for l in logs]) self.assertTrue('Rebalance completed successfully.' in [l['text'] for l in logs]) def _stop_couchbase(self, server): shell = RemoteMachineShellConnection(server) shell.stop_couchbase() shell.disconnect() log.info("stopped couchbase server on {0}".format(server)) def _start_couchbase(self, server): shell = RemoteMachineShellConnection(server) shell.start_couchbase() shell.disconnect() log.info("srarted couchbase server on {0}".format(server)) # the signals SIGSTOP and SIGCONT do not exist for Windows def _pause_couchbase(self, server): self._pause_beam(server) self._pause_memcached(server) # the signals SIGSTOP and SIGCONT do not exist for Windows def _pause_memcached(self, server): shell = RemoteMachineShellConnection(server) shell.pause_memcached() shell.disconnect() log.info("stopped couchbase server on {0}".format(server)) # the signals SIGSTOP and SIGCONT do not exist for Windows def _pause_beam(self, server): shell = RemoteMachineShellConnection(server) shell.pause_beam() shell.disconnect() log.info("stopped couchbase server on 
{0}".format(server)) @staticmethod def load_bucket_and_return_the_keys(servers=None, name='default', ram_load_ratio=-1, number_of_items=-1, value_size_distribution=None, number_of_threads=1, override_vBucketId=-1, write_only=False, moxi=True, delete_ratio=0, expiry_ratio=0): inserted_keys = [] rejected_keys = [] log = logger.Logger.get_logger() threads = MemcachedClientHelper.create_threads(servers, name, ram_load_ratio, number_of_items, value_size_distribution, number_of_threads, override_vBucketId, write_only=write_only, moxi=moxi, delete_ratio=delete_ratio, expiry_ratio=expiry_ratio) # we can start them! for thread in threads: thread.start() log.info("waiting for all worker thread to finish their work...") [thread.join() for thread in threads] log.info("worker threads are done...") inserted_count = 0 rejected_count = 0 deleted_count = 0 expired_count = 0 for thread in threads: t_inserted, t_rejected = thread.keys_set() inserted_count += thread.inserted_keys_count() rejected_count += thread.rejected_keys_count() deleted_count += thread._delete_count expired_count += thread._expiry_count inserted_keys.extend(t_inserted) rejected_keys.extend(t_rejected) msg = "inserted keys count : {0} , rejected keys count : {1}" log.info(msg.format(inserted_count, rejected_count)) msg = "deleted keys count : {0} , expired keys count : {1}" log.info(msg.format(deleted_count, expired_count)) return inserted_count, rejected_count def verify_loaded_data(self, master, bucket, inserted_count): rest = RestConnection(master) bucket_count_map = rest.get_buckets_itemCount() msg = "Before count : {0} , After count : {1}" log.info(msg.format(inserted_count, bucket_count_map[bucket])) self.assertEqual(bucket_count_map[bucket], inserted_count, msg="unable to verify keys after restore") def _cluster_setup(self): keys_count = self.input.param("keys-count", 0) num_buckets = self.input.param("num-buckets", 1) bucketType = self.input.param("bucketType", "ephemeral") evictionPolicy = 
self.input.param("evictionPolicy", "noEviction") # fullEviction # master = self.servers[0] # credentials = self.input.membase_settings rest = RestConnection(self.master) info = rest.get_nodes_self() rest.init_cluster(username=self.master.rest_username, password=self.master.rest_password) memory = min(info.mcdMemoryReserved, self.input.param("kv_memory", 1000)) rest.init_cluster_memoryQuota(memoryQuota=memory) rest.reset_autoreprovision() self._add_and_rebalance(self.servers, True) if num_buckets == 1: bucket_name = "default" bucket_ram = info.memoryQuota * 2 // 3 rest.create_bucket(bucket=bucket_name, ramQuotaMB=bucket_ram, replicaNumber=self.replicas, proxyPort=info.moxi, bucketType=bucketType, evictionPolicy=evictionPolicy) else: created = BucketOperationHelper.create_multiple_buckets( self.master, self.replicas, howmany=num_buckets, bucketType=bucketType, evictionPolicy=evictionPolicy) self.assertTrue(created, "unable to create multiple buckets") buckets = rest.get_buckets() for bucket in buckets: ready = BucketOperationHelper.wait_for_memcached(self.master, bucket.name) self.assertTrue(ready, msg="wait_for_memcached failed") for bucket in buckets: distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05} inserted_count, rejected_count = self.load_bucket_and_return_the_keys(servers=[self.master], name=bucket.name, # ram_load_ratio=0.02, value_size_distribution=distribution, write_only=True, moxi=True, number_of_threads=2, number_of_items=keys_count) self.loaded_items[bucket.name] = inserted_count def _add_and_rebalance(self, servers, wait_for_rebalance=True): log = logger.Logger.get_logger() master = servers[0] all_nodes_added = True rebalanced = True rest = RestConnection(master) if len(servers) > 1: for serverInfo in servers[1:]: log.info('adding {0} node : {1}:{2} to the cluster'.format( serverInfo.services, serverInfo.ip, serverInfo.port)) otpNode = rest.add_node(master.rest_username, master.rest_password, serverInfo.ip, port=serverInfo.port, services=["kv"]) if 
otpNode: log.info('added node : {0} to the cluster'.format(otpNode.id)) else: all_nodes_added = False break if all_nodes_added: rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[]) if wait_for_rebalance: rebalanced &= rest.monitorRebalance() else: rebalanced = False return all_nodes_added and rebalanced
class AutoFailoverTests(unittest.TestCase):
    """Functional tests for the cluster auto-failover feature: settings
    validation, and failover triggered by firewall, stop and pause faults."""

    def setUp(self):
        self.input = TestInputSingleton.input
        self.case_number = self.input.param("case_number", 0)
        self.servers = self.input.servers
        self.log = logger.Logger().get_logger()
        self.master = self.servers[0]
        self.rest = RestConnection(self.master)
        self.timeout = 60  # base auto-failover timeout (seconds)
        AutoFailoverBaseTest.common_setup(self.input, self)
        self._cluster_setup()

    def tearDown(self):
        AutoFailoverBaseTest.common_tearDown(self.servers, self)

    def sleep(self, timeout=1, message=""):
        """Log and sleep for `timeout` seconds."""
        self.log.info("sleep for {0} secs. {1} ...".format(timeout, message))
        time.sleep(timeout)

    def test_enable(self):
        """Enabling auto-failover with a valid timeout must be accepted."""
        # // keeps the timeout an int (was `/`, a float under Python 3)
        status = self.rest.update_autofailover_settings(True, self.timeout // 2)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        # read settings and verify
        settings = self.rest.get_autofailover_settings()
        self.assertEqual(settings.enabled, True)

    def test_disable(self):
        """Disabling auto-failover must be accepted."""
        status = self.rest.update_autofailover_settings(False, self.timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        # read settings and verify
        settings = self.rest.get_autofailover_settings()
        self.assertEqual(settings.enabled, False)

    def test_valid_timeouts(self):
        """Every timeout >= 30s must be accepted and stored verbatim."""
        timeouts = [30, 31, 300, 3600]
        for timeout in timeouts:
            status = self.rest.update_autofailover_settings(True, timeout)
            if not status:
                self.fail('failed to change autofailover_settings! See MB-7282')
            # read settings and verify
            settings = self.rest.get_autofailover_settings()
            self.assertTrue(settings.timeout == timeout)

    def test_30s_timeout_firewall(self):
        """Firewalled node must be auto-failed-over within a 30s timeout."""
        timeout = self.timeout // 2
        server_fail = self.servers[1]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        RemoteUtilHelper.enable_firewall(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_60s_timeout_firewall(self):
        """Firewalled node must be auto-failed-over within a 60s timeout."""
        timeout = self.timeout
        server_fail = self.servers[1]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        RemoteUtilHelper.enable_firewall(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_30s_timeout_stop(self):
        """Stopped node must be auto-failed-over within the timeout."""
        timeout = self.timeout
        server_fail = self.servers[1]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self._stop_couchbase(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_60s_timeout_stop(self):
        """Stopped node must be auto-failed-over within a 60s timeout."""
        timeout = self.timeout
        server_fail = self.servers[1]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self._stop_couchbase(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_reset_count(self):
        """Resetting the auto-failover count re-arms failover for a second node."""
        timeout = self.timeout // 2
        server_fail1 = self.servers[1]
        server_fail2 = self.servers[2]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self.log.info("stopping the first server")
        self._stop_couchbase(server_fail1)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)
        self.log.info("resetting the autofailover count")
        if not self.rest.reset_autofailover():
            self.fail('failed to reset autofailover count!')
        self.log.info("stopping the second server")
        self._stop_couchbase(server_fail2)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)
        self.log.info("resetting the autofailover count")
        if not self.rest.reset_autofailover():
            self.fail('failed to reset autofailover count!')

    def test_30s_timeout_pause(self):
        """Paused (SIGSTOPped) node must be auto-failed-over; skipped on Windows."""
        timeout = self.timeout // 2
        server_fail = self.servers[1]
        shell = RemoteMachineShellConnection(server_fail)
        os_type = shell.extract_remote_info().distribution_type  # renamed: shadowed builtin `type`
        shell.disconnect()
        if os_type.lower() == 'windows':
            self.log.info(
                "test will be skipped because the signals SIGSTOP and SIGCONT do not exist for Windows"
            )
            return
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self._pause_couchbase(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_60s_timeout_pause(self):
        """Paused node must be auto-failed-over within 60s; skipped on Windows."""
        timeout = self.timeout
        server_fail = self.servers[1]
        shell = RemoteMachineShellConnection(server_fail)
        os_type = shell.extract_remote_info().distribution_type  # renamed: shadowed builtin `type`
        shell.disconnect()
        if os_type.lower() == 'windows':
            self.log.info(
                "test will be skipped because the signals SIGSTOP and SIGCONT do not exist for Windows"
            )
            return
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self._pause_couchbase(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_invalid_timeouts(self):
        """Out-of-range timeouts must be rejected and never lower the stored value below 30."""
        timeouts = [-360, -60, 0, 15, 29, 300000]
        for timeout in timeouts:
            status = self.rest.update_autofailover_settings(True, timeout)
            if status:
                self.fail('autofailover_settings have been changed incorrectly!')
            # read settings and verify
            settings = self.rest.get_autofailover_settings()
            self.assertTrue(settings.timeout >= 30)

    def test_two_failed_nodes(self):
        """Only the first failed node is auto-failed-over; the second must not be."""
        timeout = self.timeout // 2
        server_fail1 = self.servers[1]
        server_fail2 = self.servers[2]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self.log.info("stopping the first server")
        self._stop_couchbase(server_fail1)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)
        self.log.info("stopping the second server")
        self._stop_couchbase(server_fail2)
        AutoFailoverBaseTest.wait_for_no_failover_or_assert(
            self.master, 2,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def _stop_couchbase(self, server):
        """Stop the couchbase service on `server` over SSH."""
        shell = RemoteMachineShellConnection(server)
        shell.stop_couchbase()
        shell.disconnect()
        log.info("stopped couchbase server on {0}".format(server))

    # the signals SIGSTOP and SIGCONT do not exist for Windows
    def _pause_couchbase(self, server):
        """Freeze both the erlang VM and memcached to simulate a hung node."""
        self._pause_beam(server)
        self._pause_memcached(server)

    # the signals SIGSTOP and SIGCONT do not exist for Windows
    def _pause_memcached(self, server):
        """SIGSTOP the memcached process on `server`."""
        shell = RemoteMachineShellConnection(server)
        shell.pause_memcached()
        shell.disconnect()
        # fixed log text: this pauses memcached, it does not stop the server
        log.info("paused memcached on {0}".format(server))

    # the signals SIGSTOP and SIGCONT do not exist for Windows
    def _pause_beam(self, server):
        """SIGSTOP the beam (erlang VM) process on `server`."""
        shell = RemoteMachineShellConnection(server)
        shell.pause_beam()
        shell.disconnect()
        # fixed log text: this pauses beam, it does not stop the server
        log.info("paused beam on {0}".format(server))

    def load_data(self, master, bucket, keys_count):
        """Load `keys_count` items into `bucket`, retrying up to 5 consecutive
        empty rounds, then wait for disk persistence. Returns inserted count."""
        inserted_keys_cnt = 0
        repeat_count = 0
        while inserted_keys_cnt < keys_count and repeat_count < 5:
            keys_cnt, rejected_keys_cnt = \
                MemcachedClientHelper.load_bucket(servers=[master],
                                                  name=bucket,
                                                  number_of_items=keys_count,
                                                  number_of_threads=5,
                                                  write_only=True)
            inserted_keys_cnt += keys_cnt
            if keys_cnt == 0:
                repeat_count += 1
            else:
                repeat_count = 0
        if repeat_count == 5:
            log.exception("impossible to load data")
        log.info("wait until data is completely persisted on the disk")
        RebalanceHelper.wait_for_persistence(master, bucket)
        return inserted_keys_cnt

    def _cluster_setup(self):
        """Initialize the cluster, create bucket(s) and load the initial data."""
        replicas = self.input.param("replicas", 1)
        keys_count = self.input.param("keys-count", 0)
        num_buckets = self.input.param("num-buckets", 1)
        bucket_name = "default"
        # removed unused locals `master` and `credentials`
        rest = RestConnection(self.master)
        info = rest.get_nodes_self()
        rest.init_cluster(username=self.master.rest_username,
                          password=self.master.rest_password)
        rest.init_cluster_memoryQuota(memoryQuota=info.mcdMemoryReserved)
        rest.reset_autofailover()
        ClusterOperationHelper.add_and_rebalance(self.servers, True)
        if num_buckets == 1:
            # // keeps the quota an int (was `/`, a float under Python 3)
            bucket_ram = info.memoryQuota * 2 // 3
            rest.create_bucket(bucket=bucket_name,
                               ramQuotaMB=bucket_ram,
                               replicaNumber=replicas,
                               proxyPort=info.moxi)
        else:
            created = BucketOperationHelper.create_multiple_buckets(
                self.master, replicas, howmany=num_buckets)
            self.assertTrue(created, "unable to create multiple buckets")
        buckets = rest.get_buckets()
        for bucket in buckets:
            ready = BucketOperationHelper.wait_for_memcached(
                self.master, bucket.name)
            self.assertTrue(ready, msg="wait_for_memcached failed")
        for bucket in buckets:
            inserted_keys_cnt = self.load_data(self.master, bucket.name, keys_count)
            log.info('inserted {0} keys'.format(inserted_keys_cnt))
def rebalance_out_with_failover(self): fail_over = self.input.param("fail_over", False) self.rest = RestConnection(self.master) gen_delete = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items / 2, end=self.num_items) gen_create = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items + 1, end=self.num_items * 3 / 2) # define which doc's ops will be performed during rebalancing # allows multiple of them but one by one tasks = [] if (self.doc_ops is not None): if ("update" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0) if ("create" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0) if ("delete" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0) for task in tasks: task.result() ejectedNode = self.find_node_info(self.master, self.servers[self.nodes_init - 1]) self._verify_stats_all_buckets(self.servers[:self.num_servers], timeout=120) self._wait_for_stats_all_buckets(self.servers[:self.num_servers]) self.sleep(20) prev_failover_stats = self.get_failovers_logs( self.servers[:self.nodes_init], self.buckets) prev_vbucket_stats = self.get_vbucket_seqnos( self.servers[:self.nodes_init], self.buckets) record_data_set = self.get_data_set_all(self.servers[:self.nodes_init], self.buckets) self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats) self.rest = RestConnection(self.master) chosen = RebalanceHelper.pick_nodes(self.master, howmany=1) new_server_list = self.add_remove_servers( self.servers, self.servers[:self.nodes_init], [self.servers[self.nodes_init - 1], chosen[0]], []) # Mark Node for failover success_failed_over = self.rest.fail_over(chosen[0].id, graceful=fail_over) self.nodes = self.rest.node_statuses() self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[chosen[0].id, ejectedNode.id]) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), 
msg="Rebalance failed") self.verify_cluster_stats(new_server_list, check_ep_items_remaining=True) self.sleep(30) self.data_analysis_all(record_data_set, new_server_list, self.buckets) self.verify_unacked_bytes_all_buckets() nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0, total_vbuckets=self.total_vbuckets)
    def test_node_memcached_failure_in_series(self):
        """Disrupt each node in turn with a randomly chosen failure mode
        (stop / memcached pause / restart / firewall failover / reboot) and
        verify auto-reprovision brings the cluster back healthy and balanced.
        Data is only re-verified when the failure mode could not have lost it.
        """
        timeout = self.timeout // 2
        status = self.rest.update_autoreprovision_settings(True, 1)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        data_lost = False
        # walk nodes last-to-first so the master (index 0) is disturbed last
        for i in reversed(range(len(self.servers))):
            print(self.servers[i])
            operation = random.choice(['stop', 'memcached_failure', 'restart',
                                       'failover', 'reboot'])
            shell = RemoteMachineShellConnection(self.servers[i])
            print("operation", operation)
            if i == 0:
                # the orchestrating master is about to be disrupted; drive the
                # rest of this iteration from another node
                self.master = self.servers[1]
            if operation == 'stop':
                self._stop_couchbase(self.servers[i])
            elif operation == 'memcached_failure':
                self._pause_couchbase(self.servers[i])
            elif operation == 'restart':
                shell.restart_couchbase()
            elif operation == 'failover':
                RemoteUtilHelper.enable_firewall(self.servers[i])
            elif operation == 'reboot':
                # NOTE(review): `o, r` would be unbound if the remote OS is
                # neither windows nor linux — confirm only these two occur
                if shell.extract_remote_info().type.lower() == 'windows':
                    o, r = shell.execute_command("shutdown -r -f -t 0")
                    self.sleep(200)
                elif shell.extract_remote_info().type.lower() == 'linux':
                    o, r = shell.execute_command("reboot")
                shell.log_command_output(o, r)
                self.sleep(60)
            self.sleep(40)
            # a paused memcached should trigger a warmup once resumed
            if operation == 'memcached_failure':
                AutoReprovisionBaseTest.wait_for_warmup_or_assert(
                    self.master, 1,
                    timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                    self)
            # stop/pause/firewall faults should be detected as a failover
            if operation != 'restart' and operation != 'memcached_failure' and operation != 'reboot':
                AutoReprovisionBaseTest.wait_for_failover_or_assert(
                    self.master, 1,
                    timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                    self)
            # bring the node back (restart recovers by itself)
            if operation != 'restart':
                RemoteUtilHelper.common_basic_setup([self.servers[i]])
            AutoReprovisionBaseTest.wait_for_failover_or_assert(
                self.master, 0,
                timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                self)
            helper = RestHelper(RestConnection(self.master))
            self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
            self.sleep(40)
            if operation == 'memcached_failure' or operation == 'failover':
                self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
            else:
                if 'kv' in self.servers[i].services and self.replicas > 0:
                    # restarted/rebooted KV node with replicas comes back
                    # unbalanced and needs an explicit rebalance
                    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
                    self.rest.rebalance(
                        otpNodes=[node.id for node in self.rest.node_statuses()],
                        ejectedNodes=[])
                    self.assertTrue(self.rest.monitorRebalance())
                else:
                    self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
            buckets = self.rest.get_buckets()
            # without replicas a full restart/reboot loses ephemeral data
            if self.replicas == 0 and (operation == 'restart' or operation == 'reboot'):
                data_lost = True
            for bucket in buckets:
                if not data_lost:
                    self.verify_loaded_data(self.master, bucket.name,
                                            self.loaded_items[bucket.name])
class RebalanceOutTests(RebalanceBaseTest): def setUp(self): super(RebalanceOutTests, self).setUp() def tearDown(self): super(RebalanceOutTests, self).tearDown() """Rebalances nodes out of a cluster while doing docs ops:create, delete, update. This test begins with all servers clustered together and loads a user defined number of items into the cluster. Before rebalance we perform docs ops(add/remove/update/read) in the cluster( operate with a half of items that were loaded before).It then remove nodes_out from the cluster at a time and rebalances. Once the cluster has been rebalanced we wait for the disk queues to drain, and then verify that there has been no data loss, sum(curr_items) match the curr_items_total. We also check for data and its meta-data, vbucket sequene numbers""" def rebalance_out_after_ops(self): gen_delete = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items / 2, end=self.num_items) gen_create = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items + 1, end=self.num_items * 3 / 2) # define which doc's ops will be performed during rebalancing # allows multiple of them but one by one tasks = [] if (self.doc_ops is not None): if ("update" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0) if ("create" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0) if ("delete" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0) for task in tasks: task.result() servs_out = [ self.servers[self.num_servers - i - 1] for i in range(self.nodes_out) ] self._verify_stats_all_buckets(self.servers[:self.num_servers], timeout=120) self._wait_for_stats_all_buckets(self.servers[:self.num_servers]) prev_failover_stats = self.get_failovers_logs( self.servers[:self.num_servers], self.buckets) prev_vbucket_stats = self.get_vbucket_seqnos( self.servers[:self.num_servers], self.buckets) record_data_set = 
self.get_data_set_all( self.servers[:self.num_servers], self.buckets) self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats) rebalance = self.cluster.async_rebalance(self.servers[:1], [], servs_out) rebalance.result() self._verify_stats_all_buckets(self.servers[:self.num_servers - self.nodes_out], timeout=120) self.verify_cluster_stats(self.servers[:self.num_servers - self.nodes_out], check_ep_items_remaining=True) new_failover_stats = self.compare_failovers_logs( prev_failover_stats, self.servers[:self.num_servers - self.nodes_out], self.buckets) new_vbucket_stats = self.compare_vbucket_seqnos( prev_vbucket_stats, self.servers[:self.num_servers - self.nodes_out], self.buckets, perNode=False) self.sleep(60) self.data_analysis_all( record_data_set, self.servers[:self.num_servers - self.nodes_out], self.buckets) self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats) self.verify_unacked_bytes_all_buckets() nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0, total_vbuckets=self.total_vbuckets) """Rebalances nodes out with failover and full recovery add back of a node This test begins with all servers clustered together and loads a user defined number of items into the cluster. Before rebalance we perform docs ops(add/remove/update/read) in the cluster( operate with a half of items that were loaded before).It then remove nodes_out from the cluster at a time and rebalances. Once the cluster has been rebalanced we wait for the disk queues to drain, and then verify that there has been no data loss, sum(curr_items) match the curr_items_total. 
We also check for data and its meta-data, vbucket sequene numbers""" def rebalance_out_with_failover_full_addback_recovery(self): gen_delete = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items / 2, end=self.num_items) gen_create = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items + 1, end=self.num_items * 3 / 2) # define which doc's ops will be performed during rebalancing # allows multiple of them but one by one tasks = [] if (self.doc_ops is not None): if ("update" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0) if ("create" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0) if ("delete" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0) for task in tasks: task.result() servs_out = [ self.servers[self.num_servers - i - 1] for i in range(self.nodes_out) ] self._verify_stats_all_buckets(self.servers[:self.num_servers], timeout=120) self._wait_for_stats_all_buckets(self.servers[:self.num_servers]) self.rest = RestConnection(self.master) chosen = RebalanceHelper.pick_nodes(self.master, howmany=1) self.sleep(20) prev_failover_stats = self.get_failovers_logs( self.servers[:self.num_servers], self.buckets) prev_vbucket_stats = self.get_vbucket_seqnos( self.servers[:self.num_servers], self.buckets) record_data_set = self.get_data_set_all( self.servers[:self.num_servers], self.buckets) self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats) # Mark Node for failover success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False) # Mark Node for full recovery if success_failed_over: self.rest.set_recovery_type(otpNode=chosen[0].id, recoveryType="full") rebalance = self.cluster.async_rebalance(self.servers[:1], [], servs_out) rebalance.result() self.verify_cluster_stats(self.servers[:self.num_servers - self.nodes_out], check_ep_items_remaining=True) 
self.compare_failovers_logs( prev_failover_stats, self.servers[:self.num_servers - self.nodes_out], self.buckets) self.sleep(30) self.data_analysis_all( record_data_set, self.servers[:self.num_servers - self.nodes_out], self.buckets) self.verify_unacked_bytes_all_buckets() nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0, total_vbuckets=self.total_vbuckets) """Rebalances nodes out with failover This test begins with all servers clustered together and loads a user defined number of items into the cluster. Before rebalance we perform docs ops(add/remove/update/read) in the cluster( operate with a half of items that were loaded before).It then remove nodes_out from the cluster at a time and rebalances. Once the cluster has been rebalanced we wait for the disk queues to drain, and then verify that there has been no data loss, sum(curr_items) match the curr_items_total. We also check for data and its meta-data, vbucket sequene numbers""" def rebalance_out_with_failover(self): fail_over = self.input.param("fail_over", False) self.rest = RestConnection(self.master) gen_delete = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items / 2, end=self.num_items) gen_create = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items + 1, end=self.num_items * 3 / 2) # define which doc's ops will be performed during rebalancing # allows multiple of them but one by one tasks = [] if (self.doc_ops is not None): if ("update" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0) if ("create" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0) if ("delete" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0) for task in tasks: task.result() ejectedNode = self.find_node_info(self.master, self.servers[self.nodes_init - 1]) 
self._verify_stats_all_buckets(self.servers[:self.num_servers], timeout=120) self._wait_for_stats_all_buckets(self.servers[:self.num_servers]) self.sleep(20) prev_failover_stats = self.get_failovers_logs( self.servers[:self.nodes_init], self.buckets) prev_vbucket_stats = self.get_vbucket_seqnos( self.servers[:self.nodes_init], self.buckets) record_data_set = self.get_data_set_all(self.servers[:self.nodes_init], self.buckets) self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats) self.rest = RestConnection(self.master) chosen = RebalanceHelper.pick_nodes(self.master, howmany=1) new_server_list = self.add_remove_servers( self.servers, self.servers[:self.nodes_init], [self.servers[self.nodes_init - 1], chosen[0]], []) # Mark Node for failover success_failed_over = self.rest.fail_over(chosen[0].id, graceful=fail_over) self.nodes = self.rest.node_statuses() self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[chosen[0].id, ejectedNode.id]) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Rebalance failed") self.verify_cluster_stats(new_server_list, check_ep_items_remaining=True) self.sleep(30) self.data_analysis_all(record_data_set, new_server_list, self.buckets) self.verify_unacked_bytes_all_buckets() nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0, total_vbuckets=self.total_vbuckets) """Rebalances nodes out of a cluster while doing docs ops:create, delete, update. This test begins with all servers clustered together and loads a user defined number of items into the cluster. It then remove nodes_out from the cluster at a time and rebalances. During the rebalance we perform docs ops(add/remove/update/read) in the cluster( operate with a half of items that were loaded before). 
Once the cluster has been rebalanced we wait for the disk queues to drain, and then verify that there has been no data loss, sum(curr_items) match the curr_items_total. Once all nodes have been rebalanced the test is finished.""" def rebalance_out_with_ops(self): tasks = list() gen_delete = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items / 2, end=self.num_items) gen_create = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items + 1, end=self.num_items * 3 / 2) servs_out = [ self.servers[self.num_servers - i - 1] for i in range(self.nodes_out) ] # define which doc's ops will be performed during rebalancing # allows multiple of them but one by one if self.doc_ops is not None: if "update" in self.doc_ops: tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0) if "create" in self.doc_ops: tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0) if "delete" in self.doc_ops: tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0) rebalance = self.cluster.async_rebalance( self.servers[:1], [], servs_out, sleep_before_rebalance=self.sleep_before_rebalance) rebalance.result() for task in tasks: task.result() self.verify_cluster_stats(self.servers[:self.num_servers - self.nodes_out]) self.verify_unacked_bytes_all_buckets() # Validate seq_no snap_start/stop values after rebalance self.check_snap_start_corruption() """Rebalances nodes out of a cluster while doing docs ops:create, delete, update along with compaction. This test begins with all servers clustered together and loads a user defined number of items into the cluster. It then remove nodes_out from the cluster at a time and rebalances. During the rebalance we perform docs ops(add/remove/update/read) in the cluster( operate with a half of items that were loaded before). 
Once the cluster has been rebalanced we wait for the disk queues to drain, and then verify that there has been no data loss, sum(curr_items) match the curr_items_total. Once all nodes have been rebalanced the test is finished.""" def rebalance_out_with_compaction_and_ops(self): gen_delete = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items / 2, end=self.num_items) gen_create = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items + 1, end=self.num_items * 3 / 2) servs_out = [ self.servers[self.num_servers - i - 1] for i in range(self.nodes_out) ] tasks = [self.cluster.async_rebalance(self.servers[:1], [], servs_out)] for bucket in self.buckets: tasks.append(self.cluster.async_compact_bucket( self.master, bucket)) # define which doc's ops will be performed during rebalancing # allows multiple of them but one by one if (self.doc_ops is not None): if ("update" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0) if ("create" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0) if ("delete" in self.doc_ops): tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0) for task in tasks: task.result() self.verify_cluster_stats(self.servers[:self.num_servers - self.nodes_out]) self.verify_unacked_bytes_all_buckets() """Rebalances nodes from a cluster during getting random keys. This test begins with all servers clustered together and loads a user defined number of items into the cluster. Then we send requests to all nodes in the cluster to get random key values. Next step is remove nodes_out from the cluster and rebalance it. During rebalancing we get random keys from all nodes and verify that are different every time. 
Once the cluster has been rebalanced we again get random keys from all new nodes in the cluster, than we wait for the disk queues to drain, and then verify that there has been no data loss, sum(curr_items) match the curr_items_total.""" def rebalance_out_get_random_key(self): servs_out = [ self.servers[self.num_servers - i - 1] for i in range(self.nodes_out) ] # get random keys for new added nodes rest_cons = [ RestConnection(self.servers[i]) for i in xrange(self.num_servers) ] list_threads = [] for rest in rest_cons: t = Thread(target=rest.get_random_key, name="get_random_key", args=(self.default_bucket_name, )) list_threads.append(t) t.start() [t.join() for t in list_threads] rest_cons = [ RestConnection(self.servers[i]) for i in xrange(self.num_servers - self.nodes_out) ] rebalance = self.cluster.async_rebalance( self.servers[:self.num_servers], [], servs_out) self.sleep(2) result = [] num_iter = 0 # get random keys for each node during rebalancing while rest_cons[0]._rebalance_progress_status( ) == 'running' and num_iter < 100: list_threads = [] temp_result = [] self.log.info("getting random keys for all nodes in cluster....") for rest in rest_cons: t = Thread(target=rest.get_random_key, name="get_random_key", args=(self.default_bucket_name, )) list_threads.append(t) temp_result.append( rest.get_random_key(self.default_bucket_name)) t.start() [t.join() for t in list_threads] if tuple(temp_result) == tuple(result): self.log.exception("random keys are not changed") else: result = temp_result num_iter += 1 rebalance.result() self.verify_cluster_stats(self.servers[:self.num_servers - self.nodes_out]) self.verify_unacked_bytes_all_buckets() """Rebalances nodes out of a cluster while doing docs' ops. This test begins with all servers clustered together and loads a user defined number of items into the cluster. It then removes two nodes at a time from the cluster and rebalances. 
During the rebalance we update (all of the items in the cluster)/delete(num_items/(num_servers -1) in each iteration)/create(a half of initial items in each iteration). Once the cluster has been rebalanced the test waits for the disk queues to drain and then verifies that there has been no data loss, sum(curr_items) match the curr_items_total. Once all nodes have been rebalanced out of the cluster the test finishes. """ def incremental_rebalance_out_with_ops(self): batch_size = 1000 for i in reversed(range(1, self.num_servers, 2)): if i == 1: batch_size = 1 tasks = list() if self.doc_ops is not None: # define which doc_ops will be performed during rebalancing # only one type of ops can be passed if "update" in self.doc_ops: # 1/2th of data will be updated in each iteration tasks += self._async_load_all_buckets( self.master, self.gen_update, "update", 0, batch_size=batch_size) elif "create" in self.doc_ops: # 1/2th of initial data will be added in each iteration gen_create = BlobGenerator( 'mike', 'mike-', self.value_size, start=self.num_items * (self.num_servers - i) / 2.0, end=self.num_items * (self.num_servers - i + 1) / 2.0) tasks += self._async_load_all_buckets( self.master, gen_create, "create", 0, batch_size=batch_size) elif "delete" in self.doc_ops: # 1/(num_servers) of initial data will be removed after each iteration # at the end we should get empty base( or couple items) gen_delete = BlobGenerator( 'mike', 'mike-', self.value_size, start=int(self.num_items * (1 - i / (self.num_servers - 1.0))) + 1, end=int(self.num_items * (1 - (i - 1) / (self.num_servers - 1.0)))) tasks += self._async_load_all_buckets( self.master, gen_delete, "delete", 0, batch_size=batch_size) rebalance = self.cluster.async_rebalance( self.servers[:i], [], self.servers[i:i + 2], sleep_before_rebalance=self.sleep_before_rebalance) try: rebalance.result() for task in tasks: task.result() except Exception, ex: rebalance.cancel() for task in tasks: task.cancel() raise ex 
self.verify_cluster_stats(self.servers[:i]) # Validate seq_no snap_start/stop values after rebalance self.check_snap_start_corruption() self.verify_unacked_bytes_all_buckets()
    def setUp(self):
        """Collection test setup.

        Reads collection-specific test params, initializes the cluster from
        nodes_init servers, disables auto-failover, creates buckets and
        collections from the bucket_spec template, builds the SDK client
        pool and performs (and validates) the initial doc load.

        The statement order below is significant: cluster rebalance must
        precede bucket creation, and bucket/collection creation must
        precede SDK client creation and doc loading.
        """
        super(CollectionBase, self).setUp()
        self.log_setup_status("CollectionBase", "started")

        # Test-input parameters with their defaults
        self.key = 'test_collection'.rjust(self.key_size, '0')
        self.simulate_error = self.input.param("simulate_error", None)
        self.error_type = self.input.param("error_type", "memory")
        self.doc_ops = self.input.param("doc_ops", None)
        self.spec_name = self.input.param("bucket_spec",
                                          "single_bucket.default")
        self.over_ride_spec_params = \
            self.input.param("override_spec_params", "").split(";")
        self.action_phase = self.input.param("action_phase",
                                             "before_default_load")
        self.crud_batch_size = 100
        # With more than one replica, errors are simulated on two nodes
        self.num_nodes_affected = 1
        if self.num_replicas > 1:
            self.num_nodes_affected = 2

        # doc_ops arrives as a ';'-separated string; split into a list
        if self.doc_ops:
            self.doc_ops = self.doc_ops.split(';')

        self.durability_helper = DurabilityHelper(
            self.log, len(self.cluster.nodes_in_cluster),
            self.durability_level)

        # Initialize cluster using given nodes
        nodes_init = self.cluster.servers[1:self.nodes_init] \
            if self.nodes_init != 1 else []
        self.task.rebalance([self.cluster.master], nodes_init, [])
        self.cluster.nodes_in_cluster.extend(
            [self.cluster.master] + nodes_init)

        # Disable auto-failover to avoid failover of nodes
        status = RestConnection(self.cluster.master) \
            .update_autofailover_settings(False, 120, False)
        self.assertTrue(status, msg="Failure during disabling auto-failover")

        # Create bucket(s) and add rbac user
        self.bucket_util.add_rbac_user()
        buckets_spec = self.bucket_util.get_bucket_template_from_package(
            self.spec_name)
        doc_loading_spec = \
            self.bucket_util.get_crud_template_from_package("initial_load")

        # Process params to over_ride values if required
        self.over_ride_template_params(buckets_spec)
        self.over_ride_template_params(doc_loading_spec)

        # MB-38438, adding CollectionNotFoundException in retry exception
        doc_loading_spec[MetaCrudParams.RETRY_EXCEPTIONS].append(
            SDKException.CollectionNotFoundException)

        self.bucket_util.create_buckets_using_json_data(buckets_spec)
        self.bucket_util.wait_for_collection_creation_to_complete()

        # Init sdk_client_pool if not initialized before
        if self.sdk_client_pool is None:
            self.init_sdk_pool_object()

        # Create clients in SDK client pool
        self.log.info("Creating required SDK clients for client_pool")
        bucket_count = len(self.bucket_util.buckets)
        max_clients = self.task_manager.number_of_threads
        # Spread the available task threads across buckets (rounded up)
        clients_per_bucket = int(ceil(max_clients / bucket_count))
        for bucket in self.bucket_util.buckets:
            self.sdk_client_pool.create_clients(
                bucket,
                [self.cluster.master],
                clients_per_bucket,
                compression_settings=self.sdk_compression)

        # TODO: remove this once the bug is fixed
        self.sleep(60, "MB-38497")
        # Perform the initial doc load described by the CRUD template
        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.bucket_util.buckets,
                doc_loading_spec,
                mutation_num=0)
        if doc_loading_task.result is False:
            self.fail("Initial doc_loading failed")

        self.cluster_util.print_cluster_stats()

        # Verify initial doc load count
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.validate_docs_per_collections_all_buckets()

        self.bucket_util.print_bucket_stats()
        self.bucket_helper_obj = BucketHelper(self.cluster.master)
        self.log_setup_status("CollectionBase", "complete")