def replication_verification(master, bucket, replica, inserted_count, test):
    rest = RestConnection(master)
    nodes = rest.node_statuses()

    if len(nodes) / (1 + replica) >= 1:
        final_replication_state = RestHelper(rest).wait_for_replication(900)
        msg = "replication state after waiting for up to 15 minutes : {0}"
        test.log.info(msg.format(final_replication_state))
        # in windows, we need to set timeout_in_seconds to 15+ minutes
        test.assertTrue(RebalanceHelper.wait_till_total_numbers_match(master=master,
                                                                      bucket=bucket,
                                                                      timeout_in_seconds=1200),
                        msg="replication was completed but sum(curr_items) don't match the curr_items_total")

        start_time = time.time()
        stats = rest.get_bucket_stats()
        while time.time() < (start_time + 120) and stats["curr_items"] != inserted_count:
            test.log.info("curr_items : {0} versus {1}".format(stats["curr_items"], inserted_count))
            time.sleep(5)
            stats = rest.get_bucket_stats()
        RebalanceHelper.print_taps_from_all_nodes(rest, bucket)
        test.log.info("curr_items : {0} versus {1}".format(stats["curr_items"], inserted_count))
        stats = rest.get_bucket_stats()
        msg = "curr_items : {0} is not equal to actual # of keys inserted : {1}"
        test.assertEquals(stats["curr_items"], inserted_count,
                          msg=msg.format(stats["curr_items"], inserted_count))
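# Illustrative sketch only (not part of the original suite): one way the helper
# above could be wired into a test's verification phase. The attribute names on
# `test` (test.master, test.bucket, test.num_replicas, test.inserted_count) are
# assumptions about the calling fixture, not guaranteed by this module.
def _example_replication_check(test):
    replication_verification(test.master, test.bucket, test.num_replicas,
                             test.inserted_count, test)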
def run_test(self):
    ep_threshold = self.input.param("ep_threshold", "ep_mem_low_wat")
    active_resident_threshold = int(self.input.param("active_resident_threshold", 10))

    mc = MemcachedClientHelper.direct_client(self.servers[0], self.bucket_name)
    stats = mc.stats()
    threshold = int(self.input.param("threshold", stats[ep_threshold]))
    threshold_reached = False
    self.num_items = self.input.param("items", 10000)
    self._load_doc_data_all_buckets("create")

    # load items till reached threshold or mem-ratio is less than resident ratio threshold
    while not threshold_reached:
        mem_used = int(mc.stats()["mem_used"])
        if mem_used < threshold or int(mc.stats()["vb_active_perc_mem_resident"]) >= active_resident_threshold:
            self.log.info(
                "mem_used and vb_active_perc_mem_resident_ratio reached at %s/%s and %s " %
                (mem_used, threshold, mc.stats()["vb_active_perc_mem_resident"]))
            items = self.num_items
            self.num_items += self.input.param("items", 10000)
            self._load_doc_data_all_buckets("create", items)
        else:
            threshold_reached = True
            self.log.info("DGM state achieved!!!!")

    # wait for draining of data before restart and warm up
    for bucket in self.buckets:
        RebalanceHelper.wait_for_persistence(self.nodes_server[0], bucket)

    while 1:
        # read_data_task = self.cluster.async_verify_data(self.master, self.buckets[0], self.buckets[0].kvs[1])
        read_data_task = Thread(target=self._run_get)
        read_data_task.start()
        # 5 threads to run stats all and reset asynchronously
        start = time.time()
        while (time.time() - start) < 300:
            stats_all_thread = []
            stats_reset_thread = []
            for i in xrange(self.threads_to_run):
                stat_str = ""
                stats_all_thread.append(Thread(target=self._get_stats, args=[stat_str]))
                stats_all_thread[i].start()
                stat_str = "reset"
                stats_reset_thread.append(Thread(target=self._get_stats, args=[stat_str]))
                stats_reset_thread[i].start()
            for i in xrange(self.threads_to_run):
                stats_all_thread[i].join()
                stats_reset_thread[i].join()
            del stats_all_thread
            del stats_reset_thread
        # read_data_task.result()
        read_data_task.join()
def create_ddocs(self, is_dev_view):
    mapview = View(
        self.map_view_name,
        """function(doc) { emit(doc.integer, doc.string); }""",
        dev_view=is_dev_view,
    )
    self.cluster.create_view(self.master, "test", mapview)
    redview = View(
        self.red_view_name,
        """function(doc) { emit([doc.integer, doc.string], doc.integer); }""",
        """_count""",
        dev_view=is_dev_view,
    )
    self.cluster.create_view(self.master, "test", redview)
    redview_stats = View(
        self.red_view_stats_name,
        """function(doc) { emit(doc.string, doc.string); }""",
        """_stats""",
        dev_view=is_dev_view,
    )
    self.cluster.create_view(self.master, "test2", redview_stats)
    RebalanceHelper.wait_for_persistence(self.master, self.bucket, 0)
def _verify_stats_all_buckets(self, servers, timeout=60):
    stats_tasks = []
    for bucket in self.buckets:
        items = sum([len(kv_store) for kv_store in bucket.kvs.values()])
        stats_tasks.append(self.cluster.async_wait_for_stats(servers, bucket, '',
                           'curr_items', '==', items))
        stats_tasks.append(self.cluster.async_wait_for_stats(servers, bucket, '',
                           'vb_active_curr_items', '==', items))

        available_replicas = self.num_replicas
        if len(servers) == self.num_replicas:
            available_replicas = len(servers) - 1
        elif len(servers) <= self.num_replicas:
            available_replicas = len(servers) - 1

        stats_tasks.append(self.cluster.async_wait_for_stats(servers, bucket, '',
                           'vb_replica_curr_items', '==', items * available_replicas))
        stats_tasks.append(self.cluster.async_wait_for_stats(servers, bucket, '',
                           'curr_items_tot', '==', items * (available_replicas + 1)))
    try:
        for task in stats_tasks:
            task.result(timeout)
    except Exception as e:
        print e
        for task in stats_tasks:
            task.cancel()
        self.log.error("unable to get expected stats for any node! Print taps for all nodes:")
        rest = RestConnection(self.master)
        for bucket in self.buckets:
            RebalanceHelper.print_taps_from_all_nodes(rest, bucket)
        raise Exception("unable to get expected stats during {0} sec".format(timeout))
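# Illustrative sketch only: mirrors the replica arithmetic used above. When the
# cluster has fewer than num_replicas + 1 servers, only len(servers) - 1 replica
# copies can exist, so the expected totals shrink accordingly. The helper name
# and return shape are examples, not part of the original suite.
def _expected_item_totals(num_servers, num_replicas, items):
    available_replicas = num_replicas
    if num_servers <= num_replicas:
        available_replicas = num_servers - 1
    return {
        "curr_items": items,
        "vb_replica_curr_items": items * available_replicas,
        "curr_items_tot": items * (available_replicas + 1),
    }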
def test_views_failover(self):
    num_nodes = self.input.param('num-nodes', 1)
    ddocs = self.make_ddocs(self.num_ddoc, self.views_per_ddoc, 0)
    RebalanceHelper.wait_for_persistence(self.master, self.bucket_name)
    self.cluster.failover(self.servers, self.servers[1:num_nodes])
    self.cluster.rebalance(self.servers, [], self.servers[1:num_nodes])
    self.perform_ddoc_ops(ddocs)
def _verify_data(self, master, rest, inserted_keys):
    log = logger.Logger.get_logger()
    log.info("Verifying data")
    ready = RebalanceHelper.wait_for_stats_on_all(master, "default", "ep_queue_size", 0)
    self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
    ready = RebalanceHelper.wait_for_stats_on_all(master, "default", "ep_flusher_todo", 0)
    self.assertTrue(ready, "wait_for ep_flusher_todo == 0 failed")
    BucketOperationHelper.keys_exist_or_assert(keys=inserted_keys, server=master,
                                               bucket_name="default", test=self)
def verify_data(master, inserted_keys, bucket, test):
    test.log.info("Verifying data")
    ready = RebalanceHelper.wait_for_stats_on_all(master, bucket, 'ep_queue_size', 0)
    test.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
    ready = RebalanceHelper.wait_for_stats_on_all(master, bucket, 'ep_flusher_todo', 0)
    test.assertTrue(ready, "wait_for ep_flusher_todo == 0 failed")
    BucketOperationHelper.keys_exist_or_assert_in_parallel(keys=inserted_keys, server=master,
                                                           bucket_name=bucket, test=test,
                                                           concurrency=4)
def _failover_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    num_initial_servers = self.num_initial_servers
    initial_servers = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")

    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)

    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        optNodesIds[0] = content

    self.log.info("FAILOVER PHASE")
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)

    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.fail_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)

    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(new_swap_servers))
    SwapRebalanceBase.verification_phase(self, master)
def _test_delete_key_and_backup_and_restore_body(self): bucket = "default" BucketOperationHelper.create_bucket(serverInfo=self.master, name=bucket, test_case=self) ready = BucketOperationHelper.wait_for_memcached(self.master, bucket) self.assertTrue(ready, "wait_for_memcached failed") self.add_nodes_and_rebalance() client = MemcachedClientHelper.direct_client(self.master, "default") expiry = 2400 test_uuid = uuid.uuid4() keys = ["key_%s_%d" % (test_uuid, i) for i in range(500)] self.log.info("pushing keys with expiry set to {0}".format(expiry)) for key in keys: try: client.set(key, expiry, 0, "1") except mc_bin_client.MemcachedError as error: msg = "unable to push key : {0} to bucket : {1} error : {2}" self.log.error(msg.format(key, client.vbucketId, error.status)) self.fail(msg.format(key, client.vbucketId, error.status)) self.log.info("inserted {0} keys with expiry set to {1}".format(len(keys), expiry)) client.delete(keys[0]) ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") #let's create a unique folder in the remote location for server in self.servers: shell = RemoteMachineShellConnection(server) output, error = shell.execute_command(self.perm_command) shell.log_command_output(output, error) node = RestConnection(server).get_nodes_self() BackupHelper(server, self).backup(bucket, node, self.remote_tmp_folder) shell.disconnect() for server in self.servers: BackupHelper(server, self).restore(self.remote_tmp_folder) time.sleep(10) self.log.info('verifying that all those keys...') missing_keys = [] verify_keys = [] for key in keys: vBucketId = crc32.crc32_hash(key) & 1023 # or & 0x3FF client.vbucketId = vBucketId if key == keys[0]: missing_keys.append(key) else: verify_keys.append(key) self.assertTrue(BucketOperationHelper.keys_dont_exist(self.master, missing_keys, self), "Keys are not empty") self.assertTrue(BucketOperationHelper.verify_data(self.master, verify_keys, False, False, 11210, self), "Missing keys")
def create_ddocs(self):
    mapview = View(self.map_view_name,
                   '''function(doc) { emit(doc.integer, doc.string); }''',
                   dev_view=self.is_dev_view)
    self.cluster.create_view(self.master, 'test', mapview)
    redview = View(self.red_view_name,
                   '''function(doc) { emit([doc.integer, doc.string], doc.integer); }''',
                   '''_count''',
                   dev_view=self.is_dev_view)
    self.cluster.create_view(self.master, 'test', redview)
    RebalanceHelper.wait_for_persistence(self.master, self.bucket, 0)
def test_parallel_DB_views_compaction(self):
    rest = RestConnection(self.master)
    self.set_auto_compaction(rest, parallelDBAndVC="true",
                             viewFragmntThresholdPercentage=self.fragmentation_value,
                             dbFragmentThresholdPercentage=self.fragmentation_value)
    self.make_ddocs(self.ddocs_num, self.view_per_ddoc)
    self.create_ddocs()
    self._load_all_buckets(self.master, self.gen_load, "create", 0)
    RebalanceHelper.wait_for_persistence(self.master, self.default_bucket_name)
    self._compaction_thread()
    if self.thread_crashed.is_set():
        self.fail("Error occurred during run")
def _monitor_drain_queue(self):
    # start whenever drain_queue is > 0
    rest = RestConnection(self.master)
    start = time.time()
    stats = rest.get_bucket_stats(self.bucket)
    self.log.info("current ep_queue_size: {0}".format(stats["ep_queue_size"]))
    verified = RebalanceHelper.wait_for_stats(self.master, self.bucket, 'ep_queue_size', 0,
                                              timeout_in_seconds=300, verbose=False) \
               and RebalanceHelper.wait_for_stats(self.master, self.bucket, 'ep_flusher_todo', 0,
                                                  timeout_in_seconds=300, verbose=False)
    self.drained = verified
    self.drained_in_seconds = time.time() - start
def test_parallel_enable_DB_compaction(self):
    rest = RestConnection(self.master)
    self.set_auto_compaction(rest, parallelDBAndVC="true",
                             dbFragmentThresholdPercentage=self.fragmentation_value)
    self.make_ddocs(self.ddocs_num, self.view_per_ddoc)
    self.create_ddocs()
    self._load_all_buckets(self.master, self.gen_load, "create", 0)
    RebalanceHelper.wait_for_persistence(self.master, self.default_bucket_name)
    self._compaction_thread()
    if self.thread_crashed.is_set():
        self.log.info("View Compaction is not started as expected")
def test_observe_with_warmup(self):
    self._load_doc_data_all_buckets('create', 0, self.num_items)
    # Persist all the loaded data items
    self.log.info("Nodes in cluster: %s" % self.servers[:self.nodes_init])
    for bucket in self.buckets:
        RebalanceHelper.wait_for_persistence(self.master, bucket)
        self._stats_befor_warmup(bucket.name)
        self._restart_memcache(bucket.name)
        # for bucket in self.buckets:
        ClusterOperationHelper._wait_warmup_completed(self, self.servers[:self.nodes_init], bucket.name)
    self._run_observe(self)
def _test_backup_and_restore_bucket_overwriting_body(self, overwrite_flag=True): bucket = "default" BucketOperationHelper.create_bucket(serverInfo=self.master, test_case=self) BucketOperationHelper.wait_for_memcached(self.master, bucket) self.add_nodes_and_rebalance() client = MemcachedClientHelper.direct_client(self.master, "default") expiry = 2400 test_uuid = uuid.uuid4() keys = ["key_%s_%d" % (test_uuid, i) for i in range(500)] self.log.info("pushing keys with expiry set to {0}".format(expiry)) for key in keys: try: client.set(key, expiry, 0, "1") except mc_bin_client.MemcachedError as error: msg = "unable to push key : {0} to bucket : {1} error : {2}" self.log.error(msg.format(key, client.vbucketId, error.status)) self.fail(msg.format(key, client.vbucketId, error.status)) self.log.info("inserted {0} keys with expiry set to {1}".format(len(keys), expiry)) ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") for server in self.servers: shell = RemoteMachineShellConnection(server) output, error = shell.execute_command(self.perm_command) shell.log_command_output(output, error) node = RestConnection(server).get_nodes_self() BackupHelper(server, self).backup(bucket, node, self.remote_tmp_folder) shell.disconnect() for key in keys: try: client.replace(key, expiry, 0, "2") except mc_bin_client.MemcachedError as error: msg = "unable to replace key : {0} in bucket : {1} error : {2}" self.log.error(msg.format(key, client.vbucketId, error.status)) self.fail(msg.format(key, client.vbucketId, error.status)) self.log.info("replaced {0} keys with expiry set to {1}".format(len(keys), expiry)) for server in self.servers: BackupHelper(server, self).restore(self.remote_tmp_folder, overwrite_flag) time.sleep(10) self.log.info('verifying that all those keys...') for key in keys: if overwrite_flag: self.assertEqual("2", client.get(key=key), key + " should has value = 2") else: self.assertNotEqual("2", client.get(key=key), key + " should not has value = 2") self.log.info("verified that those keys inserted with expiry set to {0} have expired".format(expiry))
def wait_until_warmed_up(self, master=None):
    if not master:
        master = self.input.servers[0]
    bucket = self.param("bucket", "default")
    fn = RebalanceHelper.wait_for_mc_stats_no_timeout
    for bucket in self.buckets:
        RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                              'ep_warmup_thread', 'complete', fn=fn)
def _test_cluster_topology_change_body(self): bucket = "default" BucketOperationHelper.create_bucket(serverInfo=self.master, test_case=self) ready = BucketOperationHelper.wait_for_memcached(self.master, bucket) self.assertTrue(ready, "wait_for_memcached failed") self.add_nodes_and_rebalance() rest = RestConnection(self.master) distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05} inserted_keys, rejected_keys = MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[self.master], ram_load_ratio=1, value_size_distribution=distribution, moxi=True, write_only=True, number_of_threads=2) self.log.info("Sleep after data load") ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") #let's create a unique folder in the remote location for server in self.servers: shell = RemoteMachineShellConnection(server) output, error = shell.execute_command(self.perm_command) shell.log_command_output(output, error) node = RestConnection(server).get_nodes_self() BackupHelper(server, self).backup(bucket, node, self.remote_tmp_folder) shell.disconnect() ClusterOperationHelper.cleanup_cluster(self.servers) BucketOperationHelper.delete_all_buckets_or_assert(self.servers, self) servers = [] for i in range(0, len(self.servers) - 1): servers.append(self.servers[i]) self.add_node_and_rebalance(servers[0], servers) BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self) BucketOperationHelper.create_bucket(serverInfo=self.master, test_case=self) ready = BucketOperationHelper.wait_for_memcached(self.master, bucket) self.assertTrue(ready, "wait_for_memcached failed") for server in self.servers: BackupHelper(server, self).restore(self.remote_tmp_folder) time.sleep(10) BucketOperationHelper.verify_data(self.master, inserted_keys, False, False, 11210, self)
def load_data(self, master, bucket, keys_count):
    log = logger.Logger.get_logger()
    inserted_keys_cnt = 0
    while inserted_keys_cnt < keys_count:
        keys_cnt, rejected_keys_cnt = MemcachedClientHelper.load_bucket(
            servers=[master],
            name=bucket,
            number_of_items=keys_count,
            number_of_threads=5,
            write_only=True
        )
        inserted_keys_cnt += keys_cnt
    log.info("wait until data is completely persisted on the disk")
    RebalanceHelper.wait_for_stats_on_all(master, bucket, "ep_queue_size", 0)
    RebalanceHelper.wait_for_stats_on_all(master, bucket, "ep_flusher_todo", 0)
    return inserted_keys_cnt
def load_data(master, bucket, keys_count=-1, load_ratio=-1):
    log = logger.Logger.get_logger()
    inserted_keys, rejected_keys = \
        MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[master],
                                                              name=bucket,
                                                              ram_load_ratio=load_ratio,
                                                              number_of_items=keys_count,
                                                              number_of_threads=2,
                                                              write_only=True)
    log.info("wait until data is completely persisted on the disk")
    RebalanceHelper.wait_for_stats_on_all(master, bucket, 'ep_queue_size', 0)
    RebalanceHelper.wait_for_stats_on_all(master, bucket, 'ep_flusher_todo', 0)
    return inserted_keys
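# Illustrative sketch only: the load/verify helpers in this collection are
# typically chained as below. `keys_count=10000` is an arbitrary example value,
# and it is assumed that load_data and verify_data are available together in
# the importing test module.
def _example_load_then_verify(master, bucket, test):
    inserted_keys = load_data(master, bucket, keys_count=10000)
    verify_data(master, inserted_keys, bucket, test)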
def test_vbucket_uuid(self):
    """ Test to show usage of vbucket information collection via api
        and then comparison and running the logic for analysis.
        This is done for cluster and node level as well """
    self.gen_create = BlobGenerator('loadOne', 'loadOne_', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, self.gen_create, "create", 0,
                           batch_size=10000, pause_secs=10, timeout_secs=60)
    self._wait_for_stats_all_buckets(self.servers)
    RebalanceHelper.wait_for_replication(self.servers, self.cluster)
    vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets, perNode=True)
    logic, output = self.compare_per_node_maps(vbucket_stats)
    self.assertTrue(logic, output)
def wait_until_repl(self):
    print "[perf.repl] waiting for replication: %s" \
        % time.strftime(PerfDefaults.strftime)

    master = self.input.servers[0]
    bucket = self.param("bucket", "default")

    RebalanceHelper.wait_for_stats_on_all(master, bucket,
        'vb_replica_queue_size', 0,
        fn=RebalanceHelper.wait_for_stats_no_timeout)

    RebalanceHelper.wait_for_stats_on_all(master, bucket,
        'ep_tap_replica_queue_itemondisk', 0,
        fn=RebalanceHelper.wait_for_stats_no_timeout)

    RebalanceHelper.wait_for_stats_on_all(master, bucket,
        'ep_tap_rebalance_queue_backfillremaining', 0,
        fn=RebalanceHelper.wait_for_stats_no_timeout)

    RebalanceHelper.wait_for_stats_on_all(master, bucket,
        'ep_tap_replica_qlen', 0,
        fn=RebalanceHelper.wait_for_stats_no_timeout)

    print "[perf.repl] replication is done: %s" \
        % time.strftime(PerfDefaults.strftime)
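# Illustrative sketch only: the four wait_for_stats_on_all calls above can be
# expressed once as a loop over the replication queue stats being drained.
# Same calls and arguments as the original; the helper name is an example.
def _example_wait_for_drained_stats(master, bucket):
    for stat in ('vb_replica_queue_size',
                 'ep_tap_replica_queue_itemondisk',
                 'ep_tap_rebalance_queue_backfillremaining',
                 'ep_tap_replica_qlen'):
        RebalanceHelper.wait_for_stats_on_all(master, bucket, stat, 0,
                                              fn=RebalanceHelper.wait_for_stats_no_timeout)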
def load_data(self, master, bucket, keys_count):
    log = logger.Logger.get_logger()
    # gen_create = BlobGenerator("loadONE", "loadONE-", 256, start=0, end=keys_count)
    # BaseTestCase._load_all_buckets(master, gen_create, "create", 0)
    inserted_keys_cnt = 0
    while inserted_keys_cnt < keys_count:
        keys_cnt, rejected_keys_cnt = MemcachedClientHelper.load_bucket(
            servers=[master],
            name=bucket,
            number_of_items=keys_count,
            number_of_threads=5,
            write_only=True
        )
        inserted_keys_cnt += keys_cnt
    log.info("wait until data is completely persisted on the disk")
    RebalanceHelper.wait_for_stats_on_all(master, bucket, "ep_queue_size", 0)
    RebalanceHelper.wait_for_stats_on_all(master, bucket, "ep_flusher_todo", 0)
    return inserted_keys_cnt
def _test_view_on_multiple_docs(self, num_docs, params={"stale":"update_after"}, delay=10): self.log.info("description : create a view on {0} documents".format(num_docs)) master = self.servers[0] rest = RestConnection(master) bucket = "default" view_name = "dev_test_view_on_{1}_docs-{0}".format(str(uuid.uuid4())[:7], self.num_docs) map_fn = "function (doc) {if(doc.name.indexOf(\"" + view_name + "\") != -1) { emit(doc.name, doc);}}" rest.create_view(view_name, bucket, [View(view_name, map_fn, dev_view=False)]) self.created_views[view_name] = bucket rest = RestConnection(self.servers[0]) smart = VBucketAwareMemcached(rest, bucket) doc_names = [] prefix = str(uuid.uuid4())[:7] total_time = 0 self.log.info("inserting {0} json objects".format(num_docs)) for i in range(0, num_docs): key = doc_name = "{0}-{1}-{2}".format(view_name, prefix, i) doc_names.append(doc_name) value = {"name": doc_name, "age": 1000} smart.set(key, 0, 0, json.dumps(value)) self.log.info("inserted {0} json documents".format(len(doc_names))) time.sleep(10) results = ViewBaseTests._get_view_results(self, rest, bucket, view_name, len(doc_names), extra_params=params) view_time = results['view_time'] keys = ViewBaseTests._get_keys(self, results) RebalanceHelper.wait_for_persistence(master, bucket, 0) total_time = view_time # Keep trying this for maximum 5 minutes start_time = time.time() # increase timeout to 600 seconds for windows testing while (len(keys) != len(doc_names)) and (time.time() - start_time < 900): msg = "view returned {0} items , expected to return {1} items" self.log.info(msg.format(len(keys), len(doc_names))) self.log.info("trying again in {0} seconds".format(delay)) time.sleep(delay) results = ViewBaseTests._get_view_results(self, rest, bucket, view_name, len(doc_names), extra_params=params) view_time = results['view_time'] total_time += view_time keys = ViewBaseTests._get_keys(self, results) self.log.info("View time: {0} secs".format(total_time)) # Only if the lengths are not equal, look for missing keys if len(keys) != len(doc_names): not_found = list(set(doc_names) - set(keys)) ViewBaseTests._print_keys_not_found(self, not_found, 10) self.fail("map function did not return docs for {0} keys".format(len(not_found)))
def _test_backup_add_restore_bucket_with_expiration_key(self, replica): bucket = "default" rest = RestConnection(self.master) info = rest.get_nodes_self() size = int(info.memoryQuota * 2.0 / 3.0) rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=info.moxi, replicaNumber=replica) BucketOperationHelper.wait_for_memcached(self.master, bucket) client = MemcachedClientHelper.direct_client(self.master, bucket) expiry = 60 test_uuid = uuid.uuid4() keys = ["key_%s_%d" % (test_uuid, i) for i in range(5000)] self.log.info("pushing keys with expiry set to {0}".format(expiry)) for key in keys: try: client.set(key, expiry, 0, key) except mc_bin_client.MemcachedError as error: msg = "unable to push key : {0} to bucket : {1} error : {2}" self.log.error(msg.format(key, client.vbucketId, error.status)) self.fail(msg.format(key, client.vbucketId, error.status)) client.close() self.log.info("inserted {0} keys with expiry set to {1}".format(len(keys), expiry)) ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") node = RestConnection(self.master).get_nodes_self() output, error = self.shell.execute_command(self.perm_command) self.shell.log_command_output(output, error) backupHelper = BackupHelper(self.master, self) backupHelper.backup(bucket, node, self.remote_tmp_folder) BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self) rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=info.moxi) BucketOperationHelper.wait_for_memcached(self.master, bucket) backupHelper.restore(self.remote_tmp_folder) time.sleep(60) client = MemcachedClientHelper.direct_client(self.master, bucket) self.log.info('verifying that all those keys have expired...') for key in keys: try: client.get(key=key) msg = "expiry was set to {0} but key: {1} did not expire after waiting for {2}+ seconds" self.fail(msg.format(expiry, key, expiry)) except mc_bin_client.MemcachedError as error: self.assertEquals(error.status, 1, msg="expected error code {0} but saw error code {1}".format(1, error.status)) client.close() self.log.info("verified that those keys inserted with expiry set to {0} have expired".format(expiry))
def _test_backup_and_restore_from_to_different_buckets(self): bucket_before_backup = "bucket_before_backup" bucket_after_backup = "bucket_after_backup" BucketOperationHelper.create_bucket(serverInfo=self.master, name=bucket_before_backup, port=11212, test_case=self) ready = BucketOperationHelper.wait_for_memcached(self.master, bucket_before_backup) self.assertTrue(ready, "wait_for_memcached failed") self.add_nodes_and_rebalance() distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05} inserted_keys, rejected_keys = MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[self.master], name=bucket_before_backup, ram_load_ratio=20, value_size_distribution=distribution, write_only=True, moxi=True, number_of_threads=2) self.log.info("Sleep after data load") ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket_before_backup, 'ep_queue_size', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket_before_backup, 'ep_flusher_todo', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") for server in self.servers: shell = RemoteMachineShellConnection(server) output, error = shell.execute_command(self.perm_command) shell.log_command_output(output, error) node = RestConnection(server).get_nodes_self() BackupHelper(server, self).backup(bucket_before_backup, node, self.remote_tmp_folder) shell.disconnect() BucketOperationHelper.delete_bucket_or_assert(self.master, bucket_before_backup, self) BucketOperationHelper.create_bucket(serverInfo=self.master, name=bucket_after_backup, port=11212, test_case=self) ready = BucketOperationHelper.wait_for_memcached(self.master, bucket_after_backup) self.assertTrue(ready, "wait_for_memcached failed") for server in self.servers: BackupHelper(server, self).restore(self.remote_tmp_folder, moxi_port=11212) time.sleep(10) ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket_after_backup, 'ep_queue_size', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket_after_backup, 'ep_flusher_todo', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") self.assertTrue(BucketOperationHelper.verify_data(self.master, inserted_keys, False, False, 11212, debug=False, bucket=bucket_after_backup), "Missing keys")
def test_failoverlogs_extraction_equals(self):
    """ Test to show usage of failover log collection via api
        and then comparison and running the logic for analysis.
        This is done for cluster and node level as well """
    self.gen_create = BlobGenerator('loadOne', 'loadOne_', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, self.gen_create, "create", 0,
                           batch_size=10000, pause_secs=10, timeout_secs=60)
    self._wait_for_stats_all_buckets(self.servers)
    RebalanceHelper.wait_for_replication(self.servers, self.cluster)
    failovers_stats = self.get_failovers_logs(self.servers, self.buckets, perNode=True)
    self.compare_failovers_logs(failovers_stats, self.servers, self.buckets, perNode=True)
    failovers_stats = self.get_failovers_logs(self.servers, self.buckets, perNode=False)
    self.compare_failovers_logs(failovers_stats, self.servers, self.buckets, perNode=False)
def verification_phase(test, master):
    # Stop loaders
    SwapRebalanceBase.stop_load(test.loaders)
    test.log.info("DONE DATA ACCESS PHASE")

    test.log.info("VERIFICATION PHASE")
    rest = RestConnection(master)
    servers_in_cluster = []
    nodes = rest.get_nodes()
    for server in test.servers:
        for node in nodes:
            if node.ip == server.ip:
                servers_in_cluster.append(server)
    RebalanceHelper.wait_for_replication(servers_in_cluster, test.cluster_helper)
    SwapRebalanceBase.items_verification(test, master)
def _common_test_body(self): master = self.servers[0] rest = RestConnection(master) bucket_data = RebalanceBaseTest.bucket_data_init(rest) self.log.info("INTIAL LOAD") RebalanceBaseTest.load_all_buckets_task(rest, self.task_manager, bucket_data, self.load_ratio, keys_count=self.keys_count) rebalance_out = False for server in self.servers[1:]: if rebalance_out: # Pick a node to rebalance out, other than master ejectedNodes = [RebalanceHelper.pick_node(master)] else: ejectedNodes = [] current_nodes = RebalanceHelper.getOtpNodeIds(master) self.log.info("current nodes : {0}".format(current_nodes)) self.log.info("adding node {0}, removing node {1} and rebalance afterwards".format(server.ip, [node.ip for node in ejectedNodes])) self.log.info("START PARALLEL LOAD") RebalanceBaseTest.tasks_for_buckets(rest, self.task_manager, bucket_data, DELETE_RATIO=self.delete_ratio, ACCESS_RATIO=self.access_ratio, EXPIRY_RATIO=self.expiry_ratio) self.log.info("INCREMENTAL REBALANCE IN/OUT") # rebalance in/out a server RebalanceTaskHelper.add_rebalance_task(self.task_manager, [master], [server], ejectedNodes, do_stop=self.do_stop) # wait for loading tasks to finish RebalanceBaseTest.finish_all_bucket_tasks(rest, bucket_data) # Make sure we have at least 3 nodes, for replica=2 if len(current_nodes) > 2: rebalance_out = True if self.do_verify: self.log.info("VERIFICATION") RebalanceBaseTest.do_kv_and_replica_verification(master, self.task_manager, bucket_data, self.replica, self) else: self.log.info("NO VERIFICATION")
def populate_alternated(self, num_vbuckets, docs):
    """Every vBucket gets a doc first

    Populating the vBuckets alternated means that every vBucket gets a
    document first, before it receives the second one and so on. For example
    if we have 6 documents named doc-1 ... doc-6 and 3 vBuckets the result
    will be:

        vbucket-1: doc-1, doc-4
        vbucket-2: doc-2, doc-5
        vbucket-3: doc-3, doc-6
    """
    for i, doc in enumerate(docs):
        self.insert_into_vbucket(i % num_vbuckets, doc)
    RebalanceHelper.wait_for_persistence(self.master, self.bucket, 0)
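# Illustrative sketch only: reproduces the distribution described in the
# docstring above (doc i lands in vbucket i % num_vbuckets) without touching a
# cluster. The helper name and doc naming are examples, not part of the suite.
def _example_alternated_layout(num_vbuckets=3, num_docs=6):
    layout = {}
    for i in range(num_docs):
        layout.setdefault(i % num_vbuckets, []).append("doc-%d" % (i + 1))
    # {0: ['doc-1', 'doc-4'], 1: ['doc-2', 'doc-5'], 2: ['doc-3', 'doc-6']}
    return layout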
def rebalance_in_out_at_once_persistence_stopped(self): num_nodes_with_stopped_persistence = self.input.param("num_nodes_with_stopped_persistence", 1) servs_init = self.servers[:self.nodes_init] servs_in = [self.servers[i + self.nodes_init] for i in range(self.nodes_in)] servs_out = [self.servers[self.nodes_init - i - 1] for i in range(self.nodes_out)] rest = RestConnection(self.master) self._wait_for_stats_all_buckets(servs_init) for server in servs_init[:min(num_nodes_with_stopped_persistence, self.nodes_init)]: shell = RemoteMachineShellConnection(server) for bucket in self.buckets: shell.execute_cbepctl(bucket, "stop", "", "", "") self.sleep(5) self.num_items_without_persistence = self.input.param("num_items_without_persistence", 100000) gen_extra = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items / 2\ , end=self.num_items / 2 + self.num_items_without_persistence) self.log.info("current nodes : {0}".format([node.id for node in rest.node_statuses()])) self.log.info("adding nodes {0} to cluster".format(servs_in)) self.log.info("removing nodes {0} from cluster".format(servs_out)) tasks = self._async_load_all_buckets(self.master, gen_extra, "create", 0, batch_size=1000) result_nodes = set(servs_init + servs_in) - set(servs_out) # wait timeout in 60 min because MB-7386 rebalance stuck self.cluster.rebalance(servs_init[:self.nodes_init], servs_in, servs_out, timeout=self.wait_timeout * 60) for task in tasks: task.result() self._wait_for_stats_all_buckets(servs_init[:self.nodes_init - self.nodes_out], \ ep_queue_size=self.num_items_without_persistence * 0.9, ep_queue_size_cond='>') self._wait_for_stats_all_buckets(servs_in) self._verify_all_buckets(self.master, timeout=None) self._verify_stats_all_buckets(result_nodes) #verify that curr_items_tot corresponds to sum of curr_items from all nodes verified = True for bucket in self.buckets: verified &= RebalanceHelper.wait_till_total_numbers_match(self.master, bucket) self.assertTrue(verified, "Lost items!!! Replication was completed but sum(curr_items) don't match the curr_items_total")
def test_rebalance_in_out_with_failover_addback_recovery(self): """ Rebalances nodes out and in with failover and full/delta recovery add back of a node Use different nodes_in and nodes_out params to have uneven add and deletion. Use 'zone' param to have nodes divided into server groups by having zone > 1. This test begins by loading a given number of items into the cluster. It then removes one node, rebalances that node out the cluster, and then rebalances it back in. During the rebalancing we update all of the items in the cluster. Once the node has been removed and added back we wait for the disk queues to drain, and then verify that there has been no data loss, sum(curr_items) match the curr_items_total. We then remove and add back two nodes at a time and so on until we have reached the point where we are adding back and removing at least half of the nodes. """ recovery_type = self.input.param("recoveryType", "full") gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items) self._load_all_buckets(self.master, gen, "create", 0) tasks = self._async_load_all_buckets(self.master, gen, "update", 0) servs_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in] servs_out = self.servers[self.nodes_init - self.nodes_out:self.nodes_init] for task in tasks: task.result(self.wait_timeout * 20) self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120) self._wait_for_stats_all_buckets(self.servers[:self.nodes_init]) self.sleep(20) prev_vbucket_stats = self.get_vbucket_seqnos( self.servers[:self.nodes_init], self.buckets) prev_failover_stats = self.get_failovers_logs( self.servers[:self.nodes_init], self.buckets) disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all( self.servers[:self.nodes_init], self.buckets, path=None) self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats) self.rest = RestConnection(self.master) self.nodes = self.get_nodes(self.master) result_nodes = list( set(self.servers[:self.nodes_init] + servs_in) - set(servs_out)) for node in servs_in: self.rest.add_node(self.master.rest_username, self.master.rest_password, node.ip, node.port) chosen = RebalanceHelper.pick_nodes(self.master, howmany=1) # Mark Node for failover self.sleep(30) success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False) # Mark Node for full recovery if success_failed_over: self.rest.set_recovery_type(otpNode=chosen[0].id, recoveryType=recovery_type) self.sleep(30) try: self.shuffle_nodes_between_zones_and_rebalance(servs_out) except Exception as e: if "deltaRecoveryNotPossible" not in e.__str__(): self.fail( "Rebalance did not fail. Rebalance has to fail since no delta recovery should be possible" " while adding nodes too")
def test_rebalance_in_out_at_once_persistence_stopped(self): """ PERFORMANCE:Rebalance in/out at once with stopped persistence. This test begins by loading a given number of items into the cluster with self.nodes_init nodes in it. Then we stop persistence on some nodes. Test starts to update some data and load new data in the cluster. At that time we add servs_in nodes and remove servs_out nodes and start rebalance. After rebalance and data ops are completed we start verification phase: wait for the disk queues to drain, verify the number of items that were/or not persisted with expected values, verify that there has been no data loss, sum(curr_items) match the curr_items_total.Once All checks passed, test is finished. Available parameters by default are: nodes_init=1, nodes_in=1, nodes_out=1, num_nodes_with_stopped_persistence=1 num_items_without_persistence=100000 """ num_nodes_with_stopped_persistence = self.input.param( "num_nodes_with_stopped_persistence", 1) servs_init = self.servers[:self.nodes_init] servs_in = [ self.servers[i + self.nodes_init] for i in range(self.nodes_in) ] servs_out = [ self.servers[self.nodes_init - i - 1] for i in range(self.nodes_out) ] rest = RestConnection(self.master) self._wait_for_stats_all_buckets(servs_init) for server in servs_init[:min(num_nodes_with_stopped_persistence, self. nodes_init)]: shell = RemoteMachineShellConnection(server) for bucket in self.buckets: shell.execute_cbepctl(bucket, "stop", "", "", "") self.sleep(5) self.num_items_without_persistence = self.input.param( "num_items_without_persistence", 100000) gen_extra = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items // 2, end=self.num_items // 2 + self.num_items_without_persistence) self.log.info("current nodes : {0}".format( [node.id for node in rest.node_statuses()])) self.log.info("adding nodes {0} to cluster".format(servs_in)) self.log.info("removing nodes {0} from cluster".format(servs_out)) tasks = self._async_load_all_buckets(self.master, gen_extra, "create", 0, batch_size=1000) result_nodes = set(servs_init + servs_in) - set(servs_out) # wait timeout in 60 min because MB-7386 rebalance stuck self.cluster.rebalance( servs_init[:self.nodes_init], servs_in, servs_out, timeout=self.wait_timeout * 60, sleep_before_rebalance=self.sleep_before_rebalance) for task in tasks: task.result() # Validate seq_no snap_start/stop values after rebalance self.check_snap_start_corruption() self._wait_for_stats_all_buckets( servs_init[:self.nodes_init - self.nodes_out], ep_queue_size=self.num_items_without_persistence * 0.9, ep_queue_size_cond='>') self._wait_for_stats_all_buckets(servs_in) self._verify_all_buckets(self.master, timeout=None) self._verify_stats_all_buckets(result_nodes) # verify that curr_items_tot corresponds to sum of curr_items from all nodes verified = True for bucket in self.buckets: verified &= RebalanceHelper.wait_till_total_numbers_match( self.master, bucket) self.assertTrue( verified, "Lost items!!! Replication was completed but sum(curr_items) don't match the curr_items_total" ) self.verify_unacked_bytes_all_buckets()
def _do_warmup(self, howmany, timeout_in_seconds=1800): # max_time is in micro seconds self._insert_data(howmany) if int(howmany) < 50: self.log.info("sleep 10 seconds for small number items insert correctly into bucket") time.sleep(10) curr_items = int(self.onenodemc.stats()["curr_items"]) uptime = int(self.onenodemc.stats()["uptime"]) RebalanceHelper.wait_for_persistence(self.master, "default") self.log.info("sleeping for 10 seconds") time.sleep(10) rest = RestConnection(self.master) command = "try ns_server_testrunner_api:kill_memcached(20000) catch _:_ -> [erlang:exit(element(2, X), kill) || X <- supervisor:which_children(ns_port_sup)] end." memcached_restarted, content = rest.diag_eval(command) self.assertTrue(memcached_restarted, "unable to restart memcached/moxi process through diag/eval") #wait until memcached starts start = time.time() memcached_restarted = False while time.time() - start < 60: try: self.onenodemc = MemcachedClientHelper.direct_client(self.master, "default") value = int(self.onenodemc.stats()["uptime"]) if value < uptime: self.log.info("memcached restarted...") memcached_restarted = True break self.onenodemc.close() # The uptime stat have a 1 sec resolution so there is no point of # retrying more often time.sleep(1) except Exception: time.sleep(1) self.assertTrue(memcached_restarted, "memcached restarted and uptime is now reset") # Warmup till curr_items match self.onenodemc = MemcachedClientHelper.direct_client(self.master, "default") stats = self.onenodemc.stats() present_count = int(stats["curr_items"]) ep_warmup_thread = stats["ep_warmup_thread"] self.log.info("ep curr_items : {0}, inserted_items {1} directly after kill_memcached ".format(present_count, curr_items)) self.log.info("ep_warmup_thread directly after kill_memcached: {0}".format(ep_warmup_thread)) start = time.time() while ep_warmup_thread != "complete": if (time.time() - start) <= timeout_in_seconds: stats = self.onenodemc.stats() present_count = int(stats["curr_items"]) ep_warmup_thread = stats["ep_warmup_thread"] self.log.warn("curr_items {0}, ep_warmup_thread {1}".format(present_count, ep_warmup_thread)) time.sleep(1) else: self.fail("Timed out waiting for warmup") stats = self.onenodemc.stats() present_count = int(stats["curr_items"]) if present_count < curr_items: self.log.error("Warmup failed. Got {0} and expected {1} items".format(present_count, curr_items)) self.fail("Warmup failed. Incomplete number of messages after killing memcached") if "ep_warmup_time" not in stats: self.log.error("'ep_warmup_time' was not found in stats:{0}".format(stats)) warmup_time = int(stats["ep_warmup_time"]) self.log.info("ep_warmup_time is {0}".format(warmup_time))
def test_rebalance_inout_with_durability_failure(self):
    """
    Perform irregular number of in_out nodes
    1. Swap-out 'self.nodes_out' nodes
    2. Add nodes using 'self.nodes_in' such that
       replica_number > nodes_in_cluster
    3. Perform swap-rebalance
    4. Make sure durability is not broken due to swap-rebalance
    5. Add back a node and do CRUD on the bucket
    6. Verify durability works after node addition

    Note: This is a Negative case. i.e: Durability will be broken
    """
    master = self.cluster.master
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    def_bucket = self.bucket_util.buckets[0]

    # TODO: Enable verification
    """
    vbucket_info_dict = dict()

    # Cb stat object for verification purpose
    master_shell_conn = RemoteMachineShellConnection(master)
    master_node_cb_stat = Cbstats(master_shell_conn)

    # Update each vbucket's seq_no for latest value for verification
    for vb_num in range(0, self.vbuckets):
        vbucket_info_dict[vb_num] = master_node_cb_stat.vbucket_seqno(
            def_bucket.name, vb_num, "abs_high_seqno")
    """

    # Rest connection to add/rebalance/monitor nodes
    rest = RestConnection(master)

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.nodes_out)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        if self.nodes_out is len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.nodes_in]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if self.do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%"
                          .format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    self.sleep(20)
                    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                   ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    self.sleep(1)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(optNodesIds))

    # TODO: There will be failure in doc_count verification due to
    # swap_rebalance. Need to update verification steps accordingly to
    # satisfy this
    self.verification_phase()

    # Add back first ejected node back into the cluster
    self.task.rebalance(self.cluster.nodes_in_cluster, [toBeEjectedNodes[0]], [])

    # Load doc into all vbuckets to verify durability
    gen_create = doc_generator('test_', 0, self.num_items)
    task = self.task.async_load_gen_docs_atomicity(
        self.cluster, def_bucket, gen_create, self.op_type, exp=0,
        batch_size=10, process_concurrency=8,
        replicate_to=self.replicate_to, persist_to=self.persist_to,
        timeout_secs=self.sdk_timeout, retries=self.sdk_retries,
        transaction_timeout=self.transaction_timeout,
        commit=self.transaction_commit)
    self.task_manager.get_task_result(task)
def end_rebalance(master):
    RebalanceHelper.end_rebalance(master)
def run_test(self): ep_threshold = self.input.param("ep_threshold", "ep_mem_low_wat") active_resident_threshold = int( self.input.param("active_resident_threshold", 10)) mc = MemcachedClientHelper.direct_client(self.servers[0], self.bucket_name) stats = mc.stats() threshold = int(self.input.param('threshold', stats[ep_threshold])) threshold_reached = False self.num_items = self.input.param("items", 10000) self._load_doc_data_all_buckets('create') # load items till reached threshold or mem-ratio is less than resident ratio threshold while not threshold_reached: mem_used = int(mc.stats()["mem_used"]) if mem_used < threshold or int(mc.stats( )["vb_active_perc_mem_resident"]) >= active_resident_threshold: self.log.info( "mem_used and vb_active_perc_mem_resident_ratio reached at %s/%s and %s " % (mem_used, threshold, mc.stats()["vb_active_perc_mem_resident"])) items = self.num_items self.num_items += self.input.param("items", 10000) self._load_doc_data_all_buckets('create', items) else: threshold_reached = True self.log.info("DGM state achieved!!!!") # wait for draining of data before restart and warm up for bucket in self.buckets: RebalanceHelper.wait_for_persistence(self.nodes_server[0], bucket, bucket_type=self.bucket_type) while True: # read_data_task = self.cluster.async_verify_data(self.master, self.buckets[0], self.buckets[0].kvs[1]) read_data_task = Thread(target=self._run_get) read_data_task.start() #5 threads to run stats all and reset asynchronously start = time.time() while (time.time() - start) < 300: stats_all_thread = [] stats_reset_thread = [] for i in range(self.threads_to_run): stat_str = '' stats_all_thread.append( Thread(target=self._get_stats, args=[stat_str])) stats_all_thread[i].start() stat_str = 'reset' stats_reset_thread.append( Thread(target=self._get_stats, args=[stat_str])) stats_reset_thread[i].start() for i in range(self.threads_to_run): stats_all_thread[i].join() stats_reset_thread[i].join() del stats_all_thread del stats_reset_thread # read_data_task.result() read_data_task.join()
def _failover_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    num_initial_servers = self.num_initial_servers
    initial_servers = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    self.create_buckets()

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, _ = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")

    self.log.info("DATA LOAD PHASE")
    self.loaders = self.start_load_phase()

    # Wait till load phase is over
    self.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.fail_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        optNodesIds[0] = content

    self.log.info("FAILOVER PHASE")
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)

    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.fail_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    self.log.info("DATA ACCESS PHASE")
    self.loaders = self.start_access_phase()

    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(new_swap_servers))
    self.verification_phase()
def rebalance_in(servers, how_many, monitor=True):
    return RebalanceHelper.rebalance_in(servers, how_many, monitor)
def rebalance_out_with_failover(self):
    self.transaction_timeout = self.input.param("transaction_timeout", 100)
    self.transaction_commit = self.input.param("transaction_commit", True)
    task = self.task.async_load_gen_docs_atomicity(
        self.cluster, self.bucket_util.buckets, self.gen_load, "create", 0,
        batch_size=20, process_concurrency=8,
        replicate_to=self.replicate_to, persist_to=self.persist_to,
        timeout_secs=self.sdk_timeout, retries=self.sdk_retries,
        transaction_timeout=self.transaction_timeout,
        commit=self.transaction_commit, durability=self.durability_level)
    self.task.jython_task_manager.get_task_result(task)
    self.sleep(60, "Task completed")

    fail_over = self.input.param("fail_over", False)
    self.rest = RestConnection(self.cluster.master)
    std = self.std_vbucket_dist or 1.0

    gen_delete = self.get_doc_generator(self.num_items / 2, self.num_items)
    gen_create = self.get_doc_generator(self.num_items + 1, self.num_items * 3 / 2)

    # define which doc's ops will be performed during rebalancing
    # allows multiple of them but one by one
    tasks = []
    if (self.doc_ops is not None):
        if ("update" in self.doc_ops):
            tasks.append(self.task.async_load_gen_docs_atomicity(
                self.cluster, self.bucket_util.buckets, self.gen_update,
                "rebalance_update", 0, batch_size=20, process_concurrency=8,
                replicate_to=self.replicate_to, persist_to=self.persist_to,
                timeout_secs=self.sdk_timeout, retries=self.sdk_retries,
                transaction_timeout=self.transaction_timeout,
                commit=self.transaction_commit,
                durability=self.durability_level))
        if ("create" in self.doc_ops):
            tasks.append(self.task.async_load_gen_docs_atomicity(
                self.cluster, self.bucket_util.buckets, gen_create,
                "create", 0, batch_size=20, process_concurrency=8,
                replicate_to=self.replicate_to, persist_to=self.persist_to,
                timeout_secs=self.sdk_timeout, retries=self.sdk_retries,
                transaction_timeout=self.transaction_timeout,
                commit=self.transaction_commit,
                durability=self.durability_level))
            self.num_items = self.num_items + 1 + (self.num_items * 3 / 2)
        if ("delete" in self.doc_ops):
            tasks.append(self.task.async_load_gen_docs_atomicity(
                self.cluster, self.bucket_util.buckets, gen_delete,
                "rebalance_delete", 0, batch_size=20, process_concurrency=8,
                replicate_to=self.replicate_to, persist_to=self.persist_to,
                timeout_secs=self.sdk_timeout, retries=self.sdk_retries,
                transaction_timeout=self.transaction_timeout,
                commit=self.transaction_commit,
                durability=self.durability_level))
            self.num_items = self.num_items - (self.num_items / 2)
    for task in tasks:
        self.task.jython_task_manager.get_task_result(task)

    ejectedNode = self.cluster_util.find_node_info(
        self.cluster.master, self.cluster.servers[self.nodes_init - 1])
    self.sleep(100, "Sleep for 100 seconds")

    prev_failover_stats = self.bucket_util.get_failovers_logs(
        self.cluster.servers[:self.nodes_init], self.bucket_util.buckets)
    prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
        self.cluster.servers[:self.nodes_init], self.bucket_util.buckets)
    record_data_set = self.bucket_util.get_data_set_all(
        self.cluster.servers[:self.nodes_init], self.bucket_util.buckets)
    self.bucket_util.compare_vbucketseq_failoverlogs(
        prev_vbucket_stats, prev_failover_stats)

    self.rest = RestConnection(self.cluster.master)
    chosen = RebalanceHelper.pick_nodes(self.cluster.master, howmany=1)
    new_server_list = self.cluster_util.add_remove_servers(
        self.cluster.servers, self.cluster.servers[:self.nodes_init],
        [self.cluster.servers[self.nodes_init - 1], chosen[0]], [])

    # Mark Node for failover
    success_failed_over = self.rest.fail_over(chosen[0].id, graceful=fail_over)

    self.nodes = self.rest.node_statuses()
    self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                        ejectedNodes=[chosen[0].id, ejectedNode.id])
    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                    msg="Rebalance failed")
    self.cluster.nodes_in_cluster = new_server_list

    self.sleep(60, "Starting data_analaysis_all")
    self.bucket_util.data_analysis_all(record_data_set, new_server_list,
                                       self.bucket_util.buckets)

    self.sleep(60, "Vb_Distribution_Analysis starts")
    nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
    self.bucket_util.vb_distribution_analysis(
        servers=nodes, buckets=self.bucket_util.buckets,
        num_replicas=self.num_replicas, std=1.0,
        total_vbuckets=self.vbuckets)
def test_rebalance_out(self): RebalanceBaseTest.common_setup(self._input, self, replica=1) log = logger.Logger().get_logger() master = self._servers[0] num_of_docs = TestInputSingleton.input.param("num_of_docs", 100000) replica = TestInputSingleton.input.param("replica", 100000) add_items_count = TestInputSingleton.input.param( "num_of_creates", 30000) size = TestInputSingleton.input.param("item_size", 256) params = { "sizes": [size], "count": num_of_docs, "seed": str(uuid.uuid4())[:7] } rest = RestConnection(master) buckets = rest.get_buckets() bucket_data = {} generators = {} for bucket in buckets: bucket_data[bucket.name] = {"kv_store": ClientKeyValueStore()} rebalanced_in, which_servers = RebalanceBaseTest.rebalance_in( self._servers, len(self.servers) - 1) self.assertTrue(rebalanced_in, msg="unable to add and rebalance more nodes") rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[]) self.assertTrue( rest.monitorRebalance(), msg="rebalance operation failed after adding nodes {0}".format( [node.id for node in rest.node_statuses()])) while len(rest.node_statuses()) > 1: #pick a node that is not the master node toBeEjectedNode = RebalanceHelper.pick_node(master) rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[toBeEjectedNode.id]) self.assertTrue( rest.monitorRebalance(), msg="rebalance operation failed after adding node {0}".format( toBeEjectedNode.id)) for bucket in buckets: kv_store = bucket_data[bucket.name]["kv_store"] add_items_seed = str(uuid.uuid4())[:7] self._add_items(add_items_seed, bucket, add_items_count, kv_store) errors = RebalanceDataGenerator.do_verification( kv_store, rest, bucket.name) if errors: log.error("verification returned {0} errors".format( len(errors))) load_set_ops = {"ops": "set", "bucket": bucket.name} load_set_ops.update(params) load_delete_ops = { "ops": "delete", "bucket": bucket.name, "sizes": [size], "count": add_items_count / 5, "seed": add_items_seed } thread = RebalanceDataGenerator.start_load( rest, bucket.name, RebalanceDataGenerator.create_loading_tasks(load_set_ops), kv_store) generators["set"] = {"thread": thread} #restart three times generators["set"]["thread"].start() thread = RebalanceDataGenerator.start_load( rest, bucket.name, RebalanceDataGenerator.create_loading_tasks( load_delete_ops), kv_store) generators["delete"] = {"thread": thread} generators["delete"]["thread"].start() self.log.info("current nodes : {0}".format( [node.id for node in rest.node_statuses()])) for bucket in buckets: kv_store = bucket_data[bucket.name]["kv_store"] errors = RebalanceDataGenerator.do_verification( kv_store, rest, bucket.name) if errors: log.error("verification returned {0} errors".format( len(errors))) generators["set"]["thread"].join() generators["delete"]["thread"].join() for bucket in buckets: kv_store = bucket_data[bucket.name]["kv_store"] bucket_data[bucket.name]["items_inserted_count"] = len( kv_store.valid_items()) RebalanceBaseTest.replication_verification( master, bucket_data, replica, self)
def common_test_body(self, failover_reason):
    """
    Main test body which contains the flow of the failover basic steps
    1. Start operations if programmed into the test case (before/after)
    2. Start view and index building operations
    3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
    4.1 Rebalance the cluster after failover of K nodes
    4.2 Run add-back operation with recoveryType = (full/delta) with rebalance
    5. Verify all expected operations completed by checking stats,
       replication, views and data correctness
    """
    # Pick the reference node for communication
    # We pick a node in the cluster which will NOT be failed over
    self.filter_list = []
    if self.failoverMaster:
        self.master = self.servers[1]
    self.log.info("Picking node {0} as reference node for test case"
                  .format(self.master.ip))
    self.print_test_params(failover_reason)
    self.rest = RestConnection(self.master)
    self.nodes = self.rest.node_statuses()
    # Set the data path for the cluster
    self.data_path = self.rest.get_data_path()

    # Check if the test case has to be run for 3.0.0
    versions = self.rest.get_nodes_versions()
    self.version_greater_than_2_5 = True
    for version in versions:
        if "3" > version:
            self.version_greater_than_2_5 = False

    # Do not run this test if graceful category is being used
    if not self.version_greater_than_2_5 \
            and (self.graceful or (self.recoveryType != None)):
        self.log.error("Graceful failover can't be applied to nodes with "
                       "version less than 3.*")
        self.log.error("Please check configuration parameters: SKIPPING TEST.")
        return

    # Find nodes that will undergo failover
    if self.failoverMaster:
        self.chosen = RebalanceHelper.pick_nodes(
            self.master, howmany=1, target_node=self.servers[0])
    else:
        self.chosen = RebalanceHelper.pick_nodes(
            self.master, howmany=self.num_failed_nodes)

    # Perform operations - Create/Update/Delete
    # self.withMutationOps = True => Run operations in parallel to failover
    # self.withMutationOps = False => Run operations before failover
    self.load_initial_data()
    if not self.withMutationOps:
        self.run_mutation_operations()

    # Perform view creation tasks and wait for completion before failover
    if self.withViewsOps:
        self.run_view_creation_operations(self.servers)
        if not self.createIndexesDuringFailover:
            self.query_and_monitor_view_tasks(self.servers)

    # Validate seq_no snap_start/stop values
    self.check_snap_start_corruption()

    # Take a snapshot of the data set used for validation
    record_static_data_set = dict()
    prev_vbucket_stats = dict()
    prev_failover_stats = dict()
    if not self.withMutationOps:
        record_static_data_set = self.get_data_set_all(self.servers,
                                                       self.buckets,
                                                       path=None)

    # Capture vbucket and failover stats if test version >= 2.5.*
    if self.version_greater_than_2_5 and self.upr_check:
        prev_vbucket_stats = self.get_vbucket_seqnos(self.servers,
                                                     self.buckets)
        prev_failover_stats = self.get_failovers_logs(self.servers,
                                                      self.buckets)

    # Perform operations related to failover
    if self.withMutationOps or self.withViewsOps or self.compact:
        self.run_failover_operations_with_ops(self.chosen, failover_reason)
    else:
        self.run_failover_operations(self.chosen, failover_reason)

    # TODO: Enable this even when 'flusher_batch_split_trigger' is not set
    if self.flusher_batch_split_trigger and \
            self.num_replicas >= self.num_failed_nodes:
        tasks = self._async_load_all_buckets(self.master, self.gen_update,
                                             "update", 0)
        for task in tasks:
            task.result()

    if self.graceful:
        # Validate seq_no snap_start/stop values
        self.check_snap_start_corruption()

    # Add back + rebalance / only rebalance with verification
    if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
        if self.add_back_flag:
            self.run_add_back_operation_and_verify(self.chosen,
                                                   prev_vbucket_stats,
                                                   record_static_data_set,
                                                   prev_failover_stats)
        else:
            self.run_rebalance_after_failover_and_verify(
                self.chosen, prev_vbucket_stats, record_static_data_set,
                prev_failover_stats)

        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

    if self.during_ops is None:
        self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list,
                                              master_node=self.master)
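# The version gate in common_test_body() compares version strings
# lexicographically ('"3" > version'), which only holds while major versions
# stay single-digit. A hedged sketch of a tuple-based check that gates
# graceful failover the same way; parse_version is a hypothetical helper, not
# part of this framework.
def parse_version(version):
    """'2.5.1-1083' -> (2, 5, 1); the build suffix is ignored."""
    return tuple(int(tok) for tok in version.split("-")[0].split(".")
                 if tok.isdigit())


def supports_graceful_failover(node_versions, minimum=(3, 0)):
    return all(parse_version(v) >= minimum for v in node_versions)

# Example: supports_graceful_failover(["2.5.1-1083", "3.0.0-973"]) -> False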
def _test_backup_add_restore_bucket_body(self, bucket, delay_after_data_load, startup_flag, single_node): server = self.master rest = RestConnection(server) info = rest.get_nodes_self() size = int(info.memoryQuota * 2.0 / 3.0) if bucket == "default": rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=info.moxi) else: proxyPort = info.moxi + 500 rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=proxyPort, authType="sasl", saslPassword="******") ready = BucketOperationHelper.wait_for_memcached(server, bucket) self.assertTrue(ready, "wait_for_memcached failed") if not single_node: self.add_nodes_and_rebalance() distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05} inserted_keys, rejected_keys = MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[self.master], name=bucket, ram_load_ratio=1, value_size_distribution=distribution, moxi=True, write_only=True, number_of_threads=2) if not single_node: rest = RestConnection(self.master) self.assertTrue(RestHelper(rest).wait_for_replication(180), msg="replication did not complete") self.log.info("Sleep {0} seconds after data load".format(delay_after_data_load)) ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") node = RestConnection(self.master).get_nodes_self() if not startup_flag: for server in self.servers: shell = RemoteMachineShellConnection(server) shell.stop_membase() shell.stop_couchbase() shell.disconnect() output, error = self.shell.execute_command(self.perm_command) self.shell.log_command_output(output, error) #now let's back up BackupHelper(self.master, self).backup(bucket, node, self.remote_tmp_folder) if not startup_flag: for server in self.servers: shell = RemoteMachineShellConnection(server) shell.start_membase() shell.start_couchbase() RestHelper(RestConnection(server)).is_ns_server_running() shell.disconnect() BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self) if bucket == "default": rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=info.moxi) else: proxyPort = info.moxi + 500 rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=proxyPort, authType="sasl", saslPassword="******") BucketOperationHelper.wait_for_memcached(self.master, bucket) if bucket == "default": BackupHelper(self.master, self).restore(backup_location=self.remote_tmp_folder, moxi_port=info.moxi) else: BackupHelper(self.master, self).restore(backup_location=self.remote_tmp_folder, moxi_port=info.moxi, username=bucket, password='******') keys_exist = BucketOperationHelper.keys_exist_or_assert_in_parallel(inserted_keys, self.master, bucket, self, concurrency=4) self.assertTrue(keys_exist, msg="unable to verify keys after restore")
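# Hedged sketch of the "wait for ep_queue_size / ep_flusher_todo to drain"
# pattern used above. `get_stat` is an assumed callable returning the current
# integer value of one stat (for example a thin wrapper over mc.stats()); it
# is not a helper that exists in this suite.
import time


def wait_for_stat_to_drain(get_stat, expected=0, timeout=300, poll_secs=5):
    deadline = time.time() + timeout
    while get_stat() != expected:
        if time.time() > deadline:
            return False
        time.sleep(poll_secs)
    return True

# Usage (illustrative):
#   ready = wait_for_stat_to_drain(lambda: int(mc.stats()["ep_queue_size"]))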
def test_start_stop_rebalance_after_failover(self): """ Rebalances nodes out and in with failover Use different nodes_in and nodes_out params to have uneven add and deletion. Use 'zone' param to have nodes divided into server groups by having zone > 1. The test begin with loading the bucket with given number of items. It then fails over a node. We then rebalance the cluster, while adding or removing given number of nodes. Once the rebalance reaches 50%, we stop the rebalance and validate the cluster stats. We then restart the rebalance and validate rebalance was completed successfully. """ fail_over = self.input.param("fail_over", False) gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items) self._load_all_buckets(self.master, gen, "create", 0) tasks = self._async_load_all_buckets(self.master, gen, "update", 0) for task in tasks: task.result(self.wait_timeout * 20) self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120) self._wait_for_stats_all_buckets(self.servers[:self.nodes_init]) self.sleep(20) # Validate seq_no snap_start/stop values before rebalance self.check_snap_start_corruption() prev_vbucket_stats = self.get_vbucket_seqnos( self.servers[:self.nodes_init], self.buckets) prev_failover_stats = self.get_failovers_logs( self.servers[:self.nodes_init], self.buckets) _, _ = self.get_and_compare_active_replica_data_set_all( self.servers[:self.nodes_init], self.buckets, path=None) self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats) self.rest = RestConnection(self.master) chosen = RebalanceHelper.pick_nodes(self.master, howmany=1) result_nodes = list( set(self.servers[:self.nodes_init] + self.servs_in) - set(self.servs_out)) for node in self.servs_in: self.rest.add_node(self.master.rest_username, self.master.rest_password, node.ip, node.port) # Mark Node for failover self.rest.fail_over(chosen[0].id, graceful=fail_over) # Doc_mutation after failing over the nodes tasks = self._async_load_all_buckets(self.master, gen, "update", 0) for task in tasks: task.result(self.wait_timeout * 20) # Validate seq_no snap_start/stop values after failover self.check_snap_start_corruption() self.cluster.async_rebalance(self.servers[:self.nodes_init], self.servs_in, self.servs_out) expected_progress = 50 rest = RestConnection(self.master) reached = RestHelper(rest).rebalance_reached(expected_progress) self.assertTrue( reached, "Rebalance failed or did not reach {0}%".format(expected_progress)) if not RestHelper(rest).is_cluster_rebalanced(): self.log.info("Stop the rebalance") stopped = rest.stop_rebalance(wait_timeout=self.wait_timeout / 3) self.assertTrue(stopped, msg="Unable to stop rebalance") self._verify_all_buckets(self.master, timeout=None, max_verify=self.max_verify, batch_size=1) self.shuffle_nodes_between_zones_and_rebalance() self.verify_cluster_stats(result_nodes, check_ep_items_remaining=True, check_bucket_stats=False) self.sleep(30) self.verify_unacked_bytes_all_buckets() nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers=nodes, std=1.0, total_vbuckets=self.total_vbuckets) # Validate seq_no snap_start/stop values after rebalance self.check_snap_start_corruption()
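# Sketch of the stop-at-50%-then-resume flow exercised above, with the REST
# layer abstracted behind callables. `get_progress`, `stop_rebalance` and
# `resume_rebalance` are assumed stand-ins for the RestConnection/RestHelper
# methods used in the test.
import time


def stop_and_resume_rebalance(get_progress, stop_rebalance, resume_rebalance,
                              expected_progress=50, poll_secs=2, timeout=600):
    deadline = time.time() + timeout
    while get_progress() < expected_progress:
        if time.time() > deadline:
            raise RuntimeError(
                "rebalance never reached %d%%" % expected_progress)
        time.sleep(poll_secs)
    if not stop_rebalance():
        raise RuntimeError("unable to stop rebalance")
    # ... validate bucket/cluster state here while the rebalance is stopped ...
    resume_rebalance()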
def verify_data(master, inserted_keys, bucket, test):
    log = logger.Logger.get_logger()
    log.info("Verifying data")
    ready = RebalanceHelper.wait_for_persistence(master, bucket)
    BucketOperationHelper.keys_exist_or_assert_in_parallel(keys=inserted_keys,
                                                           server=master,
                                                           bucket_name=bucket,
                                                           test=test,
                                                           concurrency=4)
def _add_back_failed_node(self, do_node_cleanup=False): master = self.servers[0] rest = RestConnection(master) creds = self.input.membase_settings self.log.info("CREATE BUCKET PHASE") self.create_buckets() # Cluster all servers self.log.info("INITIAL REBALANCE PHASE") status, _ = RebalanceHelper.rebalance_in(self.servers, len(self.servers) - 1) self.assertTrue(status, msg="Rebalance was failed") self.log.info("DATA LOAD PHASE") self.loaders = self.start_load_phase() # Wait till load phase is over self.stop_load(self.loaders, do_stop=False) self.log.info("DONE LOAD PHASE") # Start the swap rebalance current_nodes = RebalanceHelper.getOtpNodeIds(master) self.log.info("current nodes : {0}".format(current_nodes)) toBeEjectedNodes = RebalanceHelper.pick_nodes( master, howmany=self.failover_factor) optNodesIds = [node.id for node in toBeEjectedNodes] # List of servers that will not be failed over not_failed_over = [] for server in self.servers: if self.cluster_run: if server.port not in [node.port for node in toBeEjectedNodes]: not_failed_over.append(server) self.log.info("Node {0}:{1} not failed over".format( server.ip, server.port)) else: if server.ip not in [node.ip for node in toBeEjectedNodes]: not_failed_over.append(server) self.log.info("Node {0}:{1} not failed over".format( server.ip, server.port)) if self.fail_orchestrator: status, content = self.cluster_util.find_orchestrator(master) self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format( status, content)) # When swapping all the nodes if self.num_swap is len(current_nodes): optNodesIds.append(content) else: optNodesIds[0] = content master = not_failed_over[-1] self.log.info("DATA ACCESS PHASE") self.loaders = self.start_access_phase() # Failover selected nodes for node in optNodesIds: self.log.info( "failover node {0} and rebalance afterwards".format(node)) rest.fail_over(node) rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=optNodesIds) self.assertTrue( rest.monitorRebalance(), msg="rebalance operation failed after adding node {0}".format( optNodesIds)) # Add back the same failed over nodes # Cleanup the node, somehow # TODO: cluster_run? if do_node_cleanup: pass # Make rest connection with node part of cluster rest = RestConnection(master) # Given the optNode, find ip add_back_servers = [] nodes = rest.get_nodes() for server in nodes: if isinstance(server.ip, unicode): add_back_servers.append(server) final_add_back_servers = [] for server in self.servers: if self.cluster_run: if server.port not in [serv.port for serv in add_back_servers]: final_add_back_servers.append(server) else: if server.ip not in [serv.ip for serv in add_back_servers]: final_add_back_servers.append(server) for server in final_add_back_servers: otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip, server.port) msg = "unable to add node {0} to the cluster" self.assertTrue(otpNode, msg.format(server.ip)) rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[]) self.assertTrue( rest.monitorRebalance(), msg="rebalance operation failed after adding node {0}".format( add_back_servers)) self.verification_phase()
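# The add-back logic above rebuilds the server list by excluding nodes that
# are already cluster members, keyed on port under cluster_run and on ip
# otherwise. A compact, hypothetical equivalent of that filtering:
def servers_to_add_back(all_servers, cluster_nodes, cluster_run=False):
    key = (lambda s: s.port) if cluster_run else (lambda s: s.ip)
    members = set(key(node) for node in cluster_nodes)
    return [server for server in all_servers if key(server) not in members]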
def rebalance_in(servers, how_many):
    return RebalanceHelper.rebalance_in(servers, how_many)
def test_rebalance_inout_with_durability_check(self):
    """
    Perform an irregular number of in/out nodes
    1. Swap-out 'self.nodes_out' nodes
    2. Add 'self.nodes_in' nodes into the cluster
    3. Perform swap-rebalance
    4. Make sure durability is not broken due to swap-rebalance

    Note: This is a positive case, i.e. durability should not be broken
    """
    master = self.cluster.master
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    def_bucket = self.bucket_util.buckets[0]

    # Update replica value before performing rebalance in/out
    if self.replica_to_update:
        bucket_helper = BucketHelper(self.cluster.master)

        # Recalculate replicate_to/persist_to as per new replica value
        if self.durability_level is None:
            self.replicate_to = floor(self.replica_to_update / 2) + 1
            self.persist_to = floor(self.replica_to_update / 2) + 2

        # Update bucket replica to new value as given in conf file
        self.log.info("Updating replica count of bucket to {0}"
                      .format(self.replica_to_update))
        bucket_helper.change_bucket_props(
            def_bucket.name, replicaNumber=self.replica_to_update)

    # Rest connection to add/rebalance/monitor nodes
    rest = RestConnection(master)

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                  howmany=self.nodes_out)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(
                            status, content))
        if self.nodes_out is len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards"
                      .format(node))

    new_swap_servers = self.servers[
        num_initial_servers:num_initial_servers + self.nodes_in]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if self.do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%"
                          .format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}"
                                   .format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%"
                                  .format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    self.sleep(20)
                    rest.rebalance(
                        otpNodes=[node.id for node in rest.node_statuses()],
                        ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    self.sleep(1)

    self.assertTrue(
        rest.monitorRebalance(),
        msg="rebalance operation failed after adding node {0}".format(
            optNodesIds))
    self.verification_phase()
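# The replica update above recomputes the observe-based durability knobs from
# the new replica count. A minimal sketch of that arithmetic, mirroring the
# floor(replica / 2) + 1 / + 2 rule in the test; the function name is
# illustrative only.
from math import floor


def durability_requirements(replica_count):
    replicate_to = int(floor(replica_count / 2.0)) + 1
    persist_to = int(floor(replica_count / 2.0)) + 2
    return replicate_to, persist_to

# Example: durability_requirements(2) -> (2, 3)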
def _common_test_body(self): master = self.servers[0] rest = RestConnection(master) # start load, max_ops_per_second is the combined limit for all buckets buckets = rest.get_buckets() loaders = [] self.log.info("max-ops-per-second per bucket: {0}".format(self.max_ops_per_second / len(buckets))) for bucket in buckets: loader = {} loader["mcsoda"] = LoadWithMcsoda(master, self.keys_count, prefix='', bucket=bucket.name, password=bucket.saslPassword, protocol='membase-binary') loader["mcsoda"].cfg["max-ops"] = 0 loader["mcsoda"].cfg["max-ops-per-sec"] = self.max_ops_per_second / len(buckets) loader["mcsoda"].cfg["exit-after-creates"] = 0 loader["mcsoda"].cfg["min-value-size"] = self.min_item_size loader["mcsoda"].cfg["json"] = 0 loader["mcsoda"].cfg["batch"] = 100 loader["thread"] = Thread(target=loader["mcsoda"].load_data, name='mcloader_' + bucket.name) loader["thread"].daemon = True loaders.append(loader) for loader in loaders: loader["thread"].start() for iteration in range(self.repeat): for server in self.servers[1:]: self.log.info("iteration {0}: ".format(iteration)) self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master))) self.log.info("adding node {0} and rebalance afterwards".format(server.ip)) rebalance_done = False rebalance_try = 0 while not rebalance_done: try: ClusterOperationHelper.begin_rebalance_in(master, [server]) ClusterOperationHelper.end_rebalance(master) rebalance_done = True except AssertionError as e: rebalance_try += 1 self.log.error(e) time.sleep(5) if rebalance_try > 5: raise e for server in self.servers[1:]: self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master))) self.log.info("removing node {0} and rebalance afterwards".format(server.ip)) rebalance_done = False rebalance_try = 0 while not rebalance_done: try: ClusterOperationHelper.begin_rebalance_out(master, [server]) ClusterOperationHelper.end_rebalance(master) rebalance_done = True except AssertionError as e: rebalance_try += 1 self.log.error(e) time.sleep(5) if rebalance_try > 5: raise e # stop load for loader in loaders: loader["mcsoda"].load_stop() for loader in loaders: loader["thread"].join()
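# Sketch of the bounded retry-on-AssertionError wrapper used around the
# begin/end rebalance helpers above. `do_rebalance` is an assumed callable
# that raises AssertionError when the rebalance attempt fails.
import time


def rebalance_with_retries(do_rebalance, max_retries=5, delay_secs=5):
    attempt = 0
    while True:
        try:
            do_rebalance()
            return
        except AssertionError:
            attempt += 1
            if attempt > max_retries:
                raise
            time.sleep(delay_secs)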
def begin_rebalance_out(master, servers, timeout=5):
    RebalanceHelper.begin_rebalance_out(master, servers, timeout)
def test_backup_upgrade_restore_default(self): if len(self.servers) < 2: self.log.error("At least 2 servers required for this test ..") return original_set = copy.copy(self.servers) worker = self.servers[len(self.servers) - 1] self.servers = self.servers[:len(self.servers) - 1] shell = RemoteMachineShellConnection(self.master) o, r = shell.execute_command("cat /opt/couchbase/VERSION.txt") fin = o[0] shell.disconnect() initial_version = self.input.param("initial_version", fin) final_version = self.input.param("final_version", fin) if initial_version == final_version: self.log.error("Same initial and final versions ..") return if not final_version.startswith('2.0'): self.log.error("Upgrade test not set to run from 1.8.1 -> 2.0 ..") return builds, changes = BuildQuery().get_all_builds(version=final_version) product = 'couchbase-server-enterprise' #CASE where the worker isn't a 2.0+ worker_flag = 0 shell = RemoteMachineShellConnection(worker) o, r = shell.execute_command("cat /opt/couchbase/VERSION.txt") temp = o[0] if not temp.startswith('2.0'): worker_flag = 1 if worker_flag == 1: self.log.info("Loading version {0} on worker.. ".format(final_version)) remote = RemoteMachineShellConnection(worker) info = remote.extract_remote_info() older_build = BuildQuery().find_build(builds, product, info.deliverable_type, info.architecture_type, final_version) remote.stop_couchbase() remote.couchbase_uninstall() remote.download_build(older_build) remote.install_server(older_build) remote.disconnect() remote_tmp = "{1}/{0}".format("backup", "/root") perm_comm = "mkdir -p {0}".format(remote_tmp) if not initial_version == fin: for server in self.servers: remote = RemoteMachineShellConnection(server) info = remote.extract_remote_info() self.log.info("Loading version .. 
{0}".format(initial_version)) older_build = BuildQuery().find_build(builds, product, info.deliverable_type, info.architecture_type, initial_version) remote.stop_couchbase() remote.couchbase_uninstall() remote.download_build(older_build) remote.install_server(older_build) rest = RestConnection(server) RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT) rest.init_cluster(server.rest_username, server.rest_password) rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved) remote.disconnect() self.common_setUp() bucket = "default" if len(self.servers) > 1: self.add_nodes_and_rebalance() rest = RestConnection(self.master) info = rest.get_nodes_self() size = int(info.memoryQuota * 2.0 / 3.0) rest.create_bucket(bucket, ramQuotaMB=size) ready = BucketOperationHelper.wait_for_memcached(self.master, bucket) self.assertTrue(ready, "wait_for_memcached_failed") distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05} inserted_keys, rejected_keys = MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[self.master], name=bucket, ram_load_ratio=0.5, value_size_distribution=distribution, moxi=True, write_only=True, delete_ratio=0.1, number_of_threads=2) if len(self.servers) > 1: rest = RestConnection(self.master) self.assertTrue(RebalanceHelper.wait_for_replication(rest.get_nodes(), timeout=180), msg="replication did not complete") ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0) self.assertTrue(ready, "wait_for ep_queue_size == 0 failed") node = RestConnection(self.master).get_nodes_self() shell = RemoteMachineShellConnection(worker) o, r = shell.execute_command(perm_comm) shell.log_command_output(o, r) shell.disconnect() #Backup #BackupHelper(self.master, self).backup(bucket, node, remote_tmp) shell = RemoteMachineShellConnection(worker) shell.execute_command("/opt/couchbase/bin/cbbackup http://{0}:{1} {2}".format( self.master.ip, self.master.port, remote_tmp)) shell.disconnect() BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self) time.sleep(30) #Upgrade for server in self.servers: self.log.info("Upgrading to current version {0}".format(final_version)) remote = RemoteMachineShellConnection(server) info = remote.extract_remote_info() new_build = BuildQuery().find_build(builds, product, info.deliverable_type, info.architecture_type, final_version) remote.stop_couchbase() remote.couchbase_uninstall() remote.download_build(new_build) remote.install_server(new_build) rest = RestConnection(server) RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT) rest.init_cluster(server.rest_username, server.rest_password) rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved) remote.disconnect() time.sleep(30) #Restore rest = RestConnection(self.master) info = rest.get_nodes_self() size = int(info.memoryQuota * 2.0 / 3.0) rest.create_bucket(bucket, ramQuotaMB=size) ready = BucketOperationHelper.wait_for_memcached(server, bucket) self.assertTrue(ready, "wait_for_memcached_failed") #BackupHelper(self.master, self).restore(backup_location=remote_tmp, moxi_port=info.moxi) shell = RemoteMachineShellConnection(worker) shell.execute_command("/opt/couchbase/bin/cbrestore {2} http://{0}:{1} -b {3}".format( self.master.ip, self.master.port, remote_tmp, bucket)) shell.disconnect() time.sleep(60) keys_exist = 
BucketOperationHelper.keys_exist_or_assert_in_parallel(inserted_keys, self.master, bucket, self, concurrency=4) self.assertTrue(keys_exist, msg="unable to verify keys after restore") time.sleep(30) BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self) rest = RestConnection(self.master) helper = RestHelper(rest) nodes = rest.node_statuses() master_id = rest.get_nodes_self().id if len(self.servers) > 1: removed = helper.remove_nodes(knownNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in nodes if node.id != master_id], wait_for_rebalance=True) shell = RemoteMachineShellConnection(worker) shell.remove_directory(remote_tmp) shell.disconnect() self.servers = copy.copy(original_set) if initial_version == fin: builds, changes = BuildQuery().get_all_builds(version=initial_version) for server in self.servers: remote = RemoteMachineShellConnection(server) info = remote.extract_remote_info() self.log.info("Loading version .. {0}".format(initial_version)) older_build = BuildQuery().find_build(builds, product, info.deliverable_type, info.architecture_type, initial_version) remote.stop_couchbase() remote.couchbase_uninstall() remote.download_build(older_build) remote.install_server(older_build) rest = RestConnection(server) RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT) rest.init_cluster(server.rest_username, server.rest_password) rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved) remote.disconnect()
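# Sketch of the backup/restore round trip the upgrade test performs with
# cbbackup/cbrestore (same command shapes as in the test, run locally through
# subprocess instead of over SSH). Host, port, paths and bucket name are
# placeholders.
import subprocess


def backup_and_restore(host, port, backup_dir, bucket):
    subprocess.check_call(
        ["/opt/couchbase/bin/cbbackup",
         "http://{0}:{1}".format(host, port), backup_dir])
    # ... delete/recreate the bucket or upgrade the cluster here ...
    subprocess.check_call(
        ["/opt/couchbase/bin/cbrestore", backup_dir,
         "http://{0}:{1}".format(host, port), "-b", bucket])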
def test_rebalance_in_out_with_failover(self): """ Rebalances nodes out and in with failover Use different nodes_in and nodes_out params to have uneven add and deletion. Use 'zone' param to have nodes divided into server groups by having zone > 1. This test begins by loading a given number of items into the cluster. It then removes one node, rebalances that node out the cluster, and then rebalances it back in. During the rebalancing we update all of the items in the cluster. Once the node has been removed and added back we wait for the disk queues to drain, and then verify that there has been no data loss, sum(curr_items) match the curr_items_total. We then remove and add back two nodes at a time and so on until we have reached the point where we are adding back and removing at least half of the nodes. """ fail_over = self.input.param("fail_over", False) gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items) self._load_all_buckets(self.master, gen, "create", 0) tasks = self._async_load_all_buckets(self.master, gen, "update", 0) servs_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in] servs_out = self.servers[self.nodes_init - self.nodes_out:self.nodes_init] for task in tasks: task.result(self.wait_timeout * 20) # Validate seq_no snap_start/stop values after initial doc_load self.check_snap_start_corruption() self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120) self._wait_for_stats_all_buckets(self.servers[:self.nodes_init]) self.sleep(20) prev_vbucket_stats = self.get_vbucket_seqnos( self.servers[:self.nodes_init], self.buckets) prev_failover_stats = self.get_failovers_logs( self.servers[:self.nodes_init], self.buckets) disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all( self.servers[:self.nodes_init], self.buckets, path=None) self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats) self.rest = RestConnection(self.master) chosen = RebalanceHelper.pick_nodes(self.master, howmany=1) result_nodes = list( set(self.servers[:self.nodes_init] + servs_in) - set(servs_out)) for node in servs_in: self.rest.add_node(self.master.rest_username, self.master.rest_password, node.ip, node.port) # Load data after add-node self._load_all_buckets(self.master, gen, "update", 0) # Validate seq_no snap_start/stop values self.check_snap_start_corruption() # Mark Node for failover self.rest.fail_over(chosen[0].id, graceful=fail_over) # Load data after failover self._load_all_buckets(self.master, gen, "update", 0) # Validate seq_no snap_start/stop values self.check_snap_start_corruption() # No need to pass self.sleep_before_rebalance, # since prev ops are synchronous call self.shuffle_nodes_between_zones_and_rebalance(servs_out) # Validate seq_no snap_start/stop values after rebalance self.check_snap_start_corruption() self.verify_cluster_stats(result_nodes, check_ep_items_remaining=True) self.compare_failovers_logs(prev_failover_stats, result_nodes, self.buckets) self.sleep(30) self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset, result_nodes, self.buckets, path=None) self.verify_unacked_bytes_all_buckets() nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers=nodes, std=1.0, total_vbuckets=self.total_vbuckets)
def _common_test_body(self): master = self.servers[0] rest = RestConnection(master) bucket_data = RebalanceBaseTest.bucket_data_init(rest) # add all servers self.log.info("Initially rebalancing in the nodes") RebalanceTaskHelper.add_rebalance_task(self.task_manager, [master], self.servers[1:], [], monitor=True, do_stop=self.do_stop) self.log.info("Initial loading of data") RebalanceBaseTest.load_all_buckets_task(rest, self.task_manager, bucket_data, self.load_ratio, keys_count=self.keys_count) nodes = rest.node_statuses() for node in nodes[1:]: # Get the current cluster size, we will continnue fail-over till current_cluster_size= replica+1 current_cluster_len = len(rest.node_statuses()) if current_cluster_len < (self.replica + 1): self.log.info( "Replica count {0} is greater than the current cluster-size{1}, stopping failover test.".format( self.replica, current_cluster_len)) else: # Never pick master node if node.ip != master.ip: self.log.info("Starting Parallel Load ..") RebalanceBaseTest.tasks_for_buckets(rest, self.task_manager, bucket_data, DELETE_RATIO=self.delete_ratio, ACCESS_RATIO=self.access_ratio, EXPIRY_RATIO=self.expiry_ratio) # Pick a Node to failover toBeEjectedNode = RebalanceHelper.pick_node(master) self.log.info("Starting Failover and Rebalance Out for node {0}:{1}".format(toBeEjectedNode.ip, toBeEjectedNode.port)) # rebalance Out RebalanceTaskHelper.add_failover_task(self.task_manager, [master], [toBeEjectedNode], True) self.log.info( "Completed Failover for node {0}:{1}".format(toBeEjectedNode.ip, toBeEjectedNode.port)) # rebalance Out RebalanceTaskHelper.add_rebalance_task(self.task_manager, [master], [], [toBeEjectedNode], do_stop=self.do_stop, monitor=True) # wait for all tasks to finish RebalanceBaseTest.finish_all_bucket_tasks(rest, bucket_data) self.log.info("Completed Load, Failover and Rebalance Out. ") # verification step if self.do_verify: self.log.info("Verifying with KV-store") RebalanceBaseTest.do_kv_and_replica_verification(master, self.task_manager, bucket_data, self.replica, self, failed_over=True) else: self.log.info("No verification with KV-store specified") # at least 2 nodes required per loop to rebalance out and verify replication self.log.info("Completed Load and Rebalance-Out")
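# The failover loop above stops once the cluster would shrink below
# replica + 1 nodes and never picks the master. A hypothetical predicate
# capturing both rules:
def can_fail_over(cluster_size, replica_count, candidate_ip, master_ip):
    if cluster_size < replica_count + 1:
        return False  # not enough nodes left to keep all replicas
    return candidate_ip != master_ip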
def common_test_body(self, keys_count, replica, failover_reason): log = logger.Logger.get_logger() log.info("keys_count : {0}".format(keys_count)) log.info("replicas : {0}".format(replica)) log.info("failover_reason : {0}".format(failover_reason)) log.info('picking server : {0} as the master'.format(self.master)) self._load_all_buckets(self.master, self.gen_create, "create", 0, batch_size=10000, pause_secs=5, timeout_secs=180) self._wait_for_stats_all_buckets(self._servers) _servers_ = self._servers rest = RestConnection(self.master) nodes = rest.node_statuses() self._wait_for_replication(self._servers, timeout=600) chosen = RebalanceHelper.pick_nodes(self.master, howmany=replica) for node in chosen: #let's do op if failover_reason == 'stop_server': self.stop_server(node) log.info( "10 seconds delay to wait for membase-server to shutdown") #wait for 5 minutes until node is down self.assertTrue( RestHelper(rest).wait_for_node_status( node, "unhealthy", 300), msg= "node status is not unhealthy even after waiting for 5 minutes" ) elif failover_reason == "firewall": RemoteUtilHelper.enable_firewall( self._servers, node, bidirectional=self.bidirectional) status = RestHelper(rest).wait_for_node_status( node, "unhealthy", 300) if status: log.info("node {0}:{1} is 'unhealthy' as expected".format( node.ip, node.port)) else: #verify iptables on the node if something wrong for server in self._servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) o, r = shell.execute_command( "/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() self.assertTrue( status, msg= "node status is not unhealthy even after waiting for 5 minutes" ) failed_over = rest.fail_over(node.id) if not failed_over: self.log.info( "unable to failover the node the first time. try again in 60 seconds.." ) #try again in 75 seconds time.sleep(75) failed_over = rest.fail_over(node.id) self.assertTrue( failed_over, "unable to failover node after {0}".format(failover_reason)) log.info("failed over node : {0}".format(node.id)) self._failed_nodes.append(node) if self.add_back_flag: for node in self._failed_nodes: rest.add_back_node(node.id) time.sleep(5) log.info( "10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[]) msg = "rebalance failed while removing failover nodes {0}".format( chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) else: log.info( "10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in chosen]) msg = "rebalance failed while removing failover nodes {0}".format( chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) for failed in chosen: for server in _servers_: if server.ip == failed.ip: _servers_.remove(server) self._cleanup_nodes.append(server) log.info("Begin VERIFICATION ...") self._wait_for_stats_all_buckets(_servers_) self._wait_for_replication(self._servers, timeout=600) self._verify_stats_all_buckets(_servers_) self._verify_all_buckets(self.master)
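# Sketch of the "wait until the failed node is reported unhealthy" step used
# above. `get_node_status` is an assumed callable returning the node's current
# status string ("healthy"/"unhealthy"), e.g. wrapping RestHelper.
import time


def wait_for_node_status(get_node_status, expected="unhealthy",
                         timeout=300, poll_secs=5):
    deadline = time.time() + timeout
    while time.time() < deadline:
        if get_node_status() == expected:
            return True
        time.sleep(poll_secs)
    return False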
def rebalance_in_with_failover(self): fail_over = self.input.param("fail_over", False) gen_update = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items) tasks = [] tasks += self._async_load_all_buckets(self.master, gen_update, "update", 0) for task in tasks: task.result() servs_in = [ self.servers[i + self.nodes_init] for i in range(self.nodes_in) ] self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120) self._wait_for_stats_all_buckets(self.servers[:self.nodes_init]) self.sleep(20) prev_failover_stats = self.get_failovers_logs( self.servers[:self.nodes_init], self.buckets) prev_vbucket_stats = self.get_vbucket_seqnos( self.servers[:self.nodes_init], self.buckets) disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all( self.servers[:self.nodes_init], self.buckets, path=None) self.rest = RestConnection(self.master) self.nodes = self.get_nodes(self.master) chosen = RebalanceHelper.pick_nodes(self.master, howmany=1) self.rest = RestConnection(self.master) self.rest.add_node(self.master.rest_username, self.master.rest_password, self.servers[self.nodes_init].ip, self.servers[self.nodes_init].port) # Mark Node for failover self.rest.fail_over(chosen[0].id, graceful=fail_over) if fail_over: self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Graceful Failover Failed") self.nodes = self.rest.node_statuses() self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[chosen[0].id]) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Rebalance Failed") # Verification new_server_list = self.add_remove_servers( self.servers, self.servers[:self.nodes_init], [chosen[0]], [self.servers[self.nodes_init]]) self._verify_stats_all_buckets(new_server_list, timeout=120) self.verify_cluster_stats(new_server_list, check_ep_items_remaining=True) self.compare_failovers_logs(prev_failover_stats, new_server_list, self.buckets) self.sleep(30) self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset, new_server_list, self.buckets, path=None) self.verify_unacked_bytes_all_buckets() nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0, total_vbuckets=self.total_vbuckets)
def test_getr(self): item_count = self.input.param("item_count", 10000) replica_count = self.input.param("replica_count", 1) expiration = self.input.param("expiration", 0) delay = float(self.input.param("delay", 0)) eject = self.input.param("eject", 0) delete = self.input.param("delete", 0) mutate = self.input.param("mutate", 0) warmup = self.input.param("warmup", 0) skipload = self.input.param("skipload", 0) rebalance = self.input.param("rebalance", 0) negative_test = False if delay > expiration: negative_test = True if delete and not mutate: negative_test = True if skipload and not mutate: negative_test = True prefix = str(uuid.uuid4())[:7] BucketOperationHelper.delete_all_buckets_or_assert([self.master], self) BucketOperationHelper.create_bucket(self.master, name=self.default_bucket_name, replica=replica_count, port=11210, test_case=self, bucket_ram=-1, password="") if rebalance == GetrTests.DURING_REBALANCE or rebalance == GetrTests.AFTER_REBALANCE: # leave 1 node unclustered for rebalance in ClusterOperationHelper.begin_rebalance_out(self.master, self.servers[-1:]) ClusterOperationHelper.end_rebalance(self.master) ClusterOperationHelper.begin_rebalance_in(self.master, self.servers[:-1]) ClusterOperationHelper.end_rebalance(self.master) else: ClusterOperationHelper.begin_rebalance_in(self.master, self.servers) ClusterOperationHelper.end_rebalance(self.master) vprefix = "" if not skipload: self._load_items(item_count=item_count, expiration=expiration, prefix=prefix, vprefix=vprefix) if not expiration: RebalanceHelper.wait_for_stats_int_value(self.master, self.default_bucket_name, "curr_items_tot", item_count * (replica_count + 1), "<=", 600, True) if delete: self._delete_items(item_count=item_count, prefix=prefix) if mutate: vprefix = "mutated" self._load_items(item_count=item_count, expiration=expiration, prefix=prefix, vprefix=vprefix) self.assertTrue(RebalanceHelper.wait_for_replication(self.rest.get_nodes(), timeout=180), msg="replication did not complete") if eject: self._eject_items(item_count=item_count, prefix=prefix) if delay: self.sleep(delay) if rebalance == GetrTests.DURING_REBALANCE: ClusterOperationHelper.begin_rebalance_in(self.master, self.servers) if rebalance == GetrTests.AFTER_REBALANCE: ClusterOperationHelper.end_rebalance(self.master) if warmup: self.log.info("restarting memcached") command = "rpc:multicall(erlang, apply, [fun () -> try ns_server_testrunner_api:restart_memcached(20000) catch _:_ -> ns_port_sup:restart_port_by_name(memcached) end end, []], 20000)." memcached_restarted, content = self.rest.diag_eval(command) #wait until memcached starts self.assertTrue(memcached_restarted, "unable to restart memcached process through diag/eval") RebalanceHelper.wait_for_stats(self.master, self.default_bucket_name, "curr_items_tot", item_count * (replica_count + 1), 600) count = self._getr_items(item_count=item_count, replica_count=replica_count, prefix=prefix, vprefix=vprefix) if negative_test: self.assertTrue(count == 0, "found {0} items, expected none".format(count)) else: self.assertTrue(count == replica_count * item_count, "expected {0} items, got {1} items".format(replica_count * item_count, count)) if rebalance == GetrTests.DURING_REBALANCE: ClusterOperationHelper.end_rebalance(self.master)
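# In the GETR test above the expected cluster-wide item count is the number of
# keys times (replica_count + 1): one copy per active vbucket plus one per
# replica. A tiny illustrative helper for that check:
def expected_curr_items_tot(item_count, replica_count):
    return item_count * (replica_count + 1)

# Example: 10000 keys with replica_count=1 -> 20000 curr_items_tot.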
def _common_test_body_swap_rebalance(self, do_stop_start=False): master = self.cluster.master rest = RestConnection(master) num_initial_servers = self.num_initial_servers creds = self.input.membase_settings intial_severs = self.servers[1:num_initial_servers] # Cluster all starting set of servers self.log.info("INITIAL REBALANCE PHASE") status = self.task.rebalance(self.cluster.servers[:self.nodes_init], intial_severs, []) self.assertTrue(status, msg="Rebalance was failed") self.log.info("CREATE BUCKET PHASE") self.create_buckets() self.log.info("DATA LOAD PHASE") self.loaders = self.start_load_phase() # Wait till load phase is over self.stop_load(self.loaders, do_stop=False) self.log.info("DONE LOAD PHASE") # Start the swap rebalance current_nodes = RebalanceHelper.getOtpNodeIds(master) self.log.info("current nodes : {0}".format(current_nodes)) toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap) optNodesIds = [node.id for node in toBeEjectedNodes] if self.swap_orchestrator: status, content = self.cluster_util.find_orchestrator(master) self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format( status, content)) if self.num_swap is len(current_nodes): optNodesIds.append(content) else: optNodesIds[0] = content for node in optNodesIds: self.log.info( "removing node {0} and rebalance afterwards".format(node)) new_swap_servers = self.servers[ num_initial_servers:num_initial_servers + self.num_swap] for server in new_swap_servers: otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip, server.port) msg = "unable to add node {0} to the cluster" self.assertTrue(otpNode, msg.format(server.ip)) if self.swap_orchestrator: rest = RestConnection(new_swap_servers[0]) master = new_swap_servers[0] if self.do_access: self.log.info("DATA ACCESS PHASE") self.loaders = self.start_access_phase() self.log.info("SWAP REBALANCE PHASE") rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=optNodesIds) if do_stop_start: # Rebalance is stopped at 20%, 40% and 60% completion retry = 0 for expected_progress in (20, 40, 60): self.log.info( "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%". format(expected_progress)) while True: progress = rest._rebalance_progress() if progress < 0: self.log.error( "rebalance progress code : {0}".format(progress)) break elif progress == 100: self.log.warn("Rebalance has already reached 100%") break elif progress >= expected_progress: self.log.info( "Rebalance will be stopped with {0}%".format( progress)) stopped = rest.stop_rebalance() self.assertTrue(stopped, msg="unable to stop rebalance") self.sleep(20) rest.rebalance(otpNodes=[ node.id for node in rest.node_statuses() ], ejectedNodes=optNodesIds) break elif retry > 100: break else: retry += 1 self.sleep(1) self.assertTrue( rest.monitorRebalance(), msg="rebalance operation failed after adding node {0}".format( optNodesIds)) self.verification_phase()
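# Sketch of the ejected-node bookkeeping shared by the swap-rebalance tests:
# when the orchestrator itself is being swapped it must join the ejected set,
# replacing the first pick unless every node is being swapped, in which case
# it is appended. Names are illustrative, not the testrunner API.
def build_ejected_ids(picked_ids, orchestrator_id, swap_orchestrator,
                      swapping_all_nodes):
    ejected = list(picked_ids)
    if swap_orchestrator:
        if swapping_all_nodes:
            ejected.append(orchestrator_id)
        else:
            ejected[0] = orchestrator_id
    return ejected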
def _common_test_body_failed_swap_rebalance(self): master = self.servers[0] rest = RestConnection(master) num_initial_servers = self.num_initial_servers creds = self.input.membase_settings intial_severs = self.servers[:num_initial_servers] self.log.info("CREATE BUCKET PHASE") SwapRebalanceBase.create_buckets(self) # Cluster all starting set of servers self.log.info("INITIAL REBALANCE PHASE") RebalanceHelper.rebalance_in(intial_severs, len(intial_severs) - 1) self.log.info("DATA LOAD PHASE") self.loaders = SwapRebalanceBase.start_load_phase(self, master) # Wait till load phase is over SwapRebalanceBase.stop_load(self.loaders, do_stop=False) self.log.info("DONE LOAD PHASE") # Start the swap rebalance current_nodes = RebalanceHelper.getOtpNodeIds(master) self.log.info("current nodes : {0}".format(current_nodes)) toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap) optNodesIds = [node.id for node in toBeEjectedNodes] if self.swap_orchestrator: status, content = ClusterOperationHelper.find_orchestrator(master) self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\ format(status, content)) # When swapping all the nodes if self.num_swap is len(current_nodes): optNodesIds.append(content) else: optNodesIds[0] = content for node in optNodesIds: self.log.info("removing node {0} and rebalance afterwards".format(node)) new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap] for server in new_swap_servers: otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip) msg = "unable to add node {0} to the cluster" self.assertTrue(otpNode, msg.format(server.ip)) if self.swap_orchestrator: rest = RestConnection(new_swap_servers[0]) master = new_swap_servers[0] self.log.info("DATA ACCESS PHASE") self.loaders = SwapRebalanceBase.start_access_phase(self, master) self.log.info("SWAP REBALANCE PHASE") rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=optNodesIds) # Rebalance is failed at 20%, 40% and 60% completion for i in [1, 2, 3]: expected_progress = 20 * i self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress)) RestHelper(rest).rebalance_reached(expected_progress) bucket = rest.get_buckets()[0].name pid = None if self.swap_orchestrator: # get PID via remote connection if master is a new node shell = RemoteMachineShellConnection(master) o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'") pid = o[0] shell.disconnect() else: for i in xrange(2): try: _mc = MemcachedClientHelper.direct_client(master, bucket) pid = _mc.stats()["pid"] break except EOFError as e: self.log.error("{0}.Retry in 2 sec".format(e)) time.sleep(1) if pid is None: self.fail("impossible to get a PID") command = "os:cmd(\"kill -9 {0} \")".format(pid) self.log.info(command) killed = rest.diag_eval(command) self.log.info("killed {0}:{1}?? 
{2} ".format(master.ip, master.port, killed)) self.log.info("sleep for 10 sec after kill memcached") time.sleep(10) # we can't get stats for new node when rebalance falls if not self.swap_orchestrator: ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600) i = 0 #we expect that rebalance will be failed while rest._rebalance_progress_status() == "running" and i < 60: self.log.info("rebalance progress: {0}".format(rest._rebalance_progress())) time.sleep(1) i += 1 self.log.info("rebalance progress status:{0}".format(rest._rebalance_progress_status())) knownNodes = rest.node_statuses(); self.log.info("nodes are still in cluster: {0}".format([(node.ip, node.port) for node in knownNodes])) ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes])) rest.rebalance(otpNodes=[node.id for node in knownNodes], ejectedNodes=ejectedNodes) self.assertTrue(rest.monitorRebalance(), msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes)) SwapRebalanceBase.verification_phase(self, master)
def _common_test_body_failed_swap_rebalance(self): master = self.servers[0] rest = RestConnection(master) num_initial_servers = self.num_initial_servers creds = self.input.membase_settings intial_severs = self.servers[:num_initial_servers] self.log.info("CREATE BUCKET PHASE") self.create_buckets() # Cluster all starting set of servers self.log.info("INITIAL REBALANCE PHASE") status, _ = RebalanceHelper.rebalance_in(intial_severs, len(intial_severs) - 1) self.assertTrue(status, msg="Rebalance was failed") self.log.info("DATA LOAD PHASE") self.loaders = self.start_load_phase() # Wait till load phase is over self.stop_load(self.loaders, do_stop=False) self.log.info("DONE LOAD PHASE") # Start the swap rebalance current_nodes = RebalanceHelper.getOtpNodeIds(master) self.log.info("current nodes : {0}".format(current_nodes)) toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap) optNodesIds = [node.id for node in toBeEjectedNodes] if self.swap_orchestrator: status, content = self.cluster_util.find_orchestrator(master) self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format( status, content)) # When swapping all the nodes if self.num_swap is len(current_nodes): optNodesIds.append(content) else: optNodesIds[0] = content for node in optNodesIds: self.log.info( "removing node {0} and rebalance afterwards".format(node)) new_swap_servers = self.servers[ num_initial_servers:num_initial_servers + self.num_swap] for server in new_swap_servers: otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip, server.port) msg = "unable to add node {0} to the cluster" self.assertTrue(otpNode, msg.format(server.ip)) if self.swap_orchestrator: rest = RestConnection(new_swap_servers[0]) master = new_swap_servers[0] self.log.info("DATA ACCESS PHASE") self.loaders = self.start_access_phase() self.log.info("SWAP REBALANCE PHASE") rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=optNodesIds) self.sleep(10, "Rebalance should start") self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format( self.percentage_progress)) reached = RestHelper(rest).rebalance_reached(self.percentage_progress) if reached and RestHelper(rest).is_cluster_rebalanced(): # handle situation when rebalance failed at the beginning self.log.error('seems rebalance failed!') rest.print_UI_logs() self.fail("rebalance failed even before killing memcached") bucket = self.bucket_util.buckets[0] pid = None if self.swap_orchestrator and not self.cluster_run: # get PID via remote connection if master is a new node shell = RemoteMachineShellConnection(master) pid = shell.get_memcache_pid() shell.disconnect() else: times = 2 if self.cluster_run: times = 20 for _ in xrange(times): try: shell = RemoteMachineShellConnection(server) pid = shell.get_memcache_pid() shell.disconnect() break except EOFError as e: self.log.error("{0}.Retry in 2 sec".format(e)) self.sleep(2) if pid is None: self.fail("impossible to get a PID") command = "os:cmd(\"kill -9 {0} \")".format(pid) self.log.info(command) killed = rest.diag_eval(command) self.log.info("killed {0}:{1}?? 
{2} ".format(master.ip, master.port, killed)) self.log.info("sleep for 10 sec after kill memcached") self.sleep(10) # we can't get stats for new node when rebalance falls if not self.swap_orchestrator: self.bucket_util._wait_warmup_completed([master], bucket, wait_time=600) # we expect that rebalance will be failed try: rest.monitorRebalance() except RebalanceFailedException: # retry rebalance if it failed self.log.warn("Rebalance failed but it's expected") self.sleep(30) self.assertFalse(RestHelper(rest).is_cluster_rebalanced(), msg="cluster need rebalance") knownNodes = rest.node_statuses() self.log.info("nodes are still in cluster: {0}".format([ (node.ip, node.port) for node in knownNodes ])) ejectedNodes = list( set(optNodesIds) & set([node.id for node in knownNodes])) rest.rebalance(otpNodes=[node.id for node in knownNodes], ejectedNodes=ejectedNodes) self.assertTrue( rest.monitorRebalance(), msg="Rebalance failed after adding node {0}".format( toBeEjectedNodes)) else: self.log.info("rebalance completed successfully") self.verification_phase()
def wait_for_persistence(self, timeout=120):
    RebalanceHelper.wait_for_persistence(self.master, self.bucket, timeout)
def do_warmup(self): howmany = self.num_of_docs self.input = TestInputSingleton.input self.servers = self.input.servers self._insert_data(howmany) RebalanceHelper.wait_for_stats_on_all(self.master, "default", "ep_queue_size", 0) RebalanceHelper.wait_for_stats_on_all(self.master, "default", "ep_flusher_todo", 0) time.sleep(5) rest = RestConnection(self.master) map = {} #collect curr_items from all nodes for server in self.servers: mc_conn = MemcachedClientHelper.direct_client(server, "default") map["{0}:{1}".format(server.ip, server.port)] = {} map["{0}:{1}".format( server.ip, server.port)]["curr_items_tot"] = mc_conn.stats( "")["curr_items_tot"] map["{0}:{1}".format( server.ip, server.port)]["previous_uptime"] = mc_conn.stats("")["uptime"] self.log.info("memcached {0}:{1} has {2} items".format( server.ip, server.port, mc_conn.stats("")["curr_items_tot"])) mc_conn.close() # Killing Memcached nodes = rest.node_statuses() for node in nodes: _node = { "ip": node.ip, "port": node.port, "username": self.servers[0].rest_username, "password": self.servers[0].rest_password } _mc = MemcachedClientHelper.direct_client(_node, "default") pid = _mc.stats()["pid"] node_rest = RestConnection(_node) command = "os:cmd(\"kill -9 {0} \")".format(pid) self.log.info(command) killed = node_rest.diag_eval(command) self.log.info("killed ?? {0} ".format(killed)) _mc.close() start = time.time() memcached_restarted = False for server in self.servers: mc = None while time.time() - start < 60: try: mc = MemcachedClientHelper.direct_client(server, "default") stats = mc.stats() new_uptime = int(stats["uptime"]) if new_uptime < map["{0}:{1}".format( server.ip, server.port)]["previous_uptime"]: self.log.info("memcached restarted...") memcached_restarted = True break except Exception: self.log.error("unable to connect to {0}:{1}".format( server.ip, server.port)) if mc: mc.close() time.sleep(1) if not memcached_restarted: self.fail("memcached did not start {0}:{1}".format( server.ip, server.port)) for server in self.servers: mc = MemcachedClientHelper.direct_client(server, "default") expected_curr_items_tot = map["{0}:{1}".format( server.ip, server.port)]["curr_items_tot"] now_items = 0 start = time.time() if server == self.servers[0]: wait_time = 600 else: wait_time = 60 # Try to get the stats for 10 minutes, else hit out. while time.time() - start < wait_time: # Get the wamrup time for each server try: stats = mc.stats() if stats is not None: warmup_time = int(stats["ep_warmup_time"]) self.log.info("ep_warmup_time is %s " % warmup_time) self.log.info( "Collected the stats {0} for server {1}:{2}". format(stats["ep_warmup_time"], server.ip, server.port)) break else: self.log.info( " Did not get the stats from the server yet, trying again....." ) time.sleep(2) except Exception as e: self.log.error( "Could not get warmup_time stats from server {0}:{1}, exception {2}" .format(server.ip, server.port, e)) else: self.fail( "Fail! Unable to get the warmup-stats from server {0}:{1} after trying for {2} seconds." .format(server.ip, server.port, wait_time)) # Verify the item count from each server, if you get repeated same count(< expected count) for over # 3 minutes, then fail. Try to get the items from the server for 30 mins in total, else fail start = time.time() while time.time() - start < 1800: time.sleep(2) if mc.stats()["curr_items_tot"] < expected_curr_items_tot: self.log.info( "still warming up .... 
curr_items_tot : {0}".format( mc.stats()["curr_items_tot"])) while now_items == mc.stats()["curr_items_tot"]: if time.time() - start <= 180: self.log.info( "still warming up .... curr_items_tot : {0}". format(mc.stats()["curr_items_tot"])) else: self.fail( "Getting repetitive data, exiting from this server" ) else: self.log.info( "warmup completed, awesome!!! Warmed up. {0} items ". format(mc.stats()["curr_items_tot"])) break now_items = mc.stats()["curr_items_tot"] mc.close()
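# Sketch of the two checks the warmup test above relies on: a memcached
# restart is detected when the "uptime" stat drops below its recorded
# pre-kill value, and warmup is treated as complete once curr_items_tot
# climbs back to the pre-restart count. `get_stats` is an assumed callable
# returning a stats dict such as mc.stats().
import time


def wait_for_restart_and_warmup(get_stats, previous_uptime, expected_items,
                                timeout=600, poll_secs=2):
    deadline = time.time() + timeout
    restarted = False
    while time.time() < deadline:
        stats = get_stats()
        if not restarted and int(stats["uptime"]) < previous_uptime:
            restarted = True
        if restarted and int(stats["curr_items_tot"]) >= expected_items:
            return True
        time.sleep(poll_secs)
    return False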