def delete_bucket_or_assert(serverInfo, bucket='default', test_case=None):
    log = logger.Logger.get_logger()
    log.info('deleting existing bucket {0} on {1}'.format(bucket, serverInfo))
    rest = RestConnection(serverInfo)
    if RestHelper(rest).bucket_exists(bucket):
        status = rest.delete_bucket(bucket)
        if not status:
            try:
                BucketOperationHelper.print_dataStorage_content([serverInfo])
                log.info(StatsCommon.get_stats([serverInfo], bucket, "timings"))
            except:
                log.error("Unable to get timings for bucket")
        log.info('deleted bucket : {0} from {1}'.format(bucket, serverInfo.ip))
    msg = 'bucket "{0}" was not deleted even after waiting for two minutes'.format(bucket)
    if test_case:
        if not BucketOperationHelper.wait_for_bucket_deletion(bucket, rest, 200):
            try:
                BucketOperationHelper.print_dataStorage_content([serverInfo])
                log.info(StatsCommon.get_stats([serverInfo], bucket, "timings"))
            except:
                log.error("Unable to get timings for bucket")
            test_case.fail(msg)
def delete_all_buckets_or_assert(servers, test_case):
    log = logger.Logger.get_logger()
    for serverInfo in servers:
        rest = RestConnection(serverInfo)
        buckets = []
        try:
            buckets = rest.get_buckets()
        except Exception as e:
            log.error(e)
            log.error('15 seconds sleep before calling get_buckets again...')
            time.sleep(15)
            buckets = rest.get_buckets()
        if len(buckets) > 0:
            log.info('deleting existing buckets {0} on {1}'.format(
                [b.name for b in buckets], serverInfo.ip))
            for bucket in buckets:
                log.info("remove bucket {0} ...".format(bucket.name))
                try:
                    status = rest.delete_bucket(bucket.name)
                except ServerUnavailableException as e:
                    log.error(e)
                    log.error('5 seconds sleep before calling delete_bucket again...')
                    time.sleep(5)
                    status = rest.delete_bucket(bucket.name)
                if not status:
                    try:
                        BucketOperationHelper.print_dataStorage_content(servers)
                        log.info(StatsCommon.get_stats([serverInfo], bucket.name, "timings"))
                    except:
                        log.error("Unable to get timings for bucket")
                log.info('deleted bucket : {0} from {1}'.format(bucket.name, serverInfo.ip))
                msg = 'bucket "{0}" was not deleted even after waiting for two minutes'.format(bucket.name)
                if test_case:
                    if not BucketOperationHelper.wait_for_bucket_deletion(bucket.name, rest, 200):
                        try:
                            BucketOperationHelper.print_dataStorage_content(servers)
                            log.info(StatsCommon.get_stats([serverInfo], bucket.name, "timings"))
                        except:
                            log.error("Unable to get timings for bucket")
                        test_case.fail(msg)
            log.info("sleep 2 seconds to make sure all buckets ({}) were deleted completely."
                     .format([b.name for b in buckets]))
            time.sleep(2)
def delete_all_buckets_or_assert(servers, test_case, timeout=200):
    log = logger.Logger.get_logger()
    for serverInfo in servers:
        if serverInfo.dummy:
            continue
        rest = RestConnection(serverInfo)
        # retry fetching the bucket list with a poll interval and a bounded number of retries
        buckets = rest.get_buckets(num_retries=3, poll_interval=5)
        if len(buckets) > 0:
            log.info('deleting existing buckets {0} on {1}'.format(
                [b.name for b in buckets], serverInfo.ip))
            for bucket in buckets:
                # retry the REST call to delete the bucket with a poll interval and a bounded number of retries
                status = rest.delete_bucket(bucket.name, num_retries=3, poll_interval=5)
                if not status:
                    try:
                        BucketOperationHelper.print_dataStorage_content(servers)
                        log.info(StatsCommon.get_stats([serverInfo], bucket.name, "timings"))
                    except:
                        log.error("Unable to get timings for bucket")
                # poll until the bucket is gone (poll_interval=0.1, up to `timeout` seconds)
                is_bucket_deleted = BucketOperationHelper.wait_for_bucket_deletion(bucket.name, rest, timeout)
                if not is_bucket_deleted:
                    try:
                        BucketOperationHelper.print_dataStorage_content(servers)
                        log.info(StatsCommon.get_stats([serverInfo], bucket.name, "timings"))
                    except:
                        log.error("Unable to get timings for bucket")
                    if test_case:
                        msg = 'bucket "{0}" was not deleted even after waiting for {1} seconds.'.format(
                            bucket.name, timeout)
                        test_case.fail(msg)
                else:
                    log.info('deleted bucket : {0} from {1}'.format(bucket.name, serverInfo.ip))
        else:
            log.info("Could not find any buckets for node {0}, nothing to delete".format(serverInfo.ip))
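# Illustrative sketch (not part of the original helpers): the retry/poll pattern that
# BucketOperationHelper.wait_for_bucket_deletion is assumed to follow above -- poll the REST
# API at a fixed interval until the bucket disappears or the timeout expires. The helper
# name and the 0.1-second default interval here are assumptions for illustration only.
def _wait_until_bucket_gone(rest, bucket_name, timeout_secs=200, poll_interval=0.1):
    end_time = time.time() + timeout_secs
    while time.time() < end_time:
        if not RestHelper(rest).bucket_exists(bucket_name):
            return True              # bucket no longer listed by the server
        time.sleep(poll_interval)    # wait before polling again
    return False                     # still present after timeout_secs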
def checkpoint_collapse(self):
    """With 3 replicas, stop replication on R2, let Master and R1 close checkpoint.
    Run load until a new checkpoint is created on Master and R1.
    Wait till checkpoints merge on R1. Restart replication of R2.
    Checkpoint should advance to the latest on R2."""
    param = 'checkpoint'
    stat_key = 'vb_0:last_closed_checkpoint_id'
    stat_chk_itms = 'vb_0:num_checkpoint_items'
    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket,
                              str(self.checkpoint_size))
    self._stop_replication(self.replica2, self.bucket)
    generate_load = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    data_load_thread = Thread(target=self._load_all_buckets,
                              name="load_data",
                              args=(self.master, generate_load, "create", 0, 1, 0, True,
                                    self.checkpoint_size, 5, 180))
    data_load_thread.start()
    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    tasks = []
    chk_pnt = int(m_stats[m_stats.keys()[0]]) + 2
    tasks.append(self.cluster.async_wait_for_stats([self.master], self.bucket, param,
                                                   stat_key, '>=', chk_pnt))
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_key, '>=', chk_pnt))
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_chk_itms, '>=', self.num_items))
    data_load_thread.join()
    for task in tasks:
        try:
            task.result(60)
        except TimeoutError:
            self.fail("Checkpoint not collapsed")
    tasks = []
    self._start_replication(self.replica2, self.bucket)
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_chk_itms, '<', self.num_items))
    for task in tasks:
        try:
            task.result(60)
        except TimeoutError:
            self.fail("Checkpoints not replicated to replica2")
    self._verify_checkpoint_id(param, stat_key, m_stats)
    self._verify_stats_all_buckets(self.servers[:self.num_servers])
def checkpoint_create_items(self):
    """Load data until a new checkpoint is created on all replicas"""
    param = 'checkpoint'
    stat_key = 'vb_0:open_checkpoint_id'
    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket,
                              str(self.checkpoint_size))
    chk_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    generate_load = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, generate_load, "create", 0, 1, 0, True,
                           batch_size=self.checkpoint_size, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
    self._verify_checkpoint_id(param, stat_key, chk_stats)
    self._verify_stats_all_buckets(self.servers[:self.num_servers])
def _verify_checkpoint_id(self, param, stat_key, m_stats):
    timeout = 60 if (self.num_items * .001) < 60 else self.num_items * .001
    # verify the checkpoint id increases on the master node
    chk_pnt = int(m_stats[m_stats.keys()[0]])
    tasks = []
    tasks.append(self.cluster.async_wait_for_stats([self.master], self.bucket, param,
                                                   stat_key, '>', chk_pnt))
    for task in tasks:
        try:
            task.result(timeout)
        except TimeoutError:
            self.fail("New checkpoint not created")
    time.sleep(timeout / 10)
    # verify the master and all replicas are in sync on checkpoint ids
    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    chk_pnt = int(m_stats[m_stats.keys()[0]])
    tasks = []
    for server in self.servers:
        tasks.append(self.cluster.async_wait_for_stats([server], self.bucket, param,
                                                       stat_key, '==', chk_pnt))
    for task in tasks:
        try:
            task.result(timeout)
        except TimeoutError:
            self.fail("Master and all replicas are NOT in sync with checkpoint ids")
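# Worked example of the timeout scaling above (illustrative): the wait is num_items * 0.001
# seconds with a 60-second floor. With num_items = 500,000 the timeout becomes 500 seconds;
# with num_items = 20,000 the product is only 20, so the 60-second floor applies.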
def checkpoint_create_time(self):
    """Load data, but let the timeout create a new checkpoint on all replicas"""
    param = 'checkpoint'
    stat_key = 'vb_0:open_checkpoint_id'
    self._set_checkpoint_timeout(self.servers[:self.num_servers], self.bucket,
                                 str(self.timeout))
    generate_load = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, generate_load, "create", 0, 1, 0, True,
                           batch_size=self.checkpoint_size, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
    chk_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    self.log.info("Sleeping for {0} seconds".format(self.timeout + 5))
    time.sleep(self.timeout + 5)
    self._verify_checkpoint_id(param, stat_key, chk_stats)
    self._verify_stats_all_buckets(self.servers[:self.num_servers])
def checkpoint_server_down(self):
    """Load N items. Shut down server R2. Then restart R2 and
    verify backfill happens on R1 and R2."""
    param = 'checkpoint'
    stat_key = 'vb_0:open_checkpoint_id'
    rest = RestConnection(self.master)
    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket,
                              self.checkpoint_size)
    generate_load_one = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, generate_load_one, "create", 0, 1, 0, True,
                           batch_size=self.checkpoint_size, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
    prev_backfill_timestamp_R1 = self._get_backfill_timestamp(self.replica1, self.replica2)
    prev_backfill_timestamp_R2 = self._get_backfill_timestamp(self.replica2, self.replica3)
    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    self._stop_server(self.replica2)
    time.sleep(5)
    data_load_thread = Thread(target=self._load_data_use_workloadgen,
                              name="load_data",
                              args=(self.master,))
    data_load_thread.start()
    data_load_thread.join()
    self._start_server(self.replica2)
    time.sleep(5)
    self._verify_checkpoint_id(param, stat_key, m_stats)
    self._verify_backfill_happen(self.replica1, self.replica2, prev_backfill_timestamp_R1, True)
    self._verify_backfill_happen(self.replica2, self.replica3, prev_backfill_timestamp_R2, True)
def _warmup_check(self, timeout=1800):
    warmed_up = {}
    stats_all_buckets = {}
    for bucket in self.buckets:
        stats_all_buckets[bucket.name] = StatsCommon()
        warmed_up[bucket.name] = {}
        for server in self.servers:
            warmed_up[bucket.name][server] = False

    for bucket in self.buckets:
        for server in self.servers:
            start = time.time()
            end_time = start + timeout
            warmup_complete = False
            while not warmup_complete and time.time() < end_time:
                try:
                    if stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_thread')[server] == "complete":
                        self.log.info("warmup completed for %s in bucket %s" % (server.ip, bucket.name))
                        warmup_complete = True
                    elif stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_thread')[server] == "running":
                        self.log.info("warming up is still running for %s in bucket %s....curr_items_tot : %s" %
                                      (server.ip, bucket.name,
                                       stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items_tot')[server]))
                        warmup_time = int(stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_time')[server])
                        if warmup_time is not None:
                            self.log.info("ep_warmup_time is %s for %s in bucket %s" % (warmup_time, server.ip, bucket.name))
                    self.sleep(5, "waiting for warmup...")
                except Exception as e:
                    self.log.error("Could not get warmup_time stats from server %s:%s, exception %s" % (server.ip, server.port, e))
            self.assertTrue(warmup_complete, "Warm up wasn't complete in %s sec" % timeout)

            start = time.time()
            while time.time() - start < self.timeout and not warmed_up[bucket.name][server]:
                if stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items_tot')[server] == \
                        self.pre_warmup_stats[bucket.name]["%s:%s" % (server.ip, server.port)]["curr_items_tot"]:
                    if stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items')[server] == \
                            self.pre_warmup_stats[bucket.name]["%s:%s" % (server.ip, server.port)]["curr_items"]:
                        if self._warmup_check_without_access_log():
                            warmed_up[bucket.name][server] = True
                            self._stats_report(server, bucket, stats_all_buckets[bucket.name])
                    else:
                        self.log.info("curr_items is %s not equal to %s" %
                                      (stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items')[server],
                                       self.pre_warmup_stats[bucket.name]["%s:%s" % (server.ip, server.port)]["curr_items"]))
                else:
                    self.log.info("curr_items_tot is %s not equal to %s" %
                                  (stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items_tot')[server],
                                   self.pre_warmup_stats[bucket.name]["%s:%s" % (server.ip, server.port)]["curr_items_tot"]))
                self.sleep(10)

    for bucket in self.buckets:
        for server in self.servers:
            if not warmed_up[bucket.name][server]:
                return False
    return True
def _warmup_check_without_access_log(self):
    if not self.without_access_log:
        return True
    warmed_up = {}
    stats_all_buckets = {}
    for bucket in self.buckets:
        stats_all_buckets[bucket.name] = StatsCommon()
        warmed_up[bucket.name] = {}
        for server in self.servers:
            warmed_up[bucket.name][server] = False

    for bucket in self.buckets:
        for server in self.servers:
            start = time.time()
            while time.time() - start < self.timeout and not warmed_up[bucket.name][server]:
                if stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_key_count')[server] >= \
                        stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_min_item_threshold')[server] or \
                        stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'mem_used')[server] >= \
                        stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_min_memory_threshold')[server] or \
                        stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'mem_used')[server] >= \
                        stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'ep_mem_low_wat')[server]:
                    warmed_up[bucket.name][server] = True
                else:
                    self.log.info("curr_items is %s and ep_warmup_min_item_threshold is %s" %
                                  (stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items')[server],
                                   stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_min_item_threshold')[server]))
                    self.log.info("vb_active_perc_mem_resident is %s and ep_warmup_min_memory_threshold is %s" %
                                  (stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'vb_active_perc_mem_resident')[server],
                                   stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_min_memory_threshold')[server]))
                    self.log.info("mem_used is %s and ep_mem_low_wat is %s" %
                                  (stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'mem_used')[server],
                                   stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'ep_mem_low_wat')[server]))
                self.sleep(5)

    for bucket in self.buckets:
        for server in self.servers:
            if not warmed_up[bucket.name][server]:
                return False
    return True
def _get_backfill_timestamp(self, server, replica_server):
    param = 'tap'
    stat_key = 'eq_tapq:replication_ns_1@%s:backfill_start_timestamp' % (replica_server.ip)
    m_stats = StatsCommon.get_stats([server], self.bucket, param, stat_key)
    self.log.info("eq_tapq:replication_ns_1@%s:backfill_start_timestamp: %s" %
                  (replica_server.ip, m_stats[m_stats.keys()[0]]))
    return int(m_stats[m_stats.keys()[0]])
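# Illustrative sketch (assumed, not taken from the original suite): _verify_backfill_happen,
# which the checkpoint tests call above, can be built on _get_backfill_timestamp by comparing
# the timestamp captured before the scenario with the current one. The method name and the
# `expected` flag mirror how the callers use it; treat this as an assumption about the
# helper's contract rather than its actual implementation.
def _verify_backfill_happen(self, server, replica_server, prev_timestamp, expected=False):
    curr_timestamp = self._get_backfill_timestamp(server, replica_server)
    backfill_happened = curr_timestamp > prev_timestamp  # timestamp advances only on a new backfill
    if expected:
        self.assertTrue(backfill_happened, "backfill did not happen on %s" % server.ip)
    else:
        self.assertFalse(backfill_happened, "unexpected backfill happened on %s" % server.ip)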
def checkpoint_replication_pause(self):
    """With 3 replicas load data. Pause replication to R2.
    Let checkpoints close on Master and R1.
    Restart replication of R2 and R3, backfill should not be seen on R1 and R2."""
    param = 'checkpoint'
    stat_key = 'vb_0:last_closed_checkpoint_id'
    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket,
                              str(self.checkpoint_size))
    time.sleep(5)
    prev_backfill_timestamp_R1 = self._get_backfill_timestamp(self.replica1, self.replica2)
    prev_backfill_timestamp_R2 = self._get_backfill_timestamp(self.replica2, self.replica3)
    generate_load = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    data_load_thread = Thread(target=self._load_all_buckets,
                              name="load_data",
                              args=(self.master, generate_load, "create", 0, 1, 0, True,
                                    self.checkpoint_size, 5, 180))
    data_load_thread.start()
    self._stop_replication(self.replica2, self.bucket)
    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    chk_pnt = int(m_stats[m_stats.keys()[0]]) + 2
    tasks = []
    tasks.append(self.cluster.async_wait_for_stats([self.master], self.bucket, param,
                                                   stat_key, '>=', chk_pnt))
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_key, '>=', chk_pnt))
    for task in tasks:
        try:
            task.result(60)
        except TimeoutError:
            self.fail("Checkpoint not closed")
    data_load_thread.join()
    self._start_replication(self.replica2, self.bucket)
    self._verify_checkpoint_id(param, stat_key, m_stats)
    self._verify_stats_all_buckets(self.servers[:self.num_servers])
    self._verify_backfill_happen(self.replica1, self.replica2, prev_backfill_timestamp_R1)
    self._verify_backfill_happen(self.replica2, self.replica3, prev_backfill_timestamp_R2)
def _create_access_log(self):
    stats_all_buckets = {}
    for bucket in self.buckets:
        stats_all_buckets[bucket.name] = StatsCommon()

    for bucket in self.buckets:
        for server in self.servers:
            scanner_runs = stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'ep_num_access_scanner_runs')[server]
            self.log.info("current access scanner run for %s in bucket %s is %s times" % (server.ip, bucket.name, scanner_runs))
            self.log.info("setting access scanner time %s minutes for %s in bucket %s" % (self.access_log_time, server.ip, bucket.name))
            ClusterOperationHelper.flushctl_set(server, "alog_sleep_time", self.access_log_time, bucket.name)
            if not self._wait_for_access_run(self.access_log_time, scanner_runs, server, bucket, stats_all_buckets[bucket.name]):
                self.fail("Not able to create access log within %s minutes" % self.access_log_time)
def checkpoint_replication_pause_failover(self):
    """Load N items. Stop replication R3. Load N' more items.
    Failover R2. When restart replication to R3, verify backfill doesn't happen on R1."""
    param = 'checkpoint'
    stat_key = 'vb_0:open_checkpoint_id'
    rest = RestConnection(self.master)
    nodes = rest.node_statuses()
    failover_node = None
    for node in nodes:
        if node.id.find(self.replica2.ip) >= 0:
            failover_node = node
    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket,
                              self.checkpoint_size)
    generate_load_one = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, generate_load_one, "create", 0, 1, 0, True,
                           batch_size=self.checkpoint_size, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
    prev_backfill_timestamp_R1 = self._get_backfill_timestamp(self.replica1, self.replica2)
    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    self._stop_replication(self.replica3, self.bucket)
    generate_load_two = BlobGenerator('sqlite', 'sqlite-', self.value_size, end=self.num_items)
    data_load_thread = Thread(target=self._load_all_buckets,
                              name="load_data",
                              args=(self.master, generate_load_two, "create", 0, 1, 0, True,
                                    self.checkpoint_size, 5, 180))
    data_load_thread.start()
    failed_over = rest.fail_over(failover_node.id)
    if not failed_over:
        self.log.info("unable to failover the node the first time. try again in 60 seconds..")
        # try again in 60 seconds
        time.sleep(75)
        failed_over = rest.fail_over(failover_node.id)
    self.assertTrue(failed_over, "unable to failover node {0}".format(self.replica2.ip))
    self.log.info("failed over node : {0}".format(failover_node.id))
    data_load_thread.join()
    self._start_replication(self.replica3, self.bucket)
    self.servers = [self.master, self.replica1, self.replica3]
    self.num_servers = len(self.servers)
    self._verify_checkpoint_id(param, stat_key, m_stats)
    self._verify_stats_all_buckets(self.servers[:self.num_servers])
    self._verify_backfill_happen(self.replica1, self.replica2, prev_backfill_timestamp_R1)
    self.cluster.rebalance([self.master, self.replica1, self.replica2, self.replica3],
                           [], [self.replica2])
    self.cluster.rebalance([self.master, self.replica1, self.replica3],
                           [self.replica2], [])
def checkpoint_deduplication(self):
    """Disable replication of R1. Load N items to master, then mutate some of them.
    Restart replication of R1, only N items should be in stats.
    In this test, we can only load number of items <= checkpoint_size to observe deduplication"""
    param = 'checkpoint'
    stat_key = 'vb_0:num_open_checkpoint_items'
    stat_key_id = 'vb_0:open_checkpoint_id'
    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket,
                              self.checkpoint_size)
    self._stop_replication(self.replica1, self.bucket)
    generate_load = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    generate_update = BlobGenerator('nosql', 'sql-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, generate_load, "create", 0, 1, 0, True,
                           batch_size=self.checkpoint_size, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets([self.master, self.replica2, self.replica3])
    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key_id)
    data_load_thread = Thread(target=self._load_all_buckets,
                              name="load_data",
                              args=(self.master, generate_update, "update", 0, 1, 0, True,
                                    self.checkpoint_size, 5, 180))
    data_load_thread.start()
    self._start_replication(self.replica1, self.bucket)
    data_load_thread.join()
    chk_pnt = int(m_stats[m_stats.keys()[0]])
    timeout = 60 if (self.num_items * .001) < 60 else self.num_items * .001
    time.sleep(timeout)
    tasks = []
    tasks.append(self.cluster.async_wait_for_stats([self.master], self.bucket, param,
                                                   stat_key, '==', self.num_items))
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_key, '==', self.num_items))
    tasks.append(self.cluster.async_wait_for_stats([self.master], self.bucket, param,
                                                   stat_key_id, '==', chk_pnt))
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_key_id, '==', chk_pnt))
    for task in tasks:
        try:
            task.result(60)
        except TimeoutError:
            self.fail("Items weren't deduplicated")
    self._verify_stats_all_buckets(self.servers[:self.num_servers])
def load_document_until_ram_percentage(self):
    self.start = 0
    self.num_items = 30000
    self.end = self.num_items
    while True:
        self.log.info("Add documents to bucket")
        self.perform_doc_ops_in_all_cb_buckets(self.num_items, "create", self.start, self.end)

        self.log.info("Calculate available free memory")
        stats_all_buckets = {}
        stats_all_buckets[self.cb_bucket_name] = StatsCommon()
        memory_used = int(stats_all_buckets[self.cb_bucket_name].get_stats(
            [self.master], self.cb_bucket_name, '', 'mem_used')[self.servers[0]])

        if memory_used < (self.document_ram_percentage * self.bucket_ram * 1000000):
            self.log.info("Continue loading we have more free memory")
            self.start = self.end
            self.end = self.end + self.num_items
        else:
            break
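# Worked example of the stop condition above (illustrative; it assumes bucket_ram is a quota
# in MB and document_ram_percentage is a fraction, which is not stated in this snippet):
# with bucket_ram = 256 and document_ram_percentage = 0.85, loading continues until mem_used
# reaches 0.85 * 256 * 1000000 = 217,600,000 bytes (~217 MB), then the loop breaks.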
def test_verify_mb8825(self):
    # Setting up replication clusters.
    src_cluster_name, dest_cluster_name = "remote-dest-src", "remote-src-dest"
    self.__setup_replication_clusters(self.src_master, self.dest_master,
                                      src_cluster_name, dest_cluster_name)

    # Step-3 Load 10k items (sets=80, deletes=20) on source cluster.
    self._load_all_buckets(self.src_master, self.gen_create, "create", 0)

    # Step-4 XDCR Source -> Remote
    self._replicate_clusters(self.src_master, dest_cluster_name)
    self.merge_buckets(self.src_master, self.dest_master, bidirection=False)

    # Step-5 Wait for replication to finish 50% at destination node
    expected_items = self.gen_create.end * 0.5
    dest_master_buckets = self._get_cluster_buckets(self.dest_master)
    tasks = []
    for bucket in dest_master_buckets:
        tasks.append(self.cluster.async_wait_for_stats([self.dest_master], bucket, '',
                                                       'curr_items', '>=', expected_items))
    for task in tasks:
        task.result(self.wait_timeout * 5)

    # Perform 20% delete on Source cluster.
    tasks = []
    self.gen_delete = BlobGenerator('loadOne', 'loadOne-', self._value_size, start=0,
                                    end=int(self.num_items * float(self._percent_delete) / 100))
    tasks.extend(self._async_load_all_buckets(self.src_master, self.gen_delete, "delete", 0))

    # Step-6 XDCR Remote -> Source
    self._replicate_clusters(self.dest_master, src_cluster_name)
    self.merge_buckets(self.dest_master, self.src_master, bidirection=False)

    # Wait for delete tasks to finish
    for task in tasks:
        task.result()

    # Step-8 Compare the source and destination cluster items - item count, meta data, data content.
    self.verify_results()

    # Verify that no deletion was performed at the source node:
    src_master_buckets = self._get_cluster_buckets(self.src_master)
    for bucket in src_master_buckets:
        src_stat_ep_num_ops_del_meta = 0
        src_stat_ep_num_ops_set_meta = 0
        src_stat_ep_num_ops_get_meta = 0
        src_stat_ep_num_ops_del_meta_res_fail = 0
        src_stat_ep_num_ops_set_meta_res_fail = 0
        for src_node in self.src_nodes:
            src_stat_ep_num_ops_del_meta += int(StatsCommon.get_stats([src_node], bucket, '', 'ep_num_ops_del_meta')[src_node])
            src_stat_ep_num_ops_set_meta += int(StatsCommon.get_stats([src_node], bucket, '', 'ep_num_ops_set_meta')[src_node])
            src_stat_ep_num_ops_get_meta += int(StatsCommon.get_stats([src_node], bucket, '', 'ep_num_ops_get_meta')[src_node])
            src_stat_ep_num_ops_del_meta_res_fail += int(StatsCommon.get_stats([src_node], bucket, '', 'ep_num_ops_del_meta_res_fail')[src_node])
            src_stat_ep_num_ops_set_meta_res_fail += int(StatsCommon.get_stats([src_node], bucket, '', 'ep_num_ops_set_meta_res_fail')[src_node])

        self.assertEqual(src_stat_ep_num_ops_set_meta, 0,
                         "Number of set operations [%s] at bucket = %s, while 0 was expected"
                         % (src_stat_ep_num_ops_set_meta, bucket))
        self.assertEqual(src_stat_ep_num_ops_del_meta, 0,
                         "Number of delete operations [%s] at bucket = %s, while 0 was expected"
                         % (src_stat_ep_num_ops_del_meta, bucket))

        dest_stat_ep_num_ops_del_meta = 0
        for dest_node in self.dest_nodes:
            dest_stat_ep_num_ops_del_meta += int(StatsCommon.get_stats([dest_node], bucket, '', 'ep_num_ops_del_meta')[dest_node])

        if self.rep_type == "xmem":
            self.assertEqual(src_stat_ep_num_ops_del_meta_res_fail, dest_stat_ep_num_ops_del_meta,
                             "Number of failed delete operations [%s] at bucket = %s, while %s was expected"
                             % (src_stat_ep_num_ops_del_meta_res_fail, bucket, dest_stat_ep_num_ops_del_meta))
            self.assertTrue(src_stat_ep_num_ops_set_meta_res_fail > 0,
                            "Number of failed set operations [%s] at bucket = %s, while greater than 0 was expected"
                            % (src_stat_ep_num_ops_set_meta_res_fail, bucket))
        elif self.rep_type == "capi":
            self.assertTrue(src_stat_ep_num_ops_get_meta > 0,
                            "Number of get operations [%s] at bucket = %s, while greater than 0 was expected"
                            % (src_stat_ep_num_ops_get_meta, bucket))
def test_items_append(self):
    self.desired_item_size = self.input.param("desired_item_size", 2048)
    self.append_size = self.input.param("append_size", 1024)
    self.fixed_append_size = self.input.param("fixed_append_size", True)
    self.append_ratio = self.input.param("append_ratio", 0.5)
    self._load_all_buckets(self.master, self.gen_create, "create", 0,
                           batch_size=1000, pause_secs=5, timeout_secs=100)

    for bucket in self.buckets:
        self.value_size = self.input.param("value_size", 512)
        verify_dict = {}
        vkeys, dkeys = bucket.kvs[1].key_set()

        key_count = len(vkeys)
        app_ratio = self.append_ratio * key_count
        selected_keys = []
        i = 0
        for key in vkeys:
            i += 1
            if i >= app_ratio:
                break
            selected_keys.append(key)

        awareness = VBucketAwareMemcached(RestConnection(self.master), bucket.name)
        if self.kv_verify:
            for key in selected_keys:
                value = awareness.memcached(key).get(key)[2]
                verify_dict[key] = value

        self.log.info("Bucket: {0}".format(bucket.name))
        self.log.info("Appending to have items whose initial size was " +
                      "{0} to equal or cross a size of {1}".format(self.value_size, self.desired_item_size))
        self.log.info("Item-appending of {0} items starting ..".format(len(selected_keys) + 1))

        index = 3
        while self.value_size < self.desired_item_size:
            str_len = self.append_size
            if not self.fixed_append_size:
                str_len = int(math.pow(2, index))
            for key in selected_keys:
                random_string = self.random_str_generator(str_len)
                awareness.memcached(key).append(key, random_string)
                if self.kv_verify:
                    verify_dict[key] = verify_dict[key] + random_string
            self.log.info("for {0} items size was increased to {1} Bytes".format(len(selected_keys) + 1, self.value_size))
            self.value_size += str_len
            index += 1
        self.log.info("The appending of {0} items ended".format(len(selected_keys) + 1))

    for bucket in self.buckets:
        msg = "Bucket:{0}".format(bucket.name)
        self.log.info("VERIFICATION <" + msg + ">: Phase 0 - Check the gap between " +
                      "mem_used by the bucket and total_allocated_bytes")
        stats = StatsCommon()
        mem_used_stats = stats.get_stats(self.servers, bucket, 'memory', 'mem_used')
        total_allocated_bytes_stats = stats.get_stats(self.servers, bucket, 'memory', 'total_allocated_bytes')
        total_fragmentation_bytes_stats = stats.get_stats(self.servers, bucket, 'memory', 'total_fragmentation_bytes')
        for server in self.servers:
            self.log.info("In {0} bucket {1}, total_fragmentation_bytes + the total_allocated_bytes = {2}"
                          .format(server.ip, bucket.name,
                                  (int(total_fragmentation_bytes_stats[server]) + int(total_allocated_bytes_stats[server]))))
            self.log.info("In {0} bucket {1}, mem_used = {2}".format(server.ip, bucket.name, mem_used_stats[server]))
            self.log.info("In {0} bucket {1}, the difference between actual memory used by memcached and mem_used is {2} times"
                          .format(server.ip, bucket.name,
                                  float(int(total_fragmentation_bytes_stats[server]) + int(total_allocated_bytes_stats[server]))
                                  / float(mem_used_stats[server])))

        self.log.info("VERIFICATION <" + msg + ">: Phase1 - Check if any of the " +
                      "selected keys have value less than the desired value size")
        for key in selected_keys:
            value = awareness.memcached(key).get(key)[2]
            if len(value) < self.desired_item_size:
                self.fail("Failed to append enough to make value size surpass the " +
                          "size {0}, key {1} has size {2}".format(self.desired_item_size, key, len(value)))

        if self.kv_verify:
            self.log.info("VERIFICATION <" + msg + ">: Phase2 - Check if the content " +
                          "after the appends match what's expected")
            for k in verify_dict:
                if awareness.memcached(k).get(k)[2] != verify_dict[k]:
                    self.fail("Content at key {0}: not what's expected.".format(k))
            self.log.info("VERIFICATION <" + msg + ">: Successful")

    shell = RemoteMachineShellConnection(self.master)
    shell.execute_cbstats("", "raw", keyname="allocator", vbid="")
    shell.disconnect()
def stat(self, key):
    stats = StatsCommon.get_stats([self.master], 'default', "", key)
    val = stats.values()[0]
    if val.isdigit():
        val = int(val)
    return val
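# Usage sketch (illustrative; assumes a test instance with self.master wired up): numeric
# engine stats come back as ints, everything else stays a string.
#   curr_items = self.stat('curr_items')          # e.g. 10000 (int)
#   warmup_state = self.stat('ep_warmup_thread')  # e.g. "complete" (str)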
def test_gsi_on_ephemeral_with_eviction_policy(self):
    num_of_docs = self.num_of_docs_per_collection
    self.prepare_collection_for_indexing(num_of_docs_per_collection=self.num_of_docs_per_collection)
    collection_namespace = self.namespaces[0]
    _, keyspace = collection_namespace.split(':')
    bucket, scope, collection = keyspace.split('.')

    index_gen = QueryDefinition(index_name='idx', index_fields=['age', 'country', 'city'])
    meta_index_gen = QueryDefinition(index_name='meta_idx', index_fields=['meta().id'])

    query = index_gen.generate_index_create_query(namespace=collection_namespace, defer_build=self.defer_build)
    self.run_cbq_query(query)
    if self.defer_build:
        build_query = index_gen.generate_build_query(namespace=collection_namespace)
        self.run_cbq_query(build_query)
    self.wait_until_indexes_online()

    query = meta_index_gen.generate_index_create_query(namespace=collection_namespace, defer_build=self.defer_build)
    self.run_cbq_query(query)
    if self.defer_build:
        build_query = meta_index_gen.generate_build_query(namespace=collection_namespace)
        self.run_cbq_query(build_query)
    self.wait_until_indexes_online()

    select_query = f'Select * from {collection_namespace} where age > 10 and country like "A%";'
    select_meta_id_query = f'Select meta().id from {collection} where meta().id like "doc_%";'
    count_query = f'Select count(*) from {collection_namespace} where age >= 0;'
    named_collection_query_context = f'default:{bucket}.{scope}'

    select_result = self.run_cbq_query(query=select_query)['results']
    meta_result = self.run_cbq_query(query=select_meta_id_query,
                                     query_context=named_collection_query_context)['results']
    count_result = self.run_cbq_query(query=count_query)['results'][0]['$1']
    self.assertTrue(len(select_result) > 0)
    self.assertEqual(len(meta_result), self.num_of_docs_per_collection)
    self.assertEqual(count_result, self.num_of_docs_per_collection)

    new_inserts = 10 ** 4
    is_memory_full = False
    stats_all_buckets = {}
    for bucket in self.buckets:
        stats_all_buckets[bucket.name] = StatsCommon()
    threshold = 0.93
    last_memory_used_val = 0

    while not is_memory_full:
        gen_create = SDKDataLoader(num_ops=new_inserts, percent_create=100,
                                   percent_update=0, percent_delete=0, scope=scope,
                                   collection=collection, output=True,
                                   start_seq_num=num_of_docs + 1)
        task = self.cluster.async_load_gen_docs(self.master, bucket, gen_create)
        task.result()
        # Updating the doc counts
        num_of_docs = num_of_docs + new_inserts
        self.sleep(30)
        memory_used = int(stats_all_buckets[bucket.name].get_stats([self.master], bucket, '', 'mem_used')[self.master])
        self.log.info(f"Current memory usage: {memory_used}")
        if self.eviction_policy == 'noEviction':
            # memory is considered full once mem_used reaches ~93% (threshold) of the bucket quota
            if memory_used > threshold * self.bucket_size * 1000000:
                # Just filling the leftover memory to be double sure
                gen_create = SDKDataLoader(num_ops=new_inserts, percent_create=100,
                                           percent_update=0, percent_delete=0, scope=scope,
                                           collection=collection, output=True,
                                           start_seq_num=num_of_docs + 1)
                task = self.cluster.async_load_gen_docs(self.master, bucket, gen_create)
                task.result()
                num_of_docs = num_of_docs + new_inserts
                memory_used = int(stats_all_buckets[bucket.name].get_stats([self.master], bucket, '', 'mem_used')[self.master])
                self.log.info(f"Current memory usage: {memory_used}")
                is_memory_full = True
        else:
            if memory_used < last_memory_used_val:
                break
            last_memory_used_val = memory_used

    meta_ids = self.run_cbq_query(query=select_meta_id_query,
                                  query_context=named_collection_query_context)['results']
    ids_at_threshold = sorted([item['id'] for item in meta_ids])

    # Pushing new docs to check the eviction policy
    new_inserts = 10 ** 4
    gen_create = SDKDataLoader(num_ops=new_inserts, percent_create=100, json_template="Employee",
                               percent_update=0, percent_delete=0, scope=scope,
                               collection=collection, output=True, start_seq_num=num_of_docs + 1)
    tasks = self.data_ops_javasdk_loader_in_batches(sdk_data_loader=gen_create, batch_size=10000)
    for task in tasks:
        out = task.result()
        self.log.info(out)

    meta_ids_with_eviction_enforced = self.run_cbq_query(query=select_meta_id_query,
                                                         query_context=named_collection_query_context)['results']
    ids_after_threshold = sorted([item['id'] for item in meta_ids_with_eviction_enforced])
    if self.eviction_policy == 'noEviction':
        self.assertEqual(len(meta_ids_with_eviction_enforced), len(meta_ids))
        self.assertEqual(ids_at_threshold, ids_after_threshold)
    else:
        self.assertTrue(len(meta_ids_with_eviction_enforced) != len(meta_ids))
        self.assertTrue(ids_after_threshold != ids_at_threshold)