def test_auto_load_balance(self):
    """Check that operations keep succeeding before and after a scale-up.

    Calls ``install_milvus`` for the "test-auto-load-balance" release,
    connects to the returned host/port, starts the health-checker monitor
    threads and runs ``assert_statistic`` once before and once after
    ``scale_up_milvus``. Around both asserts the ``'sealed'`` entry of every
    node in each collection's segment distribution is logged.
    """

    def _log_segment_distribution(label):
        # Log the 'sealed' entry of each node, for every listed collection.
        for coll_name in list_collections():
            segment_info = utility.get_query_segment_info(coll_name)
            per_node = cf.get_segment_distribution(segment_info)
            for node_id, segments in per_node.items():
                log.info(
                    f"collection {coll_name}'s {label} in node {node_id} is {segments['sealed']}"
                )

    log.info("start to install milvus")
    # todo add release name
    installed_release, host, port = install_milvus("test-auto-load-balance")
    self.release_name = installed_release
    assert host is not None
    connection = connections.connect("default", host=host, port=port)
    assert connection is not None

    # One checker per monitored operation.
    checkers = {
        Op.create: CreateChecker(),
        Op.insert: InsertFlushChecker(),
        Op.flush: InsertFlushChecker(flush=True),
        Op.index: IndexChecker(),
        Op.search: SearchChecker(),
        Op.query: QueryChecker()
    }
    self.health_checkers = checkers
    cc.start_monitor_threads(self.health_checkers)

    # Let the checkers run for a while before looking at the results.
    sleep(constants.WAIT_PER_OP * 10)
    _log_segment_distribution("segment distribution")

    # first assert
    log.info("first assert")
    assert_statistic(self.health_checkers)

    # scale up
    log.info("scale up milvus")
    scale_up_milvus(self.release_name)

    # Start counting afresh so the second assert only covers the period
    # after the scale-up.
    cc.reset_counting(self.health_checkers)
    sleep(constants.WAIT_PER_OP * 10)
    _log_segment_distribution("sealed segment distribution")

    # second assert
    log.info("second assert")
    assert_statistic(self.health_checkers)

    # TODO assert segment distribution

    # assert all expectations
    assert_expectations()
def test_chaos(self, chaos_yaml):
    """Inject the chaos experiment from ``chaos_yaml`` and check the operations.

    The test runs ``assert_statistic`` three times: before the chaos object
    is created, while it is applied (against the ``self.expect_*`` values),
    and after it has been deleted and the pods are ready again. After each
    of the three asserts the output of ``record_results`` is appended to
    ``./reports/<experiment name>.log``.

    Args:
        chaos_yaml: Experiment file handed to ``cc.gen_experiment_config``
            and ``self.parser_testcase_config``.

    Raises:
        AssertionError: If ``self.parser_testcase_config`` returns ``False``.
    """
    # start the monitor threads to check the milvus ops
    log.info("*********************Chaos Test Start**********************")
    log.info(connections.get_connection_addr('default'))
    cc.start_monitor_threads(self.health_checkers)

    # parse chaos object; the experiment name is also used as release name
    chaos_config = cc.gen_experiment_config(chaos_yaml)
    meta_name = chaos_config.get('metadata', None).get('name', None)
    release_name = meta_name
    # Replace every "milvus-chaos" placeholder with the release name by
    # round-tripping the config through its JSON text.
    chaos_config = json.loads(
        json.dumps(chaos_config).replace("milvus-chaos", release_name))
    # The resource is created from the substituted config, so it has to be
    # addressed by the name stored there. This equals meta_name unless the
    # name itself contained the placeholder.
    chaos_name = chaos_config.get('metadata', None).get('name', None)
    self._chaos_config = chaos_config  # cache the chaos config for tear down
    log.info(f"chaos_config: {chaos_config}")

    # parse the test expectations in testcases.yaml
    if self.parser_testcase_config(chaos_yaml, chaos_config) is False:
        message = "Fail to get the testcase info in testcases.yaml"
        log.error(message)
        # Raise explicitly: a bare `assert False` is removed under `-O`.
        raise AssertionError(message)

    # init report; exist_ok avoids the check-then-create race
    dir_name = "./reports"
    file_name = f"{dir_name}/{meta_name}.log"
    os.makedirs(dir_name, exist_ok=True)

    # let the checkers collect baseline statistics
    sleep(constants.WAIT_PER_OP * 2)

    # assert statistic: all ops are expected to succeed before chaos
    log.info("******1st assert before chaos: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        ts = time.strftime("%Y-%m-%d %H:%M:%S")
        f.write(f"{meta_name}-{ts}\n")
        f.write("1st assert before chaos:\n")
        f.write(record_results(self.health_checkers))

    # apply chaos object
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.info("chaos injected")
    log.info(f"chaos information: {chaos_res.get(chaos_name)}")
    sleep(constants.WAIT_PER_OP * 2)

    # reset counting so the next assert only covers the chaos period
    cc.reset_counting(self.health_checkers)

    # keep the chaos applied for CHAOS_DURATION
    sleep(constants.CHAOS_DURATION)
    log.info(f'Alive threads: {threading.enumerate()}')

    # assert statistic against the per-operation expectations
    log.info("******2nd assert after chaos injected: ")
    assert_statistic(self.health_checkers,
                     expectations={
                         Op.create: self.expect_create,
                         Op.insert: self.expect_insert,
                         Op.flush: self.expect_flush,
                         Op.index: self.expect_index,
                         Op.search: self.expect_search,
                         Op.query: self.expect_query
                     })
    with open(file_name, "a+") as f:
        f.write("2nd assert after chaos injected:\n")
        f.write(record_results(self.health_checkers))

    # delete chaos
    chaos_res.delete(chaos_name)
    log.info("chaos deleted")
    log.info(f'Alive threads: {threading.enumerate()}')
    sleep(2)

    # wait all pods ready (both label conventions are checked)
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}")
    wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}")
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}")
    wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
    log.info("all pods are ready")

    # reconnect if needed
    sleep(constants.WAIT_PER_OP * 2)
    cc.reconnect(connections, alias='default')

    # reset counting again
    cc.reset_counting(self.health_checkers)

    # recovery window (varies by feature)
    sleep(constants.WAIT_PER_OP * 5)

    # assert statistic: all ops success again
    log.info("******3rd assert after chaos deleted: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        f.write("3rd assert after chaos deleted:\n")
        f.write(record_results(self.health_checkers))

    # assert all expectations
    assert_expectations()
    log.info("*********************Chaos Test Completed**********************")
def test_multi_replicas_with_only_one_group_available(
        self, chaos_type, failed_node_type, failed_group_scope, is_streaming):
    """Apply chaos to chosen query-node pods and check search/query results.

    Replica groups are read from the search checker's collection, a set of
    target groups is selected according to ``failed_group_scope``, and for
    each of them the nodes matching ``failed_node_type`` are mapped to pods
    that the chaos object then targets.

    Args:
        chaos_type: Selects the template
            ``chaos_objects/template/<chaos_type>-by-pod-list.yaml``.
        failed_node_type: ``"shard_leader"`` or ``"non_shard_leader"``; when
            no node of a group matches, one node of that group is picked at
            random.
        failed_group_scope: ``"one"``, ``"except_one"`` or ``"all"``; any
            other value leaves the target group list empty.
        is_streaming: When ``False`` the insert checker is removed before
            the monitor threads start.
    """

    def _log_replicas(op, label):
        # Fetch and log the replicas of the collection used by checker `op`.
        wrap = self.health_checkers[op].c_wrap
        info, _ = wrap.get_replicas()
        log.info(f"replicas_info for {label} {wrap.name}: {info}")

    # start the monitor threads to check the milvus ops
    log.info("*********************Chaos Test Start**********************")
    log.info("Test config")
    log.info(cc.gen_experiment_config(config_file_name))
    log.info(connections.get_connection_addr('default'))
    if is_streaming is False:
        del self.health_checkers[Op.insert]
    cc.start_monitor_threads(self.health_checkers)

    # get replicas info
    release_name = self.instance_name
    node_to_pod = get_querynode_info(release_name)
    log.info(node_to_pod)
    replicas, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
    group_list = []
    shard_leader_list = []
    for group in replicas.groups:
        group_list.append(list(group.group_nodes))
        shard_leader_list.extend(shard.shard_leader for shard in group.shards)

    # Keep only one group healthy; the other groups become unhealthy through
    # the injected chaos. In each affected group the selected pods are the
    # chaos targets.
    group_list.sort(key=lambda nodes: -len(nodes))  # largest group first
    if failed_group_scope == "one":
        target_groups = random.sample(group_list, 1)
    elif failed_group_scope == "except_one":
        target_groups = random.sample(group_list, len(group_list) - 1)
    elif failed_group_scope == "all":
        target_groups = list(group_list)
    else:
        target_groups = []

    target_pod_list = []
    for nodes in target_groups:
        if failed_node_type == "shard_leader":
            candidates = list(set(nodes) & set(shard_leader_list))
        elif failed_node_type == "non_shard_leader":
            candidates = list(set(nodes) - set(shard_leader_list))
        else:
            candidates = []
        if not candidates:
            log.info("there is no node satisfied, chose one randomly")
            candidates = [random.choice(nodes)]
        target_pod_list.extend(node_to_pod[node_id] for node_id in candidates)
    log.info(f"target_pod_list: {target_pod_list}")

    # build the chaos object from the by-pod-list template
    template_dir = str(Path(__file__).absolute().parent)
    chaos_config = cc.gen_experiment_config(
        f"{template_dir}/chaos_objects/template/{chaos_type}-by-pod-list.yaml")
    chaos_config['metadata']['name'] = f"test-multi-replicase-{int(time.time())}"
    meta_name = chaos_config.get('metadata', None).get('name', None)
    chaos_config['spec']['selector']['pods']['chaos-testing'] = target_pod_list
    self._chaos_config = chaos_config  # cache the chaos config for tear down
    log.info(f"chaos_config: {chaos_config}")

    # let the checkers collect baseline statistics
    sleep(constants.WAIT_PER_OP * 2)

    # replicas info
    _log_replicas(Op.search, "search collection")

    # assert statistic: all ops are expected to succeed before chaos
    log.info("******1st assert before chaos: ")
    assert_statistic(self.health_checkers)

    # apply chaos object
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.info("chaos injected")
    sleep(constants.WAIT_PER_OP * 2)

    # reset counting
    cc.reset_counting(self.health_checkers)

    # keep the chaos applied for CHAOS_DURATION
    sleep(constants.CHAOS_DURATION)
    log.info(f'Alive threads: {threading.enumerate()}')

    # node info
    node_to_pod = get_querynode_info(release_name)
    log.info(node_to_pod)

    # replicas info
    _log_replicas(Op.search, "search collection")
    _log_replicas(Op.query, "query collection")

    # assert statistic: search/query are expected to fail only when every
    # group was targeted
    log.info("******2nd assert after chaos injected: ")
    success = {Op.search: constants.SUCC, Op.query: constants.SUCC}
    failure = {Op.search: constants.FAIL, Op.query: constants.FAIL}
    expectations = failure if failed_group_scope == "all" else success
    assert_statistic(self.health_checkers, expectations=expectations)

    # delete chaos
    chaos_res.delete(meta_name)
    log.info("chaos deleted")
    sleep(2)

    # wait all pods ready (both label conventions are checked)
    pod_states = []
    for selector in (f"app.kubernetes.io/instance={release_name}",
                     f"release={release_name}"):
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label {selector}"
        )
        pod_states.append(wait_pods_ready(constants.CHAOS_NAMESPACE, selector))
    if all(pod_states):
        log.info("all pods are ready")

    # settle time (the reconnect call is disabled in this test)
    sleep(constants.WAIT_PER_OP * 2)

    # reset counting again
    cc.reset_counting(self.health_checkers)

    # recovery window (varies by feature)
    sleep(constants.WAIT_PER_OP * 5)

    # node info
    node_to_pod = get_querynode_info(release_name)
    log.info(node_to_pod)
    sleep(30)

    # replicas info
    _log_replicas(Op.search, "collection")
    _log_replicas(Op.query, "collection")

    # assert statistic: all ops success again
    log.info("******3rd assert after chaos deleted: ")
    assert_statistic(self.health_checkers)

    # assert all expectations
    assert_expectations()
    log.info(
        "*********************Chaos Test Completed**********************")
def test_chaos(self, chaos_yaml):
    """Inject the chaos experiment from ``chaos_yaml`` and check the operations.

    The test runs ``assert_statistic`` three times: before the chaos object
    is created, while it is applied (against the ``self.expect_*`` values),
    and after it has been deleted. After each assert the string form of
    ``self.health_checkers`` is appended to
    ``./reports/<experiment name>.log``. The monitor threads returned by
    ``cc.start_monitor_threads`` are kept in ``self.checker_threads`` and
    their liveness is logged during and after the chaos.

    Args:
        chaos_yaml: Experiment file handed to ``cc.gen_experiment_config``
            and ``self.parser_testcase_config``.

    Raises:
        AssertionError: If ``self.parser_testcase_config`` returns ``False``.
    """
    # start the monitor threads to check the milvus ops
    log.info("*********************Chaos Test Start**********************")
    log.info(connections.get_connection_addr('default'))
    self.checker_threads = cc.start_monitor_threads(self.health_checkers)

    # parse chaos object
    chaos_config = cc.gen_experiment_config(chaos_yaml)
    self._chaos_config = chaos_config  # cache the chaos config for tear down
    log.info(f"chaos_config: {chaos_config}")

    # parse the test expectations in testcases.yaml
    if self.parser_testcase_config(chaos_yaml) is False:
        message = "Fail to get the testcase info in testcases.yaml"
        log.error(message)
        # Raise explicitly: a bare `assert False` is removed under `-O`.
        raise AssertionError(message)

    # init report; exist_ok avoids the check-then-create race
    meta_name = chaos_config.get('metadata', None).get('name', None)
    dir_name = "./reports"
    file_name = f"{dir_name}/{meta_name}.log"
    os.makedirs(dir_name, exist_ok=True)

    # let the checkers collect baseline statistics
    sleep(constants.WAIT_PER_OP * 2)

    # assert statistic: all ops are expected to succeed before chaos
    log.info("******1st assert before chaos: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        f.write("1st assert before chaos: ")
        f.write(f"{self.health_checkers}\n")

    # apply chaos object
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.info("chaos injected")
    log.info(f"chaos information: {chaos_res.get(meta_name)}")
    sleep(constants.WAIT_PER_OP * 2.1)

    # reset counting so the next assert only covers the chaos period
    cc.reset_counting(self.health_checkers)

    # keep the chaos applied for CHAOS_DURATION
    sleep(constants.CHAOS_DURATION)
    for name, thread in self.checker_threads.items():
        log.info(f"10s later: Thread {name} is_alive(): {thread.is_alive()}")

    # assert statistic against the per-operation expectations
    log.info("******2nd assert after chaos injected: ")
    assert_statistic(self.health_checkers,
                     expectations={
                         Op.create: self.expect_create,
                         Op.insert: self.expect_insert,
                         Op.flush: self.expect_flush,
                         Op.index: self.expect_index,
                         Op.search: self.expect_search,
                         Op.query: self.expect_query
                     })
    with open(file_name, "a+") as f:
        f.write("2nd assert after chaos injected:")
        f.write(f"{self.health_checkers}\n")

    # delete chaos
    chaos_res.delete(meta_name)
    log.info("chaos deleted")
    for name, thread in self.checker_threads.items():
        log.info(f"Thread {name} is_alive(): {thread.is_alive()}")
    sleep(2)

    # reconnect if needed
    sleep(constants.WAIT_PER_OP * 2)
    cc.reconnect(connections, alias='default')

    # reset counting again
    cc.reset_counting(self.health_checkers)

    # recovery window (varies by feature)
    sleep(constants.WAIT_PER_OP * 5)

    # assert statistic: all ops success again
    log.info("******3rd assert after chaos deleted: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        f.write("3rd assert after chaos deleted:")
        f.write(f"{self.health_checkers}\n")

    # assert all expectations
    assert_expectations()
    log.info(
        "*********************Chaos Test Completed**********************")
def test_bulk_load(self, chaos_type, target_component):
    """Check the monitored operations while chaos hits ``target_component``.

    The chaos object is loaded from
    ``chaos_objects/<chaos_type>/chaos_<target_component>_<chaos_type>.yaml``
    and given a timestamped name; its "release" and
    "app.kubernetes.io/instance" keys are pointed at ``self.instance_name``
    via ``update_key_value``. ``assert_statistic`` runs before the chaos,
    during it (expecting ``Op.bulk_load`` to fail) and after the chaos is
    deleted and the pods are ready again.

    Args:
        chaos_type: Chaos kind used as directory and file-name suffix of the
            experiment file.
        target_component: Component name used in the experiment file name.
    """
    # start the monitor threads to check the milvus ops
    log.info("*********************Chaos Test Start**********************")
    log.info(connections.get_connection_addr('default'))
    release_name = self.instance_name
    cc.start_monitor_threads(self.health_checkers)

    # build the chaos object for this component / chaos type
    chaos_config = cc.gen_experiment_config(
        f"{str(Path(__file__).absolute().parent)}/chaos_objects/{chaos_type}/chaos_{target_component}_{chaos_type}.yaml"
    )
    # timestamped so repeated runs do not reuse the same experiment name
    meta_name = f"test-bulk-load-{int(time.time())}"
    chaos_config['metadata']['name'] = meta_name
    update_key_value(chaos_config, "release", release_name)
    update_key_value(chaos_config, "app.kubernetes.io/instance", release_name)
    self._chaos_config = chaos_config  # cache the chaos config for tear down
    log.info(f"chaos_config: {chaos_config}")

    # let the checkers collect baseline statistics (10 * WAIT_PER_OP)
    sleep(constants.WAIT_PER_OP * 10)

    # assert statistic: all ops are expected to succeed before chaos
    log.info("******1st assert before chaos: ")
    assert_statistic(self.health_checkers)

    # apply chaos object
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.info("chaos injected")
    sleep(constants.WAIT_PER_OP * 10)

    # reset counting so the next assert only covers the chaos period
    cc.reset_counting(self.health_checkers)

    # keep the chaos applied for CHAOS_DURATION
    sleep(constants.CHAOS_DURATION)
    log.info(f'Alive threads: {threading.enumerate()}')

    # assert statistic: bulk load is expected to fail under chaos
    log.info("******2nd assert after chaos injected: ")
    assert_statistic(self.health_checkers,
                     expectations={
                         Op.bulk_load: constants.FAIL,
                     })

    # delete chaos
    chaos_res.delete(meta_name)
    log.info("chaos deleted")
    sleep(2)

    # wait all pods ready (both label conventions are checked)
    log.info(
        f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={release_name}"
    )
    wait_pods_ready(constants.CHAOS_NAMESPACE,
                    f"app.kubernetes.io/instance={release_name}")
    log.info(
        f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={release_name}"
    )
    wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={release_name}")
    log.info("all pods are ready")

    # reconnect if needed
    sleep(constants.WAIT_PER_OP * 2)
    cc.reconnect(connections, alias='default')

    # recheck failed tasks in third assert
    self.health_checkers[Op.bulk_load].recheck_failed_task = True

    # reset counting again
    cc.reset_counting(self.health_checkers)

    # recovery window (10 * WAIT_PER_OP; varies by feature)
    sleep(constants.WAIT_PER_OP * 10)

    # assert statistic: all ops success again
    log.info("******3rd assert after chaos deleted: ")
    assert_statistic(self.health_checkers)

    # assert all expectations
    assert_expectations()
    log.info(
        "*********************Chaos Test Completed**********************")