def test_memory_stress_replicas_group_load_balance(self, prepare_collection):
    """
    target: test apply memory stress on replicas and load balance inside group
    method: 1.Deploy milvus and limit querynode memory 6Gi
            2.Insert 1,000,000 entities (500Mb), load 2 replicas (memory usage 1.5Gb)
            3.Apply memory stress 4Gi on querynode
    expected: Verify that load balancing occurs
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()
    release_name = "mic-memory"

    # load and search
    collection_w.load(replica_number=2)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == "100%"

    # get the replica and choose a random querynode to chaos
    replicas, _ = collection_w.get_replicas()
    chaos_querynode_id = replicas.groups[0].group_nodes[0]
    label = f"app.kubernetes.io/instance={release_name}, app.kubernetes.io/component=querynode"
    querynode_id_pod_pair = get_querynode_id_pod_pairs("chaos-testing", label)
    chaos_querynode_pod = querynode_id_pod_pair[chaos_querynode_id]

    # get the segment num before chaos
    seg_info_before, _ = utility_w.get_query_segment_info(collection_w.name)
    seg_distribution_before = cf.get_segment_distribution(seg_info_before)
    segments_num_before = len(seg_distribution_before[chaos_querynode_id]["sealed"])
    log.debug(segments_num_before)
    log.debug(seg_distribution_before[chaos_querynode_id]["sealed"])

    # apply memory stress
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_replicas_memory_stress_pods.yaml")
    chaos_config['spec']['selector']['pods']['chaos-testing'] = [chaos_querynode_pod]
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug(f"Apply memory stress on querynode {chaos_querynode_id}, pod {chaos_querynode_pod}")

    duration = chaos_config.get('spec').get('duration')
    duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
    sleep(eval(duration))
    chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

    # verify auto load balance
    seg_info_after, _ = utility_w.get_query_segment_info(collection_w.name)
    seg_distribution_after = cf.get_segment_distribution(seg_info_after)
    segments_num_after = len(seg_distribution_after[chaos_querynode_id]["sealed"])
    log.debug(segments_num_after)
    log.debug(seg_distribution_after[chaos_querynode_id]["sealed"])
    assert segments_num_after < segments_num_before

    search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                        ct.default_float_vec_field_name,
                                        ct.default_search_params,
                                        ct.default_limit, timeout=120)
    assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
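# --- Illustrative helper (not part of the original suite) ---
# The tests in this file convert a chaos duration string such as "3m" into
# seconds by string-rewriting plus eval(). A minimal sketch of a safer
# equivalent, assuming durations only combine h/m/s components; the name
# parse_chaos_duration is hypothetical.
import re


def parse_chaos_duration(duration: str) -> int:
    """Convert a chaos-mesh style duration like '1h30m15s' into total seconds."""
    units = {'h': 3600, 'm': 60, 's': 1}
    return sum(int(value) * units[unit]
               for value, unit in re.findall(r'(\d+)([hms])', duration))

# Usage sketch: sleep(parse_chaos_duration(chaos_config['spec']['duration']))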
def parser_testcase_config(self, chaos_yaml, chaos_config):
    cluster_nodes = check_cluster_nodes(chaos_config)
    tests_yaml = constants.TESTS_CONFIG_LOCATION + 'testcases.yaml'
    tests_config = cc.gen_experiment_config(tests_yaml)
    test_collections = tests_config.get('Collections', None)
    for t in test_collections:
        test_chaos = t.get('testcase', {}).get('chaos', {})
        if test_chaos in chaos_yaml:
            expects = t.get('testcase', {}).get('expectation', {}).get('cluster_1_node', {})
            # for the cluster_n_node
            if cluster_nodes > 1:
                expects = t.get('testcase', {}).get('expectation', {}).get('cluster_n_node', {})
            log.info(f"yaml.expects: {expects}")
            self.expect_create = expects.get(Op.create.value, constants.SUCC)
            self.expect_insert = expects.get(Op.insert.value, constants.SUCC)
            self.expect_flush = expects.get(Op.flush.value, constants.SUCC)
            self.expect_index = expects.get(Op.index.value, constants.SUCC)
            self.expect_search = expects.get(Op.search.value, constants.SUCC)
            self.expect_query = expects.get(Op.query.value, constants.SUCC)
            log.info(f"self.expects: create:{self.expect_create}, insert:{self.expect_insert}, "
                     f"flush:{self.expect_flush}, index:{self.expect_index}, "
                     f"search:{self.expect_search}, query:{self.expect_query}")
            return True
    return False
def test_chaos_memory_stress_indexnode(self, connection, chaos_yaml):
    """
    target: test inject memory stress into indexnode
    method: 1.Deploy milvus and limit indexnode memory resource 3 / 4Gi
            2.Create collection and insert some data
            3.Inject memory stress chaos 512Mi
            4.Create index
    expected: Verify the index is created successfully
    """
    # init collection and insert
    nb = 256000  # vector size: 512*4*nb is about 512Mi, and creating the index needs about 2.8Gi memory
    dim = 512
    # c_name = cf.gen_unique_str('chaos_memory')
    c_name = 'chaos_memory_gKs8aSUu'
    index_params = {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 128}}
    collection_w = ApiCollectionWrapper()
    collection_w.init_collection(name=c_name,
                                 schema=cf.gen_default_collection_schema(dim=dim),
                                 shards_num=1)

    # insert 256000 512-dim entities, size 512Mi
    for i in range(2):
        t0_insert = datetime.datetime.now()
        df = cf.gen_default_dataframe_data(nb=nb // 2, dim=dim)
        res = collection_w.insert(df)[0]
        assert res.insert_count == nb // 2
        # log.info(f'After {i + 1} insert, num_entities: {collection_w.num_entities}')
        tt_insert = datetime.datetime.now() - t0_insert
        log.info(f"{i} insert data cost: {tt_insert}")

    # flush
    t0_flush = datetime.datetime.now()
    assert collection_w.num_entities == nb
    tt_flush = datetime.datetime.now() - t0_flush
    log.info(f'flush {nb} entities cost: {tt_flush}')

    if collection_w.has_index()[0]:
        log.info(collection_w.indexes[0].params)
        collection_w.drop_index()

    # indexNode starts building the index; inject chaos memory stress
    chaos_config = gen_experiment_config(chaos_yaml)
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("inject chaos")

    # create index
    t0_index = datetime.datetime.now()
    index, _ = collection_w.create_index(field_name=ct.default_float_vec_field_name,
                                         index_params=index_params)
    tt_index = datetime.datetime.now() - t0_index
    log.info(f"create index cost: {tt_index}")
    log.info(collection_w.indexes[0].params)
def apply_memory_stress(chaos_yaml):
    chaos_config = gen_experiment_config(chaos_yaml)
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("chaos injected")
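# Usage sketch (the YAML path below is hypothetical; point it at an actual
# chaos object file in ./chaos_objects/memory_stress/):
# apply_memory_stress("./chaos_objects/memory_stress/chaos_datanode_memory_stress.yaml")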
def test_chaos_memory_stress_querynode(self, connection, chaos_yaml):
    """
    target: explore query node behavior after memory stress chaos injected and recovered
    method: 1. Create a collection, insert some data
            2. Inject memory stress chaos
            3. Start a thread to load, search and query
            4. After chaos duration, check query and search success rate
            5. Delete chaos, or let it finish, in the finally block
    expected: 1. If memory is insufficient, querynode is OOMKilled and available after restart
              2. If memory is sufficient, succ rate of query and search both are 1.0
    """
    c_name = 'chaos_memory_nx6DNW4q'
    collection_w = ApiCollectionWrapper()
    collection_w.init_collection(c_name)
    log.debug(collection_w.schema)
    log.debug(collection_w._shards_num)

    # apply memory stress chaos
    chaos_config = gen_experiment_config(chaos_yaml)
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("chaos injected")
    duration = chaos_config.get('spec').get('duration')
    duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
    meta_name = chaos_config.get('metadata').get('name')

    # wait for the memory stress to take effect
    sleep(constants.WAIT_PER_OP * 2)

    # release, load, query and search in a loop for the chaos duration
    try:
        start = time.time()
        while time.time() - start < eval(duration):
            collection_w.release()
            collection_w.load()
            term_expr = f'{ct.default_int64_field_name} in {[random.randint(0, 100)]}'
            query_res, _ = collection_w.query(term_expr)
            assert len(query_res) == 1
            search_res, _ = collection_w.search(cf.gen_vectors(1, ct.default_dim),
                                                ct.default_float_vec_field_name,
                                                ct.default_search_params,
                                                ct.default_limit)
            log.debug(search_res[0].ids)
            assert len(search_res[0].ids) == ct.default_limit
    finally:
        chaos_res.delete(meta_name)
def test_memory_stress_replicas_group_insufficient(self, prepare_collection, mode):
    """
    target: test apply memory stress on different numbers of querynodes so that
            the group fails to load because the memory is insufficient
    method: 1.Limit querynodes memory 5Gi
            2.Create collection and insert 1,000,000 entities
            3.Apply memory stress on querynodes so their memory is not enough to load replicas
    expected: Verify load raises an exception; after deleting the chaos, load and search succeed
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")

    # update config
    chaos_config['spec']['mode'] = mode
    chaos_config['spec']['stressors']['memory']['size'] = '5Gi'
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    # chaos_start = time.time()
    log.debug("chaos injected")
    sleep(10)

    try:
        # load fails because of insufficient memory
        err = {"err_code": 1,
               "err_msg": "shuffleSegmentsToQueryNodeV2: insufficient memory of available node"}
        collection_w.load(replica_number=5, timeout=60,
                          check_task=CheckTasks.err_res, check_items=err)

        # query fails because the collection is not loaded
        err = {"err_code": 1, "err_msg": "not loaded into memory"}
        collection_w.query("int64 in [0]",
                           check_task=CheckTasks.err_res, check_items=err)

        # delete chaos
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(metadata_name=meta_name)
        sleep(10)

        # after deleting the chaos, load and query succeed
        collection_w.load(replica_number=5, timeout=60)
        progress, _ = utility_w.loading_progress(collection_w.name)
        # assert progress["loading_progress"] == "100%"
        query_res, _ = collection_w.query("int64 in [0]")
        assert len(query_res) != 0
        collection_w.release()
    finally:
        log.debug("Test finished")
def test_memory_stress_replicas_cross_group_load_balance(self, prepare_collection):
    """
    target: test apply memory stress on one group and verify there is no load balance across replica groups
    method: 1.Limit all querynodes memory 6Gi
            2.Create and insert 1,000,000 entities
            3.Load collection with two replicas
            4.Apply memory stress 80% on one group
    expected: Verify that load balancing across groups does not occur
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()
    release_name = "mic-memory"

    # load and search
    collection_w.load(replica_number=2)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == "100%"
    seg_info_before, _ = utility_w.get_query_segment_info(collection_w.name)

    # get the replica and the querynodes of one group
    replicas, _ = collection_w.get_replicas()
    group_nodes = list(replicas.groups[0].group_nodes)
    label = f"app.kubernetes.io/instance={release_name}, app.kubernetes.io/component=querynode"
    querynode_id_pod_pair = get_querynode_id_pod_pairs("chaos-testing", label)
    group_nodes_pod = [querynode_id_pod_pair[node_id] for node_id in group_nodes]

    # apply memory stress
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_replicas_memory_stress_pods.yaml")
    chaos_config['spec']['selector']['pods']['chaos-testing'] = group_nodes_pod
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug(f"Apply memory stress on querynodes {group_nodes}, pods {group_nodes_pod}")

    duration = chaos_config.get('spec').get('duration')
    duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
    sleep(eval(duration))
    chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

    # verify that no load balance happened across groups
    seg_info_after, _ = utility_w.get_query_segment_info(collection_w.name)
    seg_distribution_before = cf.get_segment_distribution(seg_info_before)
    seg_distribution_after = cf.get_segment_distribution(seg_info_after)
    for node_id in group_nodes:
        assert len(seg_distribution_before[node_id]) == len(seg_distribution_after[node_id])

    search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                        ct.default_float_vec_field_name,
                                        ct.default_search_params,
                                        ct.default_limit, timeout=120)
    assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
def test_chaos_memory_stress_etcd(self, chaos_yaml):
    """
    target: test inject memory stress into all etcd pods
    method: 1.Deploy milvus and limit etcd memory resource 1Gi with all mode
            2.Continuously and concurrently do milvus operations
            3.Inject memory stress chaos 51024Mi
            4.After duration, delete chaos stress
    expected: Verify milvus operation succ rate
    """
    mic_checkers = {
        Op.create: CreateChecker(),
        Op.insert: InsertFlushChecker(),
        Op.flush: InsertFlushChecker(flush=True),
        Op.index: IndexChecker(),
        Op.search: SearchChecker(),
        Op.query: QueryChecker()
    }
    # start threads to keep running milvus ops
    start_monitor_threads(mic_checkers)

    # parse chaos object
    chaos_config = cc.gen_experiment_config(chaos_yaml)
    # duration = chaos_config["spec"]["duration"]
    meta_name = chaos_config.get('metadata').get('name')
    duration = chaos_config.get('spec').get('duration')

    # apply chaos object
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.info("Chaos injected")

    # convert the duration string to an int number of seconds
    if isinstance(duration, str):
        duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
    else:
        log.error("Duration must be string type")

    # delete the experiment after it's over
    timer = threading.Timer(interval=eval(duration),
                            function=chaos_res.delete,
                            args=(meta_name, False))
    timer.start()
    timer.join()

    # output milvus op succ rate
    for k, ch in mic_checkers.items():
        log.debug(f'Succ rate of {k.value}: {ch.succ_rate()}')
        assert ch.succ_rate() == 1.0
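# --- Illustrative sketch (hypothetical, for context only) ---
# The succ_rate() assertion above relies on checkers that count successful and
# total operations while the monitor threads run. A minimal sketch of that
# interface, assuming only counting; the real Checker classes in this suite
# carry much more state.
class CheckerSketch:
    def __init__(self):
        self._succ = 0
        self._total = 0

    def record(self, ok: bool):
        # count one operation, successful or not
        self._total += 1
        if ok:
            self._succ += 1

    def succ_rate(self) -> float:
        # ratio of successful ops; 0.0 before any op has run
        return self._succ / self._total if self._total else 0.0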
def parser_testcase_config(self, chaos_yaml, chaos_config):
    # TODO: need a better way (maybe recursion) to parse chaos_config
    # The selector key is located at a different depth depending on the chaos config's kind.
    # For now, there are two kinds of chaos config: xxChaos and Schedule (applied in pod kill chaos).
    if chaos_config["kind"] == "Schedule":
        for k, v in chaos_config["spec"].items():
            if "Chaos" in k and "selector" in v.keys():
                selector = v["selector"]
                break
    else:
        selector = chaos_config["spec"]["selector"]
    log.info(f"chaos target selector: {selector}")
    tests_yaml = constants.TESTS_CONFIG_LOCATION + 'testcases.yaml'
    tests_config = cc.gen_experiment_config(tests_yaml)
    test_collections = tests_config.get('Collections', None)
    for t in test_collections:
        test_chaos = t.get('testcase', {}).get('chaos', {})
        if test_chaos in chaos_yaml:
            expects = t.get('testcase', {}).get('expectation', {}).get('cluster_1_node', {})
            # get the number of pods the chaos targets
            namespace = selector["namespaces"][0]
            labels_dict = selector["labelSelectors"]
            labels_list = []
            for k, v in labels_dict.items():
                labels_list.append(k + "=" + v)
            labels_str = ",".join(labels_list)
            pods = get_pod_list(namespace, labels_str)
            # for the cluster_n_node
            if len(pods) > 1:
                expects = t.get('testcase', {}).get('expectation', {}).get('cluster_n_node', {})
            log.info(f"yaml.expects: {expects}")
            self.expect_create = expects.get(Op.create.value, constants.SUCC)
            self.expect_insert = expects.get(Op.insert.value, constants.SUCC)
            self.expect_flush = expects.get(Op.flush.value, constants.SUCC)
            self.expect_index = expects.get(Op.index.value, constants.SUCC)
            self.expect_search = expects.get(Op.search.value, constants.SUCC)
            self.expect_query = expects.get(Op.query.value, constants.SUCC)
            log.info(f"self.expects: create:{self.expect_create}, insert:{self.expect_insert}, "
                     f"flush:{self.expect_flush}, index:{self.expect_index}, "
                     f"search:{self.expect_search}, query:{self.expect_query}")
            return True
    return False
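# --- Illustrative sketch addressing the TODO above (hypothetical helper) ---
# A recursive walk that returns the first "selector" mapping found at any depth
# of a chaos config, so xxChaos and Schedule kinds are handled uniformly.
def find_selector(node):
    """Depth-first search for a 'selector' key in a nested chaos config."""
    if isinstance(node, dict):
        if 'selector' in node:
            return node['selector']
        for value in node.values():
            found = find_selector(value)
            if found is not None:
                return found
    elif isinstance(node, list):
        for item in node:
            found = find_selector(item)
            if found is not None:
                return found
    return None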
def reboot_pod(chaos_yaml):
    # parse chaos object
    chaos_config = gen_experiment_config(chaos_yaml)
    log.debug(chaos_config)
    # inject chaos
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("chaos injected")
    sleep(7)
    # delete chaos
    meta_name = chaos_config.get('metadata', None).get('name', None)
    chaos_res.delete(meta_name)
    log.debug("chaos deleted")
def test_memory_stress_replicas_load_balance_single_node(self, prepare_collection):
    """
    target: test apply memory stress on a single-node replica so that it is OOMKilled
    method: 1.Deploy 2 querynodes and limit memory 6Gi
            2.Load 1,000,000 entities (data_size=500Mb) with 2 replicas (memory_usage=1.5Gb)
            3.Apply memory stress on one querynode and make it OOMKilled
    expected: After deleting the chaos, the querynode turns running and search succeeds
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()

    # load and search
    collection_w.load(replica_number=2)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == "100%"
    query_res, _ = collection_w.query("int64 in [0]")
    assert len(query_res) != 0

    # apply memory stress
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")
    # update config
    chaos_config['spec']['mode'] = "one"
    chaos_config['spec']['stressors']['memory']['size'] = '6Gi'
    chaos_config['spec']['duration'] = "1m"
    log.debug(chaos_config)
    duration = chaos_config.get('spec').get('duration')
    duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    sleep(eval(duration))
    chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

    # release and load again
    collection_w.release()
    collection_w.load(replica_number=2)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == "100%"
    search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                        ct.default_float_vec_field_name,
                                        ct.default_search_params,
                                        ct.default_limit, timeout=120)
    assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
def test_chaos_memory_stress_datanode(self, chaos_yaml):
    """
    target: test inject memory stress into dataNode
    method: 1.Deploy milvus and limit datanode memory resource
            2.Create collection and insert some data
            3.Inject memory stress chaos
            4.Continue to insert data
    expected: Verify that data can still be inserted after the chaos is injected
    """
    # init collection and insert 10 batches of 25,000 entities
    nb = 25000
    dim = 512
    c_name = cf.gen_unique_str('chaos_memory')
    collection_w = ApiCollectionWrapper()
    collection_w.init_collection(name=c_name,
                                 schema=cf.gen_default_collection_schema(dim=dim))
    for i in range(10):
        t0 = datetime.datetime.now()
        df = cf.gen_default_dataframe_data(nb=nb, dim=dim)
        res = collection_w.insert(df)[0]
        assert res.insert_count == nb
        log.info(f'After {i + 1} insert, num_entities: {collection_w.num_entities}')
        tt = datetime.datetime.now() - t0
        log.info(f"{i} insert and flush data cost: {tt}")

    # inject memory stress
    chaos_config = gen_experiment_config(chaos_yaml)
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("chaos injected")

    # continue to insert data
    collection_w.insert(df)
    log.info(f'Total num entities: {collection_w.num_entities}')

    # delete chaos
    meta_name = chaos_config.get('metadata', None).get('name', None)
    chaos_res.delete(metadata_name=meta_name)
def test_chaos_memory_stress_replicas_OOM(self, prepare_collection, mode):
    """
    target: test apply memory stress during loading so that the querynode is OOMKilled
    method: 1.Deploy and limit querynode memory 6Gi
            2.Create collection and insert 1,000,000 entities
            3.Apply memory stress so the querynode is OOMKilled while loading replicas
    expected: Verify milvus is available to load and search after the querynode restarts
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")
    chaos_config['spec']['mode'] = mode
    chaos_config['spec']['duration'] = '3m'
    chaos_config['spec']['stressors']['memory']['size'] = '6Gi'
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("chaos injected")

    collection_w.load(replica_number=2, timeout=60, _async=True)
    utility_w.wait_for_loading_complete(collection_w.name)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == '100%'

    sleep(180)
    chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

    # TODO search failed
    search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                        ct.default_float_vec_field_name,
                                        ct.default_search_params,
                                        ct.default_limit, timeout=120)
    assert 1 == len(search_res) and ct.default_limit == len(search_res[0])

    collection_w.release()
    collection_w.load(replica_number=2)
    search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                        ct.default_float_vec_field_name,
                                        ct.default_search_params,
                                        ct.default_limit, timeout=120)
    assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
def test_memory_stress_replicas_group_sufficient(self, prepare_collection, mode):
    """
    target: test apply memory stress on one querynode while the memory is still enough to load replicas
    method: 1.Limit all querynodes memory 6Gi
            2.Apply 3Gi memory stress on different numbers of querynodes
              (loading the whole collection needs about 1.5GB)
    expected: Verify load succeeds and search results are correct
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()

    # apply memory stress chaos
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")
    chaos_config['spec']['mode'] = mode
    chaos_config['spec']['duration'] = '3m'
    chaos_config['spec']['stressors']['memory']['size'] = '3Gi'
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("chaos injected")
    sleep(20)

    try:
        collection_w.load(replica_number=2, timeout=60)
        utility_w.loading_progress(collection_w.name)
        replicas, _ = collection_w.get_replicas()
        log.debug(replicas)
        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name,
                                            ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
        collection_w.release()
    finally:
        # delete chaos
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(metadata_name=meta_name)
        log.debug("Test finished")
def parser_testcase_config(self, chaos_yaml):
    tests_yaml = constants.TESTS_CONFIG_LOCATION + 'testcases.yaml'
    tests_config = cc.gen_experiment_config(tests_yaml)
    test_collections = tests_config.get('Collections', None)
    ms = MilvusSys(alias="default")
    node_map = {
        "querynode": "query_nodes",
        "datanode": "data_nodes",
        "indexnode": "index_nodes",
        "proxy": "proxy_nodes"
    }
    for t in test_collections:
        test_chaos = t.get('testcase', {}).get('chaos', {})
        if test_chaos in chaos_yaml:
            expects = t.get('testcase', {}).get('expectation', {}).get('cluster_1_node', {})
            # for cluster_n_node mode
            for node in node_map.keys():
                if node in test_chaos and len(getattr(ms, node_map[node])) > 1:
                    expects = t.get('testcase', {}).get('expectation', {}).get('cluster_n_node', {})
            log.info(f"yaml.expects: {expects}")
            self.expect_create = expects.get(Op.create.value, constants.SUCC)
            self.expect_insert = expects.get(Op.insert.value, constants.SUCC)
            self.expect_flush = expects.get(Op.flush.value, constants.SUCC)
            self.expect_index = expects.get(Op.index.value, constants.SUCC)
            self.expect_search = expects.get(Op.search.value, constants.SUCC)
            self.expect_query = expects.get(Op.query.value, constants.SUCC)
            log.info(f"self.expects: create:{self.expect_create}, insert:{self.expect_insert}, "
                     f"flush:{self.expect_flush}, index:{self.expect_index}, "
                     f"search:{self.expect_search}, query:{self.expect_query}")
            return True
    return False
def test_multi_replicas_with_only_one_group_available(self, chaos_type, failed_node_type,
                                                      failed_group_scope, is_streaming):
    # start the monitor threads to check the milvus ops
    log.info("*********************Chaos Test Start**********************")
    # log.info(f"chaos_yaml: {chaos_yaml}")
    log.info(connections.get_connection_addr('default'))
    if is_streaming is False:
        del self.health_checkers[Op.insert]
    cc.start_monitor_threads(self.health_checkers)

    # get replicas info
    release_name = "milvus-multi-querynode"
    querynode_id_pod_pair = get_querynode_info(release_name)
    log.info(querynode_id_pod_pair)
    group_list = []
    shard_leader_list = []
    replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
    for g in replicas_info.groups:
        group_list.append(list(g.group_nodes))
        for shard in g.shards:
            shard_leader_list.append(shard.shard_leader)

    # keep only one group in healthy status; the other groups will be made unhealthy
    # by injecting pod failure chaos. In each affected group, one pod is put into
    # pod failure status.
    target_pod_list = []
    target_group = []
    group_list = sorted(group_list, key=lambda x: -len(x))
    if failed_group_scope == "one":
        target_group = random.sample(group_list, 1)
    if failed_group_scope == "except_one":
        target_group = random.sample(group_list, len(group_list) - 1)
    if failed_group_scope == "all":
        target_group = group_list[:]
    for g in target_group:
        target_nodes = []
        if failed_node_type == "shard_leader":
            target_nodes = list(set(g) & set(shard_leader_list))
        if failed_node_type == "non_shard_leader":
            target_nodes = list(set(g) - set(shard_leader_list))
        for target_node in target_nodes:
            pod = querynode_id_pod_pair[target_node]
            target_pod_list.append(pod)
    log.info(f"target_pod_list: {target_pod_list}")
    chaos_config = cc.gen_experiment_config(
        f"chaos/chaos_objects/template/{chaos_type}-by-pod-list.yaml")
    chaos_config['metadata']['name'] = f"test-multi-replicas-{int(time.time())}"
    meta_name = chaos_config.get('metadata', None).get('name', None)
    chaos_config['spec']['selector']['pods']['chaos-testing'] = target_pod_list
    self._chaos_config = chaos_config  # cache the chaos config for tear down
    log.info(f"chaos_config: {chaos_config}")

    # wait 20s
    sleep(constants.WAIT_PER_OP * 2)
    # replicas info
    replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
    log.info(f"replicas_info for search collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}")

    # assert statistic: all ops 100% succ
    log.info("******1st assert before chaos: ")
    assert_statistic(self.health_checkers)

    # apply chaos object
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.info("chaos injected")
    sleep(constants.WAIT_PER_OP * 2)
    # reset counting
    cc.reset_counting(self.health_checkers)
    # wait 120s
    sleep(constants.CHAOS_DURATION)
    log.info(f'Alive threads: {threading.enumerate()}')

    # node info
    querynode_id_pod_pair = get_querynode_info(release_name)
    log.info(querynode_id_pod_pair)
    # replicas info
    replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
    log.info(f"replicas_info for search collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}")
    replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
    log.info(f"replicas_info for query collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}")

    # assert statistic
    log.info("******2nd assert after chaos injected: ")
    expectations = {Op.search: constants.SUCC, Op.query: constants.SUCC}
    if failed_group_scope == "all":
        expectations = {Op.search: constants.FAIL, Op.query: constants.FAIL}
    assert_statistic(self.health_checkers, expectations=expectations)

    # delete chaos
    chaos_res.delete(meta_name)
    log.info("chaos deleted")
    sleep(2)

    # wait all pods ready
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={release_name}")
    ready_1 = wait_pods_ready(constants.CHAOS_NAMESPACE,
                              f"app.kubernetes.io/instance={release_name}")
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={release_name}")
    ready_2 = wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={release_name}")
    if ready_1 and ready_2:
        log.info("all pods are ready")

    # reconnect if needed
    sleep(constants.WAIT_PER_OP * 2)
    # cc.reconnect(connections, alias='default')
    # reset counting again
    cc.reset_counting(self.health_checkers)
    # wait 50s (varies by feature)
    sleep(constants.WAIT_PER_OP * 5)
    # node info
    querynode_id_pod_pair = get_querynode_info(release_name)
    log.info(querynode_id_pod_pair)
    sleep(120)
    # replicas info
    replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
    log.info(f"replicas_info for collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}")
    replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
    log.info(f"replicas_info for collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}")

    # assert statistic: all ops success again
    log.info("******3rd assert after chaos deleted: ")
    assert_statistic(self.health_checkers)
    # assert all expectations
    assert_expectations()
    log.info("*********************Chaos Test Completed**********************")
def test_chaos_data_consist(self, connection, chaos_yaml):
    """
    target: verify data consistency after chaos injected and recovered
    method: 1. create a collection, insert some data, search and query
            2. inject a chaos object
            3. reconnect to service
            4. verify a) data entities persist, index persists,
                      b) search and query results persist
    expected: collection data and results persist
    """
    c_name = cf.gen_unique_str('chaos_collection_')
    nb = 5000
    i_name = cf.gen_unique_str('chaos_index_')
    index_params = {"index_type": "IVF_SQ8",
                    "metric_type": "L2",
                    "params": {"nlist": 64}}

    # create
    t0 = datetime.datetime.now()
    collection_w = ApiCollectionWrapper()
    collection_w.init_collection(name=c_name,
                                 schema=cf.gen_default_collection_schema())
    tt = datetime.datetime.now() - t0
    log.info(f"assert create: {tt}")
    assert collection_w.name == c_name

    # insert
    data = cf.gen_default_list_data(nb=nb)
    t0 = datetime.datetime.now()
    _, res = collection_w.insert(data)
    tt = datetime.datetime.now() - t0
    log.info(f"assert insert: {tt}")
    assert res

    # flush
    t0 = datetime.datetime.now()
    assert collection_w.num_entities == nb
    tt = datetime.datetime.now() - t0
    log.info(f"assert flush: {tt}")

    # search
    collection_w.load()
    search_vectors = cf.gen_vectors(1, ct.default_dim)
    t0 = datetime.datetime.now()
    search_params = {"metric_type": "L2", "params": {"nprobe": 16}}
    search_res, _ = collection_w.search(data=search_vectors,
                                        anns_field=ct.default_float_vec_field_name,
                                        param=search_params, limit=1)
    tt = datetime.datetime.now() - t0
    log.info(f"assert search: {tt}")
    assert len(search_res) == 1

    # index
    t0 = datetime.datetime.now()
    index, _ = collection_w.create_index(field_name=ct.default_float_vec_field_name,
                                         index_params=index_params,
                                         name=i_name)
    tt = datetime.datetime.now() - t0
    log.info(f"assert index: {tt}")
    assert len(collection_w.indexes) == 1

    # query
    term_expr = f'{ct.default_int64_field_name} in [1001,1201,999,99]'
    t0 = datetime.datetime.now()
    query_res, _ = collection_w.query(term_expr)
    tt = datetime.datetime.now() - t0
    log.info(f"assert query: {tt}")
    assert len(query_res) == 4

    # reboot a pod
    reboot_pod(chaos_yaml)

    # parse chaos object
    chaos_config = cc.gen_experiment_config(chaos_yaml)
    meta_name = chaos_config.get('metadata', None).get('name', None)

    # wait all pods ready
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}")
    wait_pods_ready(constants.CHAOS_NAMESPACE,
                    f"app.kubernetes.io/instance={meta_name}")
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}")
    wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
    log.info("all pods are ready")

    # reconnect if needed
    sleep(constants.WAIT_PER_OP * 3)
    reconnect(connections, alias='default')

    # verify collection persists
    assert utility.has_collection(c_name)
    log.info("assert collection persists")
    collection_w2 = ApiCollectionWrapper()
    collection_w2.init_collection(c_name)
    # verify data persists
    assert collection_w2.num_entities == nb
    log.info("assert data persists")
    # verify index persists
    assert collection_w2.has_index(i_name)
    log.info("assert index persists")
    # verify search results persist
    collection_w2.load()
    t0 = datetime.datetime.now()
    search_res, _ = collection_w2.search(data=search_vectors,
                                         anns_field=ct.default_float_vec_field_name,
                                         param=search_params, limit=1)
    tt = datetime.datetime.now() - t0
    log.info(f"assert search: {tt}")
    assert len(search_res) == 1
    # verify query results persist
    query_res2, _ = collection_w2.query(term_expr)
    assert len(query_res2) == len(query_res)
    log.info("assert query result persists")
class TestChaos(TestChaosBase):

    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        connections.add_connection(default={"host": host, "port": port})
        connections.connect(alias='default')
        if connections.has_connection("default") is False:
            raise Exception("no connections")
        self.host = host
        self.port = port
        self.instance_name = get_milvus_instance_name(constants.CHAOS_NAMESPACE, host)

    @pytest.fixture(scope="function", autouse=True)
    def init_health_checkers(self):
        c_name = cf.gen_unique_str('MultiReplicasChecker_')
        replicas_num = 2
        shards_num = 2
        checkers = {
            Op.insert: InsertFlushChecker(collection_name=c_name,
                                          shards_num=shards_num),
            Op.search: SearchChecker(collection_name=c_name,
                                     shards_num=shards_num,
                                     replica_number=replicas_num),
            Op.query: QueryChecker(collection_name=c_name,
                                   shards_num=shards_num,
                                   replica_number=replicas_num)
        }
        self.health_checkers = checkers

    def teardown(self):
        chaos_res = CusResource(kind=self._chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        meta_name = self._chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(meta_name, raise_ex=False)
        sleep(2)
        log.info(f'Alive threads: {threading.enumerate()}')

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize("is_streaming",
                             cc.gen_experiment_config(config_file_name)['is_streaming'])  # [False, True]
    @pytest.mark.parametrize("failed_group_scope",
                             cc.gen_experiment_config(config_file_name)['failed_group_scope'])  # ["one", "except_one", "all"]
    @pytest.mark.parametrize("failed_node_type",
                             cc.gen_experiment_config(config_file_name)['failed_node_type'])  # ["non_shard_leader", "shard_leader"]
    @pytest.mark.parametrize("chaos_type",
                             cc.gen_experiment_config(config_file_name)['chaos_type'])  # ["pod-failure", "pod-kill"]
    def test_multi_replicas_with_only_one_group_available(self, chaos_type, failed_node_type,
                                                          failed_group_scope, is_streaming):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info("Test config")
        log.info(cc.gen_experiment_config(config_file_name))
        # log.info(f"chaos_yaml: {chaos_yaml}")
        log.info(connections.get_connection_addr('default'))
        if is_streaming is False:
            del self.health_checkers[Op.insert]
        cc.start_monitor_threads(self.health_checkers)

        # get replicas info
        release_name = self.instance_name
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        group_list = []
        shard_leader_list = []
        replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
        for g in replicas_info.groups:
            group_list.append(list(g.group_nodes))
            for shard in g.shards:
                shard_leader_list.append(shard.shard_leader)

        # keep only one group in healthy status; the other groups will be made unhealthy
        # by injecting pod failure chaos. In each affected group, one pod is put into
        # pod failure status.
        target_pod_list = []
        target_group = []
        group_list = sorted(group_list, key=lambda x: -len(x))
        if failed_group_scope == "one":
            target_group = random.sample(group_list, 1)
        if failed_group_scope == "except_one":
            target_group = random.sample(group_list, len(group_list) - 1)
        if failed_group_scope == "all":
            target_group = group_list[:]
        for g in target_group:
            target_nodes = []
            if failed_node_type == "shard_leader":
                target_nodes = list(set(g) & set(shard_leader_list))
            if failed_node_type == "non_shard_leader":
                target_nodes = list(set(g) - set(shard_leader_list))
            if len(target_nodes) == 0:
                log.info("there is no node satisfied, choose one randomly")
                target_nodes = [random.choice(g)]
            for target_node in target_nodes:
                pod = querynode_id_pod_pair[target_node]
                target_pod_list.append(pod)
        log.info(f"target_pod_list: {target_pod_list}")
        chaos_config = cc.gen_experiment_config(
            f"{str(Path(__file__).absolute().parent)}/chaos_objects/template/{chaos_type}-by-pod-list.yaml")
        chaos_config['metadata']['name'] = f"test-multi-replicas-{int(time.time())}"
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_config['spec']['selector']['pods']['chaos-testing'] = target_pod_list
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")

        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)
        # replicas info
        replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
        log.info(f"replicas_info for search collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}")

        # assert statistic: all ops 100% succ
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)

        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        sleep(constants.WAIT_PER_OP * 2)
        # reset counting
        cc.reset_counting(self.health_checkers)
        # wait 120s
        sleep(constants.CHAOS_DURATION)
        log.info(f'Alive threads: {threading.enumerate()}')

        # node info
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        # replicas info
        replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
        log.info(f"replicas_info for search collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}")
        replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
        log.info(f"replicas_info for query collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}")

        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        expectations = {Op.search: constants.SUCC, Op.query: constants.SUCC}
        if failed_group_scope == "all":
            expectations = {Op.search: constants.FAIL, Op.query: constants.FAIL}
        assert_statistic(self.health_checkers, expectations=expectations)

        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        sleep(2)

        # wait all pods ready
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={release_name}")
        ready_1 = wait_pods_ready(constants.CHAOS_NAMESPACE,
                                  f"app.kubernetes.io/instance={release_name}")
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={release_name}")
        ready_2 = wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={release_name}")
        if ready_1 and ready_2:
            log.info("all pods are ready")

        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        # cc.reconnect(connections, alias='default')
        # reset counting again
        cc.reset_counting(self.health_checkers)
        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)
        # node info
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        sleep(30)
        # replicas info
        replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
        log.info(f"replicas_info for collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}")
        replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
        log.info(f"replicas_info for collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}")

        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        # assert all expectations
        assert_expectations()
        log.info("*********************Chaos Test Completed**********************")
def test_chaos(self, chaos_yaml):
    # start the monitor threads to check the milvus ops
    log.info("*********************Chaos Test Start**********************")
    log.info(connections.get_connection_addr('default'))
    self.checker_threads = cc.start_monitor_threads(self.health_checkers)

    # parse chaos object
    chaos_config = cc.gen_experiment_config(chaos_yaml)
    self._chaos_config = chaos_config  # cache the chaos config for tear down
    log.info(f"chaos_config: {chaos_config}")

    # parse the test expectations in testcases.yaml
    if self.parser_testcase_config(chaos_yaml) is False:
        log.error("Fail to get the testcase info in testcases.yaml")
        assert False

    # init report
    meta_name = chaos_config.get('metadata', None).get('name', None)
    dir_name = "./reports"
    file_name = f"./reports/{meta_name}.log"
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    # wait 20s
    sleep(constants.WAIT_PER_OP * 2)

    # assert statistic: all ops 100% succ
    log.info("******1st assert before chaos: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        f.write("1st assert before chaos: ")
        f.write(f"{self.health_checkers}\n")

    # apply chaos object
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.info("chaos injected")
    log.info(f"chaos information: {chaos_res.get(meta_name)}")
    sleep(constants.WAIT_PER_OP * 2.1)
    # reset counting
    cc.reset_counting(self.health_checkers)

    # wait the chaos duration
    sleep(constants.CHAOS_DURATION)
    for k, t in self.checker_threads.items():
        log.info(f"Thread {k} is_alive(): {t.is_alive()}")

    # assert statistic
    log.info("******2nd assert after chaos injected: ")
    assert_statistic(self.health_checkers,
                     expectations={Op.create: self.expect_create,
                                   Op.insert: self.expect_insert,
                                   Op.flush: self.expect_flush,
                                   Op.index: self.expect_index,
                                   Op.search: self.expect_search,
                                   Op.query: self.expect_query})
    with open(file_name, "a+") as f:
        f.write("2nd assert after chaos injected:")
        f.write(f"{self.health_checkers}\n")

    # delete chaos
    chaos_res.delete(meta_name)
    log.info("chaos deleted")
    for k, t in self.checker_threads.items():
        log.info(f"Thread {k} is_alive(): {t.is_alive()}")
    sleep(2)

    # reconnect if needed
    sleep(constants.WAIT_PER_OP * 2)
    cc.reconnect(connections, alias='default')
    # reset counting again
    cc.reset_counting(self.health_checkers)
    # wait 50s (varies by feature)
    sleep(constants.WAIT_PER_OP * 5)

    # assert statistic: all ops success again
    log.info("******3rd assert after chaos deleted: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        f.write("3rd assert after chaos deleted:")
        f.write(f"{self.health_checkers}\n")

    # assert all expectations
    assert_expectations()
    log.info("*********************Chaos Test Completed**********************")
def test_chaos(self, chaos_yaml):
    # start the monitor threads to check the milvus ops
    log.info("*********************Chaos Test Start**********************")
    log.info(connections.get_connection_addr('default'))
    cc.start_monitor_threads(self.health_checkers)

    # parse chaos object
    chaos_config = cc.gen_experiment_config(chaos_yaml)
    meta_name = chaos_config.get('metadata', None).get('name', None)
    release_name = meta_name
    chaos_config_str = json.dumps(chaos_config)
    chaos_config_str = chaos_config_str.replace("milvus-chaos", release_name)
    chaos_config = json.loads(chaos_config_str)
    self._chaos_config = chaos_config  # cache the chaos config for tear down
    log.info(f"chaos_config: {chaos_config}")

    # parse the test expectations in testcases.yaml
    if self.parser_testcase_config(chaos_yaml, chaos_config) is False:
        log.error("Fail to get the testcase info in testcases.yaml")
        assert False

    # init report
    dir_name = "./reports"
    file_name = f"./reports/{meta_name}.log"
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    # wait 20s
    sleep(constants.WAIT_PER_OP * 2)

    # assert statistic: all ops 100% succ
    log.info("******1st assert before chaos: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        ts = time.strftime("%Y-%m-%d %H:%M:%S")
        f.write(f"{meta_name}-{ts}\n")
        f.write("1st assert before chaos:\n")
        f.write(record_results(self.health_checkers))

    # apply chaos object
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.info("chaos injected")
    log.info(f"chaos information: {chaos_res.get(meta_name)}")
    sleep(constants.WAIT_PER_OP * 2)
    # reset counting
    cc.reset_counting(self.health_checkers)

    # wait the chaos duration
    sleep(constants.CHAOS_DURATION)
    log.info(f'Alive threads: {threading.enumerate()}')

    # assert statistic
    log.info("******2nd assert after chaos injected: ")
    assert_statistic(self.health_checkers,
                     expectations={Op.create: self.expect_create,
                                   Op.insert: self.expect_insert,
                                   Op.flush: self.expect_flush,
                                   Op.index: self.expect_index,
                                   Op.search: self.expect_search,
                                   Op.query: self.expect_query})
    with open(file_name, "a+") as f:
        f.write("2nd assert after chaos injected:\n")
        f.write(record_results(self.health_checkers))

    # delete chaos
    chaos_res.delete(meta_name)
    log.info("chaos deleted")
    log.info(f'Alive threads: {threading.enumerate()}')
    sleep(2)

    # wait all pods ready
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}")
    wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}")
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}")
    wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
    log.info("all pods are ready")

    # reconnect if needed
    sleep(constants.WAIT_PER_OP * 2)
    cc.reconnect(connections, alias='default')
    # reset counting again
    cc.reset_counting(self.health_checkers)
    # wait 50s (varies by feature)
    sleep(constants.WAIT_PER_OP * 5)

    # assert statistic: all ops success again
    log.info("******3rd assert after chaos deleted: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        f.write("3rd assert after chaos deleted:\n")
        f.write(record_results(self.health_checkers))

    # assert all expectations
    assert_expectations()
    log.info("*********************Chaos Test Completed**********************")
def test_bulk_load(self, chaos_type, target_component):
    # start the monitor threads to check the milvus ops
    log.info("*********************Chaos Test Start**********************")
    log.info(connections.get_connection_addr('default'))
    release_name = self.instance_name
    cc.start_monitor_threads(self.health_checkers)
    chaos_config = cc.gen_experiment_config(
        f"{str(Path(__file__).absolute().parent)}/chaos_objects/{chaos_type}/chaos_{target_component}_{chaos_type}.yaml")
    chaos_config['metadata']['name'] = f"test-bulk-load-{int(time.time())}"
    kind = chaos_config['kind']
    meta_name = chaos_config.get('metadata', None).get('name', None)
    update_key_value(chaos_config, "release", release_name)
    update_key_value(chaos_config, "app.kubernetes.io/instance", release_name)
    self._chaos_config = chaos_config  # cache the chaos config for tear down
    log.info(f"chaos_config: {chaos_config}")

    # wait for the checkers to warm up
    sleep(constants.WAIT_PER_OP * 10)

    # assert statistic: all ops 100% succ
    log.info("******1st assert before chaos: ")
    assert_statistic(self.health_checkers)

    # apply chaos object
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.info("chaos injected")
    sleep(constants.WAIT_PER_OP * 10)
    # reset counting
    cc.reset_counting(self.health_checkers)
    # wait 120s
    sleep(constants.CHAOS_DURATION)
    log.info(f'Alive threads: {threading.enumerate()}')

    # assert statistic
    log.info("******2nd assert after chaos injected: ")
    assert_statistic(self.health_checkers,
                     expectations={Op.bulk_load: constants.FAIL})

    # delete chaos
    chaos_res.delete(meta_name)
    log.info("chaos deleted")
    sleep(2)

    # wait all pods ready
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={release_name}")
    wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={release_name}")
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={release_name}")
    wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={release_name}")
    log.info("all pods are ready")

    # reconnect if needed
    sleep(constants.WAIT_PER_OP * 2)
    cc.reconnect(connections, alias='default')

    # recheck failed tasks in the third assert
    self.health_checkers[Op.bulk_load].recheck_failed_task = True
    # reset counting again
    cc.reset_counting(self.health_checkers)
    # wait a while (varies by feature)
    sleep(constants.WAIT_PER_OP * 10)

    # assert statistic: all ops success again
    log.info("******3rd assert after chaos deleted: ")
    assert_statistic(self.health_checkers)
    # assert all expectations
    assert_expectations()
    log.info("*********************Chaos Test Completed**********************")