def test_memory_stress_replicas_group_load_balance(self, prepare_collection):
    """
    target: test apply memory stress on replicas and load balance inside group
    method: 1.Deploy milvus and limit querynode memory 6Gi
            2.Insert 1000,000 entities (500Mb), load 2 replicas (memory usage 1.5Gb)
            3.Apply memory stress 4Gi on querynode
    expected: Verify that load balancing occurs
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()
    release_name = "mic-memory"

    # load and search
    collection_w.load(replica_number=2)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == "100%"

    # get the replicas and pick one querynode of the first group to stress
    replicas, _ = collection_w.get_replicas()
    chaos_querynode_id = replicas.groups[0].group_nodes[0]
    label = f"app.kubernetes.io/instance={release_name}, app.kubernetes.io/component=querynode"
    querynode_id_pod_pair = get_querynode_id_pod_pairs("chaos-testing", label)
    chaos_querynode_pod = querynode_id_pod_pair[chaos_querynode_id]

    # record the sealed segment count on the target node before chaos
    seg_info_before, _ = utility_w.get_query_segment_info(collection_w.name)
    seg_distribution_before = cf.get_segment_distribution(seg_info_before)
    segments_num_before = len(seg_distribution_before[chaos_querynode_id]["sealed"])
    log.debug(segments_num_before)
    log.debug(seg_distribution_before[chaos_querynode_id]["sealed"])

    # apply memory stress on the chosen pod only
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_replicas_memory_stress_pods.yaml")
    chaos_config['spec']['selector']['pods']['chaos-testing'] = [chaos_querynode_pod]
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug(f"Apply memory stress on querynode {chaos_querynode_id}, pod {chaos_querynode_pod}")

    # parse the chaos duration (e.g. "1h2m30s") into seconds explicitly,
    # instead of the previous string-rewrite + eval() trick
    duration = chaos_config.get('spec').get('duration')
    seconds, digits = 0, ''
    for ch in duration:
        if ch.isdigit():
            digits += ch
        else:
            # unit suffixes supported by the chaos spec: h / m / s
            seconds += int(digits) * {'h': 3600, 'm': 60, 's': 1}[ch]
            digits = ''
    sleep(seconds)
    chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

    # verify auto load balance happened: segments moved off the stressed node
    seg_info_after, _ = utility_w.get_query_segment_info(collection_w.name)
    seg_distribution_after = cf.get_segment_distribution(seg_info_after)
    segments_num_after = len(seg_distribution_after[chaos_querynode_id]["sealed"])
    log.debug(segments_num_after)
    log.debug(seg_distribution_after[chaos_querynode_id]["sealed"])
    assert segments_num_after < segments_num_before

    # the collection must still be searchable after the chaos is removed
    search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                        ct.default_float_vec_field_name,
                                        ct.default_search_params,
                                        ct.default_limit, timeout=120)
    assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
def test_memory_stress_replicas_befor_load(self, prepare_collection):
    """
    target: test querynode group load with insufficient memory
    method: 1.Limit querynode memory ? 2Gi
            2.Load sealed data (needed memory > memory limit)
    expected: Raise an exception
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()
    # NOTE(review): the expected error message is a placeholder and the
    # error-checked load below is commented out, so this test does not yet
    # assert the failure path described in the docstring — TODO confirm intent
    err = {"err_code": 1, "err_msg": "xxxxxxxxx"}
    # collection_w.load(replica_number=2, timeout=60, check_task=CheckTasks.err_res, check_items=err)
    collection_w.load(replica_number=5)
    utility_w.loading_progress(collection_w.name)
    # search after load; the result is not asserted here
    search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim), ct.default_float_vec_field_name,
                                        ct.default_search_params, ct.default_limit, timeout=60)
def test_memory_stress_replicas_load_balance_single_node(self, prepare_collection):
    """
    target: test apply memory stress on single node replica, and it OOMKilled
    method: 1.Deploy 2 querynodes and limit memory 6Gi
            2.Loading 1000,000 entities (data_size=500Mb) with 2 replicas (memory_usage=1.5Gb)
            3.Apply memory stress on one querynode and make it OOMKilled
    expected: After deleting chaos, querynode turns running, search successfully
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()

    # load and search
    collection_w.load(replica_number=2)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == "100%"
    query_res, _ = collection_w.query("int64 in [0]")
    assert len(query_res) != 0

    # apply memory stress: one node, stressed hard enough to be OOMKilled
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")
    chaos_config['spec']['mode'] = "one"
    chaos_config['spec']['stressors']['memory']['size'] = '6Gi'
    chaos_config['spec']['duration'] = "1m"
    log.debug(chaos_config)

    # parse duration string (e.g. "1m") into seconds explicitly,
    # instead of the previous string-rewrite + eval() trick
    duration = chaos_config.get('spec').get('duration')
    seconds, digits = 0, ''
    for ch in duration:
        if ch.isdigit():
            digits += ch
        else:
            # unit suffixes supported by the chaos spec: h / m / s
            seconds += int(digits) * {'h': 3600, 'm': 60, 's': 1}[ch]
            digits = ''
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    sleep(seconds)
    chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

    # release and load again, then verify search works after the node recovers
    collection_w.release()
    collection_w.load(replica_number=2)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == "100%"
    search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                        ct.default_float_vec_field_name,
                                        ct.default_search_params,
                                        ct.default_limit, timeout=120)
    assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
def test_memory_stress_replicas_group_sufficient(self, prepare_collection, mode):
    """
    target: test apply stress memory on one querynode and the memory is enough to load replicas
    method: 1.Limit all querynodes memory 6Gi
            2.Apply 3Gi memory stress on different number of querynodes
              (load whole collection need about 1.5GB)
    expected: Verify load successfully and search result are correct
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()

    # apply memory stress chaos
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")
    chaos_config['spec']['mode'] = mode
    chaos_config['spec']['duration'] = '3m'
    chaos_config['spec']['stressors']['memory']['size'] = '3Gi'
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("chaos injected")
    sleep(20)

    # fix: the `try:` statement had been commented out while its except/finally
    # clauses remained, which is a SyntaxError; restore the try and drop the
    # `raise Exception(str(e))` re-wrap that destroyed the original traceback —
    # a plain try/finally propagates the exception unchanged
    try:
        collection_w.load(replica_number=2, timeout=60)
        utility_w.loading_progress(collection_w.name)
        replicas, _ = collection_w.get_replicas()
        log.debug(replicas)
        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name,
                                            ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
        collection_w.release()
    finally:
        # always delete the chaos object, even when load/search failed
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(metadata_name=meta_name)
        log.debug("Test finished")
def test_memory_stress_replicas_cross_group_load_balance(self, prepare_collection):
    """
    target: test apply memory stress on one group and no load balance cross replica groups
    method: 1.Limit all querynodes memory 6Gi
            2.Create and insert 1000,000 entities
            3.Load collection with two replicas
            4.Apply memory stress on one group 80%
    expected: Verify that load balancing across groups is not occurring
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()
    release_name = "mic-memory"

    # load and search
    collection_w.load(replica_number=2)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == "100%"
    seg_info_before, _ = utility_w.get_query_segment_info(collection_w.name)

    # get the replica groups and all querynode pods of the first group
    replicas, _ = collection_w.get_replicas()
    group_nodes = list(replicas.groups[0].group_nodes)
    label = f"app.kubernetes.io/instance={release_name}, app.kubernetes.io/component=querynode"
    querynode_id_pod_pair = get_querynode_id_pod_pairs("chaos-testing", label)
    group_nodes_pod = [querynode_id_pod_pair[node_id] for node_id in group_nodes]

    # apply memory stress on every pod of that group
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_replicas_memory_stress_pods.yaml")
    chaos_config['spec']['selector']['pods']['chaos-testing'] = group_nodes_pod
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug(f"Apply memory stress on querynode {group_nodes}, pod {group_nodes_pod}")

    # parse the chaos duration (e.g. "1h2m30s") into seconds explicitly,
    # instead of the previous string-rewrite + eval() trick
    duration = chaos_config.get('spec').get('duration')
    seconds, digits = 0, ''
    for ch in duration:
        if ch.isdigit():
            digits += ch
        else:
            # unit suffixes supported by the chaos spec: h / m / s
            seconds += int(digits) * {'h': 3600, 'm': 60, 's': 1}[ch]
            digits = ''
    sleep(seconds)
    chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

    # verify NO cross-group load balance: per-node segment counts are unchanged
    seg_info_after, _ = utility_w.get_query_segment_info(collection_w.name)
    seg_distribution_before = cf.get_segment_distribution(seg_info_before)
    seg_distribution_after = cf.get_segment_distribution(seg_info_after)
    for node_id in group_nodes:
        assert len(seg_distribution_before[node_id]) == len(seg_distribution_after[node_id])

    # the collection must still be searchable after the chaos is removed
    search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                        ct.default_float_vec_field_name,
                                        ct.default_search_params,
                                        ct.default_limit, timeout=120)
    assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
def test_memory_stress_replicas_group_insufficient(self, prepare_collection, mode):
    """
    target: test apply stress memory on different number querynodes and the group
            failed to load, because the memory is insufficient
    method: 1.Limit querynodes memory 5Gi
            2.Create collection and insert 1000,000 entities
            3.Apply memory stress on querynodes and it's memory is not enough to load replicas
    expected: Verify load raise exception, and after delete chaos, load and search successfully
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")
    # update config
    chaos_config['spec']['mode'] = mode
    chaos_config['spec']['stressors']['memory']['size'] = '5Gi'
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    # chaos_start = time.time()
    log.debug("chaos injected")
    sleep(10)
    # fix: was `except Exception as e: raise Exception(str(e))`, which discarded
    # the original exception type and traceback; try/finally propagates as-is
    try:
        # load failed
        err = {"err_code": 1, "err_msg": "shuffleSegmentsToQueryNodeV2: insufficient memory of available node"}
        collection_w.load(replica_number=5, timeout=60, check_task=CheckTasks.err_res, check_items=err)

        # query failed because not loaded
        err = {"err_code": 1, "err_msg": "not loaded into memory"}
        collection_w.query("int64 in [0]", check_task=CheckTasks.err_res, check_items=err)

        # delete chaos
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(metadata_name=meta_name)
        sleep(10)

        # after delete chaos load and query successfully
        collection_w.load(replica_number=5, timeout=60)
        progress, _ = utility_w.loading_progress(collection_w.name)
        # assert progress["loading_progress"] == "100%"
        query_res, _ = collection_w.query("int64 in [0]")
        assert len(query_res) != 0
        collection_w.release()
    finally:
        log.debug("Test finished")
def test_chaos_memory_stress_replicas_OOM(self, prepare_collection, mode):
    """
    target: test apply memory stress during loading, and querynode OOMKilled
    method: 1.Deploy and limit querynode memory limit 6Gi
            2.Create collection and insert 1000,000 entities
            3.Apply memory stress and querynode OOMKilled during loading replicas
    expected: Verify the mic is available to load and search after querynode restart
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()

    def _search_and_check():
        # one-vector search; expect exactly default_limit hits back
        res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                     ct.default_float_vec_field_name,
                                     ct.default_search_params,
                                     ct.default_limit, timeout=120)
        assert 1 == len(res) and ct.default_limit == len(res[0])

    # build and inject the memory-stress chaos experiment
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")
    spec = chaos_config['spec']
    spec['mode'] = mode
    spec['duration'] = '3m'
    spec['stressors']['memory']['size'] = '6Gi'
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("chaos injected")

    # start an async load while the stress is active, then wait for completion
    collection_w.load(replica_number=2, timeout=60, _async=True)
    utility_w.wait_for_loading_complete(collection_w.name)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == '100%'

    # let the chaos run out, then remove it
    sleep(180)
    chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

    # TODO search failed
    _search_and_check()

    # reload from scratch and verify search again
    collection_w.release()
    collection_w.load(replica_number=2)
    _search_and_check()