def test_memory_stress_replicas_group_load_balance(self, prepare_collection):
    """
    target: test apply memory stress on replicas and load balance inside group
    method: 1.Deploy milvus and limit querynode memory 6Gi
            2.Insert 1,000,000 entities (500Mb), load 2 replicas (memory usage 1.5Gb)
            3.Apply memory stress 4Gi on querynode
    expected: Verify that load balancing occurs
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()
    release_name = "mic-memory"

    # load with two replicas and require the reported loading progress to be complete
    collection_w.load(replica_number=2)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == "100%"

    # the chaos target is the first node of the first replica group
    replicas, _ = collection_w.get_replicas()
    chaos_querynode_id = replicas.groups[0].group_nodes[0]
    label = f"app.kubernetes.io/instance={release_name}, app.kubernetes.io/component=querynode"
    querynode_id_pod_pair = get_querynode_id_pod_pairs("chaos-testing", label)
    chaos_querynode_pod = querynode_id_pod_pair[chaos_querynode_id]

    # get the sealed segment num on the target node before chaos
    seg_info_before, _ = utility_w.get_query_segment_info(collection_w.name)
    seg_distribution_before = cf.get_segment_distribution(seg_info_before)
    segments_num_before = len(seg_distribution_before[chaos_querynode_id]["sealed"])
    log.debug(segments_num_before)
    log.debug(seg_distribution_before[chaos_querynode_id]["sealed"])

    # apply memory stress to the selected querynode pod only
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_replicas_memory_stress_pods.yaml")
    chaos_config['spec']['selector']['pods']['chaos-testing'] = [chaos_querynode_pod]
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    try:
        log.debug(f"Apply memory stress on querynode {chaos_querynode_id}, pod {chaos_querynode_pod}")

        # turn a duration such as "1h30m" into the expression "1*3600+30*60++0" and evaluate it
        # NOTE(review): eval() runs on text taken from the chaos YAML; this is only acceptable
        # while that file is trusted repository content.
        duration = chaos_config.get('spec').get('duration')
        duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
        sleep(eval(duration))
    finally:
        # always remove the experiment, even if the wait above fails or is interrupted,
        # so the stress is not left applied to the pod
        chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

    # Verify auto load balance: the stressed node must hold fewer sealed segments than before
    seg_info_after, _ = utility_w.get_query_segment_info(collection_w.name)
    seg_distribution_after = cf.get_segment_distribution(seg_info_after)
    segments_num_after = len(seg_distribution_after[chaos_querynode_id]["sealed"])
    log.debug(segments_num_after)
    log.debug(seg_distribution_after[chaos_querynode_id]["sealed"])

    assert segments_num_after < segments_num_before

    # the collection must still be searchable after the chaos experiment
    search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                        ct.default_float_vec_field_name, ct.default_search_params,
                                        ct.default_limit, timeout=120)
    assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
class LoadBalanceChecker(Checker):
    """Check load balance operations in a dependent thread.

    Each iteration moves the sealed segments of one node to the other nodes of
    the same replica group and records whether the move was observed.
    NOTE(review): rsp_times, average_time, _succ and _fail are not set here;
    presumably they are initialised by the Checker base class - confirm.
    """

    def __init__(self, collection_name=None):
        """Create the utility wrapper and load the checked collection."""
        super().__init__(collection_name=collection_name)
        self.utility_wrap = ApiUtilityWrapper()
        # NOTE(review): enable_traceback is not defined in this class; presumably a module-level flag
        self.c_wrap.load(enable_traceback=enable_traceback)

    def keep_running(self):
        """Run load balance rounds forever, updating the success/failure counters."""
        while True:
            c_name = self.c_wrap.name
            res, _ = self.c_wrap.get_replicas()

            # prepare load balance params: use the first group which has multiple nodes
            group_nodes = next(
                (list(g.group_nodes) for g in res.groups if len(g.group_nodes) >= 2),
                [],
            )
            if not group_nodes:
                # without a multi-node group there is no source/destination pair;
                # record a failed round instead of raising IndexError and ending the loop
                log.debug("load balance skipped: no replica group has at least 2 nodes")
                self._fail += 1
                sleep(10)
                continue
            src_node_id = group_nodes[0]
            dst_node_ids = group_nodes[1:]
            res, _ = self.utility_wrap.get_query_segment_info(c_name)
            segment_distribution = cf.get_segment_distribution(res)
            sealed_segment_ids = segment_distribution[src_node_id]["sealed"]

            # load balance
            t0 = time.time()
            _, result = self.utility_wrap.load_balance(c_name, src_node_id,
                                                       dst_node_ids,
                                                       sealed_segment_ids)
            t1 = time.time()

            # get segments distribution after load balance
            time.sleep(3)
            res, _ = self.utility_wrap.get_query_segment_info(c_name)
            segment_distribution = cf.get_segment_distribution(res)
            sealed_segment_ids_after_load_balance = segment_distribution[src_node_id]["sealed"]

            # check_1: none of the moved segments is still on the source node
            check_1 = len(
                set(sealed_segment_ids)
                & set(sealed_segment_ids_after_load_balance)) == 0
            des_sealed_segment_ids = []
            for des_node_id in dst_node_ids:
                des_sealed_segment_ids += segment_distribution[des_node_id]["sealed"]
            # check_2: sealed_segment_ids is a subset of des_sealed_segment_ids
            check_2 = set(sealed_segment_ids).issubset(
                set(des_sealed_segment_ids))

            if result and (check_1 and check_2):
                self.rsp_times.append(t1 - t0)
                # running mean over all successful rounds
                self.average_time = (
                    (t1 - t0) + self.average_time * self._succ) / (self._succ + 1)
                self._succ += 1
                log.debug(
                    f"load balance success, time: {t1 - t0:.4f}, average_time: {self.average_time:.4f}"
                )
            else:
                self._fail += 1
            sleep(10)
def test_memory_stress_replicas_cross_group_load_balance(self, prepare_collection):
    """
    target: test apply memory stress on one group and no load balance cross replica groups
    method: 1.Limit all querynodes memory 6Gi
            2.Create and insert 1,000,000 entities
            3.Load collection with two replicas
            4.Apply memory stress on one group 80%
    expected: Verify that load balancing across groups is not occurring
    """
    collection_w = prepare_collection
    utility_w = ApiUtilityWrapper()
    release_name = "mic-memory"

    # load with two replicas and require the reported loading progress to be complete
    collection_w.load(replica_number=2)
    progress, _ = utility_w.loading_progress(collection_w.name)
    assert progress["loading_progress"] == "100%"
    seg_info_before, _ = utility_w.get_query_segment_info(collection_w.name)

    # the chaos target is every node of the first replica group
    replicas, _ = collection_w.get_replicas()
    group_nodes = list(replicas.groups[0].group_nodes)
    label = f"app.kubernetes.io/instance={release_name}, app.kubernetes.io/component=querynode"
    querynode_id_pod_pair = get_querynode_id_pod_pairs("chaos-testing", label)
    group_nodes_pod = [querynode_id_pod_pair[node_id] for node_id in group_nodes]

    # apply memory stress to all pods of that group
    chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_replicas_memory_stress_pods.yaml")
    chaos_config['spec']['selector']['pods']['chaos-testing'] = group_nodes_pod
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    try:
        log.debug(f"Apply memory stress on querynode {group_nodes}, pod {group_nodes_pod}")

        # turn a duration such as "1h30m" into the expression "1*3600+30*60++0" and evaluate it
        # NOTE(review): eval() runs on text taken from the chaos YAML; this is only acceptable
        # while that file is trusted repository content.
        duration = chaos_config.get('spec').get('duration')
        duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
        sleep(eval(duration))
    finally:
        # always remove the experiment, even if the wait above fails or is interrupted,
        # so the stress is not left applied to the pods
        chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

    # Verify that no segments left or joined the stressed group's nodes
    seg_info_after, _ = utility_w.get_query_segment_info(collection_w.name)
    seg_distribution_before = cf.get_segment_distribution(seg_info_before)
    seg_distribution_after = cf.get_segment_distribution(seg_info_after)
    for node_id in group_nodes:
        # compare the sealed segment lists: len() of the per-node entry itself counts
        # its keys rather than segments, which would make this check pass trivially
        # NOTE(review): assumes segments are not rebalanced between nodes of the same
        # group during the experiment - confirm against the expected scheduler behavior
        assert len(seg_distribution_before[node_id]["sealed"]) == len(seg_distribution_after[node_id]["sealed"])

    # the collection must still be searchable after the chaos experiment
    search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                        ct.default_float_vec_field_name, ct.default_search_params,
                                        ct.default_limit, timeout=120)
    assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
class CompactChecker(Checker):
    """Repeatedly compact the checked collection in a dependent thread."""

    def __init__(self, collection_name=None):
        super().__init__(collection_name=collection_name)
        self.ut = ApiUtilityWrapper()
        # the collection is loaded up front, before any compaction is requested
        self.c_wrap.load(enable_traceback=enable_traceback)

    def keep_running(self):
        """Compact forever, timing each round and updating the success/failure counters."""
        while True:
            # segment info is queried before each round; the result itself is not used
            self.ut.get_query_segment_info(self.c_wrap.name)

            # the timed span covers the compact request, the wait for completion
            # and the retrieval of the compaction plans
            t0 = time.time()
            res, result = self.c_wrap.compact(timeout=timeout)
            print(f"compact done: res {res}")
            self.c_wrap.wait_for_compaction_completed()
            self.c_wrap.get_compaction_plans()
            t1 = time.time()

            if not result:
                self._fail += 1
            else:
                self.rsp_times.append(t1 - t0)
                # running mean over all successful rounds
                weighted_total = (t1 - t0) + self.average_time * self._succ
                self.average_time = weighted_total / (self._succ + 1)
                self._succ += 1
                log.debug(
                    f"compact success, time: {t1 - t0:.4f}, average_time: {self.average_time:.4f}"
                )
            sleep(constants.WAIT_PER_OP / 10)
class LoadBalanceChecker(Checker):
    """Repeatedly run load balance on the checked collection in a dependent thread."""

    def __init__(self, collection_name=None):
        # fall back to a generated unique name when the caller gives none
        if collection_name is None:
            collection_name = cf.gen_unique_str("LoadBalanceChecker_")
        super().__init__(collection_name=collection_name)
        self.utility_wrap = ApiUtilityWrapper()
        self.c_wrap.load()
        # load balance parameters; filled in by prepare() before each task
        self.sealed_segment_ids = None
        self.dst_node_ids = None
        self.src_node_id = None

    @trace()
    def load_balance(self):
        """Issue one load balance request with the parameters set by prepare()."""
        return self.utility_wrap.load_balance(
            self.c_wrap.name,
            self.src_node_id,
            self.dst_node_ids,
            self.sealed_segment_ids,
        )

    def prepare(self):
        """Choose source node, destination nodes and the sealed segments to move."""
        replicas, _ = self.c_wrap.get_replicas()
        # first replica group with two or more nodes; empty list when there is none
        candidates = next(
            (list(group.group_nodes)
             for group in replicas.groups
             if len(group.group_nodes) >= 2),
            [],
        )
        self.src_node_id = candidates[0]
        self.dst_node_ids = candidates[1:]
        seg_info, _ = self.utility_wrap.get_query_segment_info(self.c_wrap.name)
        distribution = cf.get_segment_distribution(seg_info)
        self.sealed_segment_ids = distribution[self.src_node_id]["sealed"]

    @exception_handler()
    def run_task(self):
        """Refresh the parameters, then perform one load balance request."""
        self.prepare()
        return self.load_balance()

    def keep_running(self):
        """Run tasks for as long as self._keep_running stays truthy."""
        while self._keep_running:
            self.run_task()
            sleep(constants.WAIT_PER_OP / 10)
def test_customize_segment_size(self, seg_size, seg_count):
    """
    steps: install milvus with the given segment size, then create, insert,
           load, search, index, search again and query one collection,
           asserting that the loaded segment count equals seg_count
    """
    log.info(f"start to install milvus with segment size {seg_size}")
    release_name, host, port = _install_milvus(seg_size)
    self.release_name = release_name
    assert host is not None
    conn = connections.connect("default", host=host, port=port)
    assert conn is not None
    mil = MilvusSys(alias="default")
    log.info(f"milvus build version: {mil.build_version}")
    log.info(f"start to e2e verification: {seg_size}")

    # parameters shared by the search and index steps below
    search_params = {"metric_type": "L2", "params": {"nprobe": 16}}
    _index_params = {
        "index_type": "IVF_SQ8",
        "params": {"nlist": 64},
        "metric_type": "L2",
    }
    nb = 50000
    rounds = 40

    # create (the timing includes construction of the wrapper)
    coll_name = cf.gen_unique_str("segsiz")
    started = time.time()
    collection_w = ApiCollectionWrapper()
    collection_w.init_collection(name=coll_name,
                                 schema=cf.gen_default_collection_schema(),
                                 timeout=40)
    tt = time.time() - started
    assert collection_w.name == coll_name
    entities = collection_w.num_entities
    log.info(f"assert create collection: {tt}, init_entities: {entities}")

    # insert: the first batch is timed and its result asserted
    batch = cf.gen_default_list_data(nb=nb)
    started = time.time()
    _, res = collection_w.insert(batch)
    tt = time.time() - started
    log.info(f"assert insert: {tt}")
    assert res

    # insert the same batch again until nb * rounds (2 million) entities exist
    remaining = rounds - 1
    while remaining > 0:
        _, res = collection_w.insert(batch)
        remaining -= 1
    entities = collection_w.num_entities
    assert entities == nb * rounds

    # load and check the number of segments reported for the collection
    collection_w.load()
    utility_wrap = ApiUtilityWrapper()
    segs, _ = utility_wrap.get_query_segment_info(collection_w.name)
    log.info(f"assert segments: {len(segs)}")
    assert len(segs) == seg_count

    # search
    query_vectors = cf.gen_vectors(1, ct.default_dim)
    started = time.time()
    hits, _ = collection_w.search(data=query_vectors,
                                  anns_field=ct.default_float_vec_field_name,
                                  param=search_params,
                                  limit=1,
                                  timeout=30)
    tt = time.time() - started
    log.info(f"assert search: {tt}")
    assert len(hits) == 1
    collection_w.release()

    # index: insert one more default batch, then build the index
    extra_batch = cf.gen_default_list_data()
    collection_w.insert(extra_batch)
    log.info(f"assert index entities: {collection_w.num_entities}")
    started = time.time()
    _created_index, _ = collection_w.create_index(
        field_name=ct.default_float_vec_field_name,
        index_params=_index_params,
        name=cf.gen_unique_str(),
        timeout=120)
    tt = time.time() - started
    log.info(f"assert index: {tt}")
    assert len(collection_w.indexes) == 1

    # search again after reloading the collection
    started = time.time()
    collection_w.load()
    tt = time.time() - started
    log.info(f"assert load: {tt}")
    query_vectors = cf.gen_vectors(1, ct.default_dim)
    started = time.time()
    hits, _ = collection_w.search(data=query_vectors,
                                  anns_field=ct.default_float_vec_field_name,
                                  param=search_params,
                                  limit=1,
                                  timeout=30)
    tt = time.time() - started
    log.info(f"assert search: {tt}")

    # query
    term_expr = f'{ct.default_int64_field_name} in [1001,1201,4999,2999]'
    started = time.time()
    res, _ = collection_w.query(term_expr, timeout=30)
    tt = time.time() - started
    log.info(f"assert query result {len(res)}: {tt}")
def test_scale_in_query_node_less_than_replicas(self):
    """
    target: test scale in cluster and querynode < replica
    method: 1.Deploy cluster with 2 querynodes
            2.Create and insert data, flush
            3.Load collection with 2 replica number
            4.Scale in querynode from 2 to 1 and query
            5.Scale out querynode from 1 back to 2
    expected: Verify search successfully after scale out
    """
    release_name = "scale-in-query"
    image_tag = get_latest_tag()
    image = f'{constants.IMAGE_REPOSITORY}:{image_tag}'
    query_config = {
        'metadata.namespace': constants.NAMESPACE,
        'metadata.name': release_name,
        'spec.mode': 'cluster',
        'spec.components.image': image,
        'spec.components.proxy.serviceType': 'LoadBalancer',
        'spec.components.queryNode.replicas': 2,
        'spec.config.common.retentionDuration': 60
    }
    mic = MilvusOperator()
    mic.install(query_config)
    if mic.wait_for_healthy(release_name, constants.NAMESPACE, timeout=1800):
        # the endpoint is "host:port"; only the host part is used
        host = mic.endpoint(release_name, constants.NAMESPACE).split(':')[0]
    else:
        raise MilvusException(message='Milvus healthy timeout 1800s')

    # No except clause: a failure propagates with its original type and traceback,
    # while the finally block still exports the pod logs and uninstalls the release.
    try:
        # prepare collection
        connections.connect("scale-in", host=host, port=19530)
        utility_w = ApiUtilityWrapper()
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(
            name=cf.gen_unique_str("scale_in"),
            schema=cf.gen_default_collection_schema(),
            using="scale-in")
        collection_w.insert(cf.gen_default_dataframe_data())
        assert collection_w.num_entities == ct.default_nb

        # load multi replicas and search success
        collection_w.load(replica_number=2)
        search_res, _ = collection_w.search(
            cf.gen_vectors(1, ct.default_dim),
            ct.default_float_vec_field_name, ct.default_search_params,
            ct.default_limit)
        assert len(search_res[0].ids) == ct.default_limit
        log.info("Search successfully after load with 2 replicas")
        log.debug(collection_w.get_replicas()[0])
        log.debug(
            utility_w.get_query_segment_info(collection_w.name,
                                             using="scale-in"))

        # scale in querynode from 2 to 1, less than replica number
        log.debug("Scale in querynode from 2 to 1")
        mic.upgrade(release_name,
                    {'spec.components.queryNode.replicas': 1},
                    constants.NAMESPACE)
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")

        # search and not assure success
        collection_w.search(cf.gen_vectors(1, ct.default_dim),
                            ct.default_float_vec_field_name,
                            ct.default_search_params,
                            ct.default_limit,
                            check_task=CheckTasks.check_nothing)
        log.debug(
            collection_w.get_replicas(
                check_task=CheckTasks.check_nothing)[0])

        # scale querynode from 1 back to 2
        mic.upgrade(release_name,
                    {'spec.components.queryNode.replicas': 2},
                    constants.NAMESPACE)
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")

        # verify search success
        collection_w.search(cf.gen_vectors(1, ct.default_dim),
                            ct.default_float_vec_field_name,
                            ct.default_search_params, ct.default_limit)

        # Verify replica info is correct: two groups with one node each
        replicas = collection_w.get_replicas()[0]
        assert len(replicas.groups) == 2
        for group in replicas.groups:
            assert len(group.group_nodes) == 1

        # Verify loaded segment info is correct: every segment is on two nodes
        # and the row counts add up to the inserted entity count
        seg_info = utility_w.get_query_segment_info(collection_w.name,
                                                    using="scale-in")[0]
        num_entities = 0
        for seg in seg_info:
            assert len(seg.nodeIds) == 2
            num_entities += seg.num_rows
        assert num_entities == ct.default_nb

    finally:
        label = f"app.kubernetes.io/instance={release_name}"
        log.info('Start to export milvus pod logs')
        read_pod_log(namespace=constants.NAMESPACE,
                     label_selector=label,
                     release_name=release_name)
        mic.uninstall(release_name, namespace=constants.NAMESPACE)