예제 #1
0
    def test_memory_stress_replicas_group_load_balance(self, prepare_collection):
        """
        target: test apply memory stress on replicas and load balance inside group
        method: 1.Deploy milvus and limit querynode memory 6Gi
                2.Insret 1000,000 entities (500Mb), load 2 replicas (memory usage 1.5Gb)
                3.Apply memory stress 4Gi on querynode
        expected: Verify that load balancing occurs
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()
        release_name = "mic-memory"

        # load and searchc
        collection_w.load(replica_number=2)
        progress, _ = utility_w.loading_progress(collection_w.name)
        assert progress["loading_progress"] == "100%"

        # get the replica and random chaos querynode
        replicas, _ = collection_w.get_replicas()
        chaos_querynode_id = replicas.groups[0].group_nodes[0]
        label = f"app.kubernetes.io/instance={release_name}, app.kubernetes.io/component=querynode"
        querynode_id_pod_pair = get_querynode_id_pod_pairs("chaos-testing", label)
        chaos_querynode_pod = querynode_id_pod_pair[chaos_querynode_id]

        # get the segment num before chaos
        seg_info_before, _ = utility_w.get_query_segment_info(collection_w.name)
        seg_distribution_before = cf.get_segment_distribution(seg_info_before)
        segments_num_before = len(seg_distribution_before[chaos_querynode_id]["sealed"])
        log.debug(segments_num_before)
        log.debug(seg_distribution_before[chaos_querynode_id]["sealed"])

        # apply memory stress
        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_replicas_memory_stress_pods.yaml")
        chaos_config['spec']['selector']['pods']['chaos-testing'] = [chaos_querynode_pod]
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug(f"Apply memory stress on querynode {chaos_querynode_id}, pod {chaos_querynode_pod}")

        duration = chaos_config.get('spec').get('duration')
        duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
        sleep(eval(duration))

        chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

        # Verfiy auto load loadbalance
        seg_info_after, _ = utility_w.get_query_segment_info(collection_w.name)
        seg_distribution_after = cf.get_segment_distribution(seg_info_after)
        segments_num_after = len(seg_distribution_after[chaos_querynode_id]["sealed"])
        log.debug(segments_num_after)
        log.debug(seg_distribution_after[chaos_querynode_id]["sealed"])

        assert segments_num_after < segments_num_before
        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name, ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
예제 #2
0
 def test_memory_stress_replicas_befor_load(self, prepare_collection):
     """
     target: test querynode group load with insufficient memory
     method: 1.Limit querynode memory ? 2Gi
             2.Load sealed data (needed memory > memory limit)
     expected: Raise an exception
     """
     collection_w = prepare_collection
     utility_w = ApiUtilityWrapper()
     err = {"err_code": 1, "err_msg": "xxxxxxxxx"}
     # collection_w.load(replica_number=2, timeout=60, check_task=CheckTasks.err_res, check_items=err)
     collection_w.load(replica_number=5)
     utility_w.loading_progress(collection_w.name)
     search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                         ct.default_float_vec_field_name, ct.default_search_params,
                                         ct.default_limit, timeout=60)
예제 #3
0
    def test_memory_stress_replicas_load_balance_single_node(self, prepare_collection):
        """
        target: test apply memory stress on single node replica, and it OOMKilled
        method: 1.Deploy 2 querynodes and limit memory 6Gi
                2.Loading 1000,000 entities (data_size=500Mb) with 2 replicas (memory_usage=1.5Gb)
                3.Apply memory stress on one querynode and make it OOMKilled
        expected: After deleting chaos, querynode turns running, search successfully
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()

        # load and searchc
        collection_w.load(replica_number=2)
        progress, _ = utility_w.loading_progress(collection_w.name)
        assert progress["loading_progress"] == "100%"
        query_res, _ = collection_w.query("int64 in [0]")
        assert len(query_res) != 0

        # apply memory stress
        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")

        # Update config
        chaos_config['spec']['mode'] = "one"
        chaos_config['spec']['stressors']['memory']['size'] = '6Gi'
        chaos_config['spec']['duration'] = "1m"
        log.debug(chaos_config)
        duration = chaos_config.get('spec').get('duration')
        duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)

        sleep(eval(duration))
        chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

        # release and load again
        collection_w.release()
        collection_w.load(replica_number=2)
        progress, _ = utility_w.loading_progress(collection_w.name)
        assert progress["loading_progress"] == "100%"
        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name, ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
예제 #4
0
    def test_memory_stress_replicas_group_sufficient(self, prepare_collection, mode):
        """
        target: test apply stress memory on one querynode and the memory is enough to load replicas
        method: 1.Limit all querynodes memory 6Gi
                2.Apply 3Gi memory stress on different number of querynodes (load whole collection need about 1.5GB)
        expected: Verify load successfully and search result are correct
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()

        # # apply memory stress chaos
        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")
        chaos_config['spec']['mode'] = mode
        chaos_config['spec']['duration'] = '3m'
        chaos_config['spec']['stressors']['memory']['size'] = '3Gi'
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")
        sleep(20)
        #
        try:
            collection_w.load(replica_number=2, timeout=60)
            utility_w.loading_progress(collection_w.name)
            replicas, _ = collection_w.get_replicas()
            log.debug(replicas)
            search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                                ct.default_float_vec_field_name, ct.default_search_params,
                                                ct.default_limit, timeout=120)
            assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
            collection_w.release()

        except Exception as e:
            raise Exception(str(e))

        finally:
            # delete chaos
            meta_name = chaos_config.get('metadata', None).get('name', None)
            chaos_res.delete(metadata_name=meta_name)
            log.debug("Test finished")
예제 #5
0
    def test_memory_stress_replicas_cross_group_load_balance(self, prepare_collection):
        """
        target: test apply memory stress on one group and no load balance cross replica groups
        method: 1.Limit all querynodes memory 6Gi
                2.Create and insert 1000,000 entities
                3.Load collection with two replicas
                4.Apply memory stress on one grooup 80%
        expected: Verify that load balancing across groups is not occurring
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()
        release_name = "mic-memory"

        # load and searchc
        collection_w.load(replica_number=2)
        progress, _ = utility_w.loading_progress(collection_w.name)
        assert progress["loading_progress"] == "100%"
        seg_info_before, _ = utility_w.get_query_segment_info(collection_w.name)

        # get the replica and random chaos querynode
        replicas, _ = collection_w.get_replicas()
        group_nodes = list(replicas.groups[0].group_nodes)
        label = f"app.kubernetes.io/instance={release_name}, app.kubernetes.io/component=querynode"
        querynode_id_pod_pair = get_querynode_id_pod_pairs("chaos-testing", label)
        group_nodes_pod = [querynode_id_pod_pair[node_id] for node_id in group_nodes]

        # apply memory stress
        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_replicas_memory_stress_pods.yaml")
        chaos_config['spec']['selector']['pods']['chaos-testing'] = group_nodes_pod
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug(f"Apply memory stress on querynode {group_nodes}, pod {group_nodes_pod}")

        duration = chaos_config.get('spec').get('duration')
        duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
        sleep(eval(duration))

        chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

        # Verfiy auto load loadbalance
        seg_info_after, _ = utility_w.get_query_segment_info(collection_w.name)
        seg_distribution_before = cf.get_segment_distribution(seg_info_before)
        seg_distribution_after = cf.get_segment_distribution(seg_info_after)
        for node_id in group_nodes:
            assert len(seg_distribution_before[node_id]) == len(seg_distribution_after[node_id])

        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name, ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
예제 #6
0
    def test_memory_stress_replicas_group_insufficient(self, prepare_collection, mode):
        """
        target: test apply stress memory on different number querynodes and the group failed to load,
                bacause of the memory is insufficient
        method: 1.Limit querynodes memory 5Gi
                2.Create collection and insert 1000,000 entities
                3.Apply memory stress on querynodes and it's memory is not enough to load replicas
        expected: Verify load raise exception, and after delete chaos, load and search successfully
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()
        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")

        # Update config
        chaos_config['spec']['mode'] = mode
        chaos_config['spec']['stressors']['memory']['size'] = '5Gi'
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        # chaos_start = time.time()
        log.debug("chaos injected")
        sleep(10)

        try:
            # load failed
            err = {"err_code": 1, "err_msg": "shuffleSegmentsToQueryNodeV2: insufficient memory of available node"}
            collection_w.load(replica_number=5, timeout=60, check_task=CheckTasks.err_res, check_items=err)

            # query failed because not loaded
            err = {"err_code": 1, "err_msg": "not loaded into memory"}
            collection_w.query("int64 in [0]", check_task=CheckTasks.err_res, check_items=err)

            # delete chaos
            meta_name = chaos_config.get('metadata', None).get('name', None)
            chaos_res.delete(metadata_name=meta_name)
            sleep(10)

            # after delete chaos load and query successfully
            collection_w.load(replica_number=5, timeout=60)
            progress, _ = utility_w.loading_progress(collection_w.name)
            # assert progress["loading_progress"] == "100%"
            query_res, _ = collection_w.query("int64 in [0]")
            assert len(query_res) != 0

            collection_w.release()

        except Exception as e:
            raise Exception(str(e))

        finally:
            log.debug("Test finished")
예제 #7
0
    def test_chaos_memory_stress_replicas_OOM(self, prepare_collection, mode):
        """
        target: test apply memory stress during loading, and querynode OOMKilled
        method: 1.Deploy and limit querynode memory limit 6Gi
                2.Create collection and insert 1000,000 entities
                3.Apply memory stress and querynode OOMKilled during loading replicas
        expected: Verify the mic is available to load and search querynode restart
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()

        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")
        chaos_config['spec']['mode'] = mode
        chaos_config['spec']['duration'] = '3m'
        chaos_config['spec']['stressors']['memory']['size'] = '6Gi'
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)

        chaos_res.create(chaos_config)
        log.debug("chaos injected")
        collection_w.load(replica_number=2, timeout=60, _async=True)

        utility_w.wait_for_loading_complete(collection_w.name)
        progress, _ = utility_w.loading_progress(collection_w.name)
        assert progress["loading_progress"] == '100%'

        sleep(180)
        chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

        # TODO search failed
        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name, ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])

        collection_w.release()
        collection_w.load(replica_number=2)
        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name, ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])