Example #1
    def test_auto_load_balance(self):
        """

        """
        log.info(f"start to install milvus")
        release_name, host, port = install_milvus(
            "test-auto-load-balance")  # todo add release name
        self.release_name = release_name
        assert host is not None
        conn = connections.connect("default", host=host, port=port)
        assert conn is not None
        self.health_checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertFlushChecker(),
            Op.flush: InsertFlushChecker(flush=True),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker()
        }
        cc.start_monitor_threads(self.health_checkers)
        # wait
        sleep(constants.WAIT_PER_OP * 10)
        all_collections = list_collections()
        for c in all_collections:
            seg_info = utility.get_query_segment_info(c)
            seg_distribution = cf.get_segment_distribution(seg_info)
            for k in seg_distribution.keys():
                log.info(
                    f"collection {c}'s segment distribution in node {k} is {seg_distribution[k]['sealed']}"
                )
        # first assert
        log.info("first assert")
        assert_statistic(self.health_checkers)

        # scale up
        log.info("scale up milvus")
        scale_up_milvus(self.release_name)
        # reset counting
        cc.reset_counting(self.health_checkers)
        sleep(constants.WAIT_PER_OP * 10)
        all_collections = list_collections()
        for c in all_collections:
            seg_info = utility.get_query_segment_info(c)
            seg_distribution = cf.get_segment_distribution(seg_info)
            for k in seg_distribution.keys():
                log.info(
                    f"collection {c}'s sealed segment distribution in node {k} is {seg_distribution[k]['sealed']}"
                )
        # second assert
        log.info("second assert")
        assert_statistic(self.health_checkers)

        # TODO assert segment distribution

        # assert all expectations
        assert_expectations()
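A minimal sketch of the cf.get_segment_distribution helper used above, which is not part of this snippet; it is assumed to group the utility.get_query_segment_info results by query node and collect the sealed segment ids per node (the nodeID and segmentID attribute names are assumptions, not confirmed here):

from collections import defaultdict

def get_segment_distribution(seg_info):
    # group segments by the query node that serves them; attribute names are assumed
    distribution = defaultdict(lambda: {"sealed": []})
    for seg in seg_info:
        distribution[seg.nodeID]["sealed"].append(seg.segmentID)
    return dict(distribution)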
Example #2

    def test_chaos_memory_stress_etcd(self, chaos_yaml):
        """
        target: test inject memory stress into all etcd pods
        method: 1.Deploy milvus and limit etcd memory resource to 1Gi with all mode
                2.Continuously and concurrently do milvus operations
                3.Inject memory stress chaos 51024Mi
                4.After duration, delete chaos stress
        expected: Verify milvus operations success rate
        """
        mic_checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertFlushChecker(),
            Op.flush: InsertFlushChecker(flush=True),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker()
        }
        # start threads that keep running milvus ops
        start_monitor_threads(mic_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        # duration = chaos_config["spec"]["duration"]
        meta_name = chaos_config.get('metadata').get('name')
        duration = chaos_config.get('spec').get('duration')

        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("Chaos injected")

        # convert the duration string into a number of seconds (the expression is evaluated below)
        if isinstance(duration, str):
            duration = duration.replace('h', '*3600+').replace(
                'm', '*60+').replace('s', '*1+') + '+0'
        else:
            log.error("Duration must be string type")

        # Delete experiment after it's over
        timer = threading.Timer(interval=eval(duration),
                                function=chaos_res.delete,
                                args=(meta_name, False))
        timer.start()
        timer.join()

        # output milvus op succ rate
        for k, ch in mic_checkers.items():
            log.debug(f'Succ rate of {k.value}: {ch.succ_rate()}')
            assert ch.succ_rate() == 1.0
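The duration string from the chaos spec (e.g. "1h30m") is converted to seconds above by string substitution plus eval. As an alternative, a regex-based parse avoids eval; this is only a sketch, and the supported unit set is an assumption:

import re

def duration_to_seconds(duration):
    # sum the h/m/s components, e.g. "1h30m" -> 5400; unknown units are ignored
    units = {"h": 3600, "m": 60, "s": 1}
    return sum(int(value) * units[unit]
               for value, unit in re.findall(r"(\d+)([hms])", duration))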
Example #3
    def test_operations(self, collection_name):
        # start the monitor threads to check the milvus ops
        log.info("*********************Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        c_name = collection_name
        self.init_health_checkers(collection_name=c_name)
        cc.start_monitor_threads(self.health_checkers)
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)
        # assert all expectations
        assert_statistic(self.health_checkers)
        assert_expectations()

        log.info(
            "*********************Chaos Test Completed**********************")
Example #4
    def test_chaos(self, chaos_yaml):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        cc.start_monitor_threads(self.health_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        meta_name = chaos_config.get('metadata', None).get('name', None)
        release_name = meta_name
        chaos_config_str = json.dumps(chaos_config)
        chaos_config_str = chaos_config_str.replace("milvus-chaos", release_name)
        chaos_config = json.loads(chaos_config_str)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")
        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml, chaos_config) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False

        # init report
        dir_name = "./reports"
        file_name = f"./reports/{meta_name}.log"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)

        # assert statistics: all ops 100% success
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            ts = time.strftime("%Y-%m-%d %H:%M:%S")
            f.write(f"{meta_name}-{ts}\n")
            f.write("1st assert before chaos:\n")
            f.write(record_results(self.health_checkers))
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        log.info(f"chaos information: {chaos_res.get(meta_name)}")
        sleep(constants.WAIT_PER_OP * 2)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # wait 40s
        sleep(constants.CHAOS_DURATION)

        log.info(f'Alive threads: {threading.enumerate()}')

        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={Op.create: self.expect_create,
                                       Op.insert: self.expect_insert,
                                       Op.flush: self.expect_flush,
                                       Op.index: self.expect_index,
                                       Op.search: self.expect_search,
                                       Op.query: self.expect_query
                                       })
        with open(file_name, "a+") as f:
            f.write("2nd assert after chaos injected:\n")
            f.write(record_results(self.health_checkers))
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        log.info(f'Alive threads: {threading.enumerate()}')
        sleep(2)
        # wait all pods ready
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}")
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
        log.info("all pods are ready")
        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')
        # reset counting again
        cc.reset_counting(self.health_checkers)
        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)
        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("3rd assert after chaos deleted:\n")
            f.write(record_results(self.health_checkers))
        # assert all expectations
        assert_expectations()

        log.info("*********************Chaos Test Completed**********************")
Example #5
    def test_multi_replicas_with_only_one_group_available(
            self, chaos_type, failed_node_type, failed_group_scope,
            is_streaming):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info("Test config")
        log.info(cc.gen_experiment_config(config_file_name))
        # log.info(f"chaos_yaml: {chaos_yaml}")
        log.info(connections.get_connection_addr('default'))
        if is_streaming is False:
            del self.health_checkers[Op.insert]
        cc.start_monitor_threads(self.health_checkers)
        # get replicas info
        release_name = self.instance_name
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        group_list = []
        shard_leader_list = []
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        for g in replicas_info.groups:
            group_list.append(list(g.group_nodes))
            for shard in g.shards:
                shard_leader_list.append(shard.shard_leader)
        # keep only one group in healthy status; the other groups are made unhealthy by injecting pod failure chaos.
        # In each affected group, one pod is put into pod failure status.
        target_pod_list = []
        target_group = []
        group_list = sorted(group_list, key=lambda x: -len(x))
        if failed_group_scope == "one":
            target_group = random.sample(group_list, 1)
        if failed_group_scope == "except_one":
            target_group = random.sample(group_list, len(group_list) - 1)
        if failed_group_scope == "all":
            target_group = group_list[:]
        for g in target_group:
            target_nodes = []
            if failed_node_type == "shard_leader":
                target_nodes = list(set(g) & set(shard_leader_list))
            if failed_node_type == "non_shard_leader":
                target_nodes = list(set(g) - set(shard_leader_list))
            if len(target_nodes) == 0:
                log.info("there is no node satisfied, chose one randomly")
                target_nodes = [random.choice(g)]
            for target_node in target_nodes:
                pod = querynode_id_pod_pair[target_node]
                target_pod_list.append(pod)
        log.info(f"target_pod_list: {target_pod_list}")
        chaos_config = cc.gen_experiment_config(
            f"{str(Path(__file__).absolute().parent)}/chaos_objects/template/{chaos_type}-by-pod-list.yaml"
        )
        chaos_config['metadata'][
            'name'] = f"test-multi-replicase-{int(time.time())}"
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_config['spec']['selector']['pods'][
            'chaos-testing'] = target_pod_list
        self._chaos_config = chaos_config  # cache the chaos config for tear down

        log.info(f"chaos_config: {chaos_config}")
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)
        # replicas info
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        log.info(
            f"replicas_info for search collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}"
        )

        # assert statistics: all ops 100% success
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)

        chaos_res.create(chaos_config)
        log.info("chaos injected")
        sleep(constants.WAIT_PER_OP * 2)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # wait 120s
        sleep(constants.CHAOS_DURATION)
        log.info(f'Alive threads: {threading.enumerate()}')
        # node info
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        # replicas info
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        log.info(
            f"replicas_info for search collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}"
        )

        replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
        log.info(
            f"replicas_info for query collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}"
        )
        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        expectations = {Op.search: constants.SUCC, Op.query: constants.SUCC}
        if failed_group_scope == "all":
            expectations = {
                Op.search: constants.FAIL,
                Op.query: constants.FAIL
            }
        assert_statistic(self.health_checkers, expectations=expectations)
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        sleep(2)
        # wait all pods ready
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={release_name}"
        )
        ready_1 = wait_pods_ready(
            constants.CHAOS_NAMESPACE,
            f"app.kubernetes.io/instance={release_name}")
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={release_name}"
        )
        ready_2 = wait_pods_ready(constants.CHAOS_NAMESPACE,
                                  f"release={release_name}")
        if ready_1 and ready_2:
            log.info("all pods are ready")
        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        # cc.reconnect(connections, alias='default')
        # reset counting again
        cc.reset_counting(self.health_checkers)
        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)
        # node info
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        sleep(30)
        # replicas info
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        log.info(
            f"replicas_info for collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}"
        )
        replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
        log.info(
            f"replicas_info for collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}"
        )
        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        # assert all expectations
        assert_expectations()

        log.info(
            "*********************Chaos Test Completed**********************")
Example #6
    def test_chaos(self, chaos_yaml):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        self.checker_threads = cc.start_monitor_threads(self.health_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")

        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False
        # init report
        meta_name = chaos_config.get('metadata', None).get('name', None)
        dir_name = "./reports"
        file_name = f"./reports/{meta_name}.log"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)

        # assert statistics: all ops 100% success
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("1st assert before chaos: ")
            f.write(f"{self.health_checkers}\n")
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        log.info(f"chaos information: {chaos_res.get(meta_name)}")
        sleep(constants.WAIT_PER_OP * 2.1)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # wait 40s
        sleep(constants.CHAOS_DURATION)

        for k, t in self.checker_threads.items():
            log.info(f"10s later: Thread {k} is_alive(): {t.is_alive()}")

        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={
                             Op.create: self.expect_create,
                             Op.insert: self.expect_insert,
                             Op.flush: self.expect_flush,
                             Op.index: self.expect_index,
                             Op.search: self.expect_search,
                             Op.query: self.expect_query
                         })
        with open(file_name, "a+") as f:
            f.write("2nd assert after chaos injected:")
            f.write(f"{self.health_checkers}\n")
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        for k, t in self.checker_threads.items():
            log.info(f"Thread {k} is_alive(): {t.is_alive()}")
        sleep(2)

        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')

        # reset counting again
        cc.reset_counting(self.health_checkers)

        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)

        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("3rd assert after chaos deleted:")
            f.write(f"{self.health_checkers}\n")
        # assert all expectations
        assert_expectations()

        log.info(
            "*********************Chaos Test Completed**********************")
Example #7
    def test_bulk_load(self, chaos_type, target_component):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        release_name = self.instance_name
        cc.start_monitor_threads(self.health_checkers)
        chaos_config = cc.gen_experiment_config(
            f"{str(Path(__file__).absolute().parent)}/chaos_objects/{chaos_type}/chaos_{target_component}_{chaos_type}.yaml"
        )
        chaos_config['metadata']['name'] = f"test-bulk-load-{int(time.time())}"
        kind = chaos_config['kind']
        meta_name = chaos_config.get('metadata', None).get('name', None)
        update_key_value(chaos_config, "release", release_name)
        update_key_value(chaos_config, "app.kubernetes.io/instance",
                         release_name)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")
        # wait 100s
        sleep(constants.WAIT_PER_OP * 10)
        # assert statistics: all ops 100% success
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        sleep(constants.WAIT_PER_OP * 10)
        # reset counting
        cc.reset_counting(self.health_checkers)
        # wait 120s
        sleep(constants.CHAOS_DURATION)
        log.info(f'Alive threads: {threading.enumerate()}')
        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={
                             Op.bulk_load: constants.FAIL,
                         })
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        sleep(2)
        # wait all pods ready
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={release_name}"
        )
        wait_pods_ready(constants.CHAOS_NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={release_name}"
        )
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={release_name}")
        log.info("all pods are ready")
        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')
        # recheck failed tasks in third assert
        self.health_checkers[Op.bulk_load].recheck_failed_task = True
        # reset counting again
        cc.reset_counting(self.health_checkers)
        # wait 100s (varies by feature)
        sleep(constants.WAIT_PER_OP * 10)
        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        # assert all expectations
        assert_expectations()

        log.info(
            "*********************Chaos Test Completed**********************")