コード例 #1
0
class TestChaos(TestChaosBase):

    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        connections.add_connection(default={"host": host, "port": port})
        conn = connections.connect(alias='default')
        if conn is None:
            raise Exception("no connections")
        self.host = host
        self.port = port
        return conn

    @pytest.fixture(scope="function", autouse=True)
    def init_health_checkers(self):
        checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertFlushChecker(),
            Op.flush: InsertFlushChecker(flush=True),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker()
        }
        self.health_checkers = checkers

    def teardown(self):
        chaos_res = CusResource(kind=self._chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        meta_name = self._chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(meta_name, raise_ex=False)
        sleep(2)
        log.info(f'Alive threads: {threading.enumerate()}')

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', cc.get_chaos_yamls())
    def test_chaos(self, chaos_yaml):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        cc.start_monitor_threads(self.health_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        meta_name = chaos_config.get('metadata', None).get('name', None)
        release_name = meta_name
        chaos_config_str = json.dumps(chaos_config)
        chaos_config_str = chaos_config_str.replace("milvus-chaos", release_name)
        chaos_config = json.loads(chaos_config_str)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")
        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml, chaos_config) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False

        # init report
        dir_name = "./reports"
        file_name = f"./reports/{meta_name}.log"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)

        # assert statistic:all ops 100% succ
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            ts = time.strftime("%Y-%m-%d %H:%M:%S")
            f.write(f"{meta_name}-{ts}\n")
            f.write("1st assert before chaos:\n")
            f.write(record_results(self.health_checkers))
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        log.info(f"chaos information: {chaos_res.get(meta_name)}")
        sleep(constants.WAIT_PER_OP * 2)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # wait 40s
        sleep(constants.CHAOS_DURATION)

        log.info(f'Alive threads: {threading.enumerate()}')

        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={Op.create: self.expect_create,
                                       Op.insert: self.expect_insert,
                                       Op.flush: self.expect_flush,
                                       Op.index: self.expect_index,
                                       Op.search: self.expect_search,
                                       Op.query: self.expect_query
                                       })
        with open(file_name, "a+") as f:
            f.write("2nd assert after chaos injected:\n")
            f.write(record_results(self.health_checkers))
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        log.info(f'Alive threads: {threading.enumerate()}')
        sleep(2)
        # wait all pods ready
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}")
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
        log.info("all pods are ready")
        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')
        # reset counting again
        cc.reset_counting(self.health_checkers)
        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)
        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("3rd assert after chaos deleted:\n")
            f.write(record_results(self.health_checkers))
        # assert all expectations
        assert_expectations()

        log.info("*********************Chaos Test Completed**********************")
コード例 #2
0
class TestChaosData:

    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        connections.add_connection(default={"host": host, "port": port})
        conn = connections.connect(alias='default')
        if conn is None:
            raise Exception("no connections")
        return conn

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_querynode(self, connection, chaos_yaml):
        """
        target: explore query node behavior after memory stress chaos injected and recovered
        method: 1. create a collection, insert some data
                2. inject memory stress chaos
                3. load collection and search, query
                4. todo (verify query node response)
                5. delete chaos or chaos finished
                6. release and reload collection, verify search and query is available
        expected: after chaos deleted, load, search and query are all available
        """
        c_name = 'chaos_memory_nx6DNW4q'
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(c_name)
        log.debug(collection_w.schema)
        log.debug(collection_w._shards_num)

        # apply memory stress
        # apply_memory_stress(chaos_yaml)

        # wait memory stress
        # sleep(constants.WAIT_PER_OP * 2)

        # query
        collection_w.release()
        collection_w.load()
        term_expr = f'{ct.default_int64_field_name} in [0, 1, 999, 99]'
        for i in range(4):
            t0_query = datetime.datetime.now()
            query_res, _ = collection_w.query(term_expr)
            tt_query = datetime.datetime.now() - t0_query
            log.info(f"{i} query cost: {tt_query}")
            assert len(query_res) == 4

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_datanode(self, chaos_yaml):
        """
        target: test inject memory stress into dataNode
        method: 1.Deploy milvus and limit datanode memory resource
                2.Create collection and insert some data
                3.Inject memory stress chaos
                4.Continue to insert data
        expected:
        """
        # init collection and insert 250 nb
        nb = 25000
        dim = 512
        c_name = cf.gen_unique_str('chaos_memory')
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(name=c_name,
                                     schema=cf.gen_default_collection_schema(dim=dim))
        for i in range(10):
            t0 = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb
            log.info(f'After {i + 1} insert, num_entities: {collection_w.num_entities}')
            tt = datetime.datetime.now() - t0
            log.info(f"{i} insert and flush data cost: {tt}")

        # inject memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")

        # Continue to insert data
        collection_w.insert(df)
        log.info(f'Total num entities: {collection_w.num_entities}')

        # delete chaos
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(metadata_name=meta_name)

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_indexnode(self, connection, chaos_yaml):
        """
        target: test inject memory stress into indexnode
        method: 1.Deploy milvus and limit indexnode memory resource 1Gi
                2.Create collection and insert some data
                3.Create index
                4.Inject memory stress chaos 512Mi
        expected:
        """
        # init collection and insert 250 nb
        nb = 50000  # vector size: 512*4*nb about 100Mi and create index need 600Mi memory
        dim = 512
        c_name = cf.gen_unique_str('chaos_memory')
        index_params = {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 128}}

        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(name=c_name,
                                     schema=cf.gen_default_collection_schema(dim=dim), shards_num=1)

        # insert 256000 512 dim entities, size 512Mi
        for i in range(2):
            t0_insert = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb // 2, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb // 2
            # log.info(f'After {i + 1} insert, num_entities: {collection_w.num_entities}')
            tt_insert = datetime.datetime.now() - t0_insert
            log.info(f"{i} insert data cost: {tt_insert}")

        # flush
        t0_flush = datetime.datetime.now()
        assert collection_w.num_entities == nb
        tt_flush = datetime.datetime.now() - t0_flush
        log.info(f'flush {nb * 10} entities cost: {tt_flush}')

        # create index
        t0_index = datetime.datetime.now()
        index, _ = collection_w.create_index(field_name=ct.default_float_vec_field_name,
                                             index_params=index_params)
        tt_index = datetime.datetime.now() - t0_index

        log.info(f"create index cost: {tt_index}")
        log.info(collection_w.indexes)

        # indexNode start build index, inject chaos memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("inject chaos")

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', cc.get_chaos_yamls())
    def test_chaos_memory_stress_etcd(self, chaos_yaml):
        mic_checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertFlushChecker(),
            Op.flush: InsertFlushChecker(flush=True),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker()
        }
        # start thread keep running milvus op
        start_monitor_threads(mic_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        # duration = chaos_config["spec"]["duration"]
        meta_name = chaos_config.get('metadata').get('name')
        duration = chaos_config.get('spec').get('duration')

        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("Chaos injected")

        # convert string duration time to a int number in seconds
        if isinstance(duration, str):
            duration = duration.replace('h', '*3600+')
            duration = duration.replace('m', '*60+')
            duration = duration.replace('s', '*1')
        else:
            log.error("Duration must be string type")

        # Delete experiment after it's over
        timer = threading.Timer(interval=eval(duration), function=chaos_res.delete, args=(meta_name, False))
        timer.start()
        timer.join()

        # output milvus op succ rate
        for k, ch in mic_checkers.items():
            log.debug(f'Succ rate of {k.value}: {ch.succ_rate()}')
コード例 #3
0
class TestChaosData:
    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        connections.add_connection(default={"host": host, "port": port})
        connections.connect(alias='default')
        if connections.has_connection("default") is False:
            raise Exception("no connections")

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_data_consist(self, connection, chaos_yaml):
        """
        target: verify data consistence after chaos injected and recovered
        method: 1. create a collection, insert some data, search and query
                2. inject a chaos object
                3. reconnect to service
                4. verify a) data entities persists, index persists,
                          b) search and query results persist
        expected: collection data and results persist
        """
        c_name = cf.gen_unique_str('chaos_collection_')
        nb = 5000
        i_name = cf.gen_unique_str('chaos_index_')
        index_params = {
            "index_type": "IVF_SQ8",
            "metric_type": "L2",
            "params": {
                "nlist": 64
            }
        }

        # create
        t0 = datetime.datetime.now()
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(name=c_name,
                                     schema=cf.gen_default_collection_schema())
        tt = datetime.datetime.now() - t0
        log.info(f"assert create: {tt}")
        assert collection_w.name == c_name

        # insert
        data = cf.gen_default_list_data(nb=nb)
        t0 = datetime.datetime.now()
        _, res = collection_w.insert(data)
        tt = datetime.datetime.now() - t0
        log.info(f"assert insert: {tt}")
        assert res

        # flush
        t0 = datetime.datetime.now()
        assert collection_w.num_entities == nb
        tt = datetime.datetime.now() - t0
        log.info(f"assert flush: {tt}")

        # search
        collection_w.load()
        search_vectors = cf.gen_vectors(1, ct.default_dim)
        t0 = datetime.datetime.now()
        search_params = {"metric_type": "L2", "params": {"nprobe": 16}}
        search_res, _ = collection_w.search(
            data=search_vectors,
            anns_field=ct.default_float_vec_field_name,
            param=search_params,
            limit=1)
        tt = datetime.datetime.now() - t0
        log.info(f"assert search: {tt}")
        assert len(search_res) == 1

        # index
        t0 = datetime.datetime.now()
        index, _ = collection_w.create_index(
            field_name=ct.default_float_vec_field_name,
            index_params=index_params,
            name=i_name)
        tt = datetime.datetime.now() - t0
        log.info(f"assert index: {tt}")
        assert len(collection_w.indexes) == 1

        # query
        term_expr = f'{ct.default_int64_field_name} in [1001,1201,999,99]'
        t0 = datetime.datetime.now()
        query_res, _ = collection_w.query(term_expr)
        tt = datetime.datetime.now() - t0
        log.info(f"assert query: {tt}")
        assert len(query_res) == 4

        # reboot a pod
        reboot_pod(chaos_yaml)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        meta_name = chaos_config.get('metadata', None).get('name', None)

        # wait all pods ready
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}"
        )
        wait_pods_ready(constants.CHAOS_NAMESPACE,
                        f"app.kubernetes.io/instance={meta_name}")
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}"
        )
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
        log.info("all pods are ready")

        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 3)
        reconnect(connections, alias='default')

        # verify collection persists
        assert utility.has_collection(c_name)
        log.info("assert collection persists")
        collection_w2 = ApiCollectionWrapper()
        collection_w2.init_collection(c_name)
        # verify data persist
        assert collection_w2.num_entities == nb
        log.info("assert data persists")
        # verify index persists
        assert collection_w2.has_index(i_name)
        log.info("assert index persists")
        # verify search results persist
        collection_w2.load()
        search_res, _ = collection_w.search(
            data=search_vectors,
            anns_field=ct.default_float_vec_field_name,
            param=search_params,
            limit=1)
        tt = datetime.datetime.now() - t0
        log.info(f"assert search: {tt}")
        assert len(search_res) == 1
        # verify query results persist
        query_res2, _ = collection_w2.query(term_expr)
        assert len(query_res2) == len(query_res)
        log.info("assert query result persists")
コード例 #4
0
class TestChaosData:
    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        connections.add_connection(default={"host": host, "port": port})
        conn = connections.connect(alias='default')
        if conn is None:
            raise Exception("no connections")
        return conn

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_querynode(self, connection, chaos_yaml):
        """
        target: explore query node behavior after memory stress chaos injected and recovered
        method: 1. Create a collection, insert some data
                2. Inject memory stress chaos
                3. Start a threas to load, search and query
                4. After chaos duration, check query search success rate
                5. Delete chaos or chaos finished finally
        expected: 1.If memory is insufficient, querynode is OOMKilled and available after restart
                  2.If memory is sufficient, succ rate of query and search both are 1.0
        """
        c_name = 'chaos_memory_nx6DNW4q'
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(c_name)
        log.debug(collection_w.schema)
        log.debug(collection_w._shards_num)

        # apply memory stress chaos
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")
        duration = chaos_config.get('spec').get('duration')
        duration = duration.replace('h', '*3600+').replace(
            'm', '*60+').replace('s', '*1+') + '+0'
        meta_name = chaos_config.get('metadata').get('name')
        # wait memory stress
        sleep(constants.WAIT_PER_OP * 2)

        # try to do release, load, query and serach in a duration time loop
        try:
            start = time.time()
            while time.time() - start < eval(duration):
                collection_w.release()
                collection_w.load()

                term_expr = f'{ct.default_int64_field_name} in {[random.randint(0, 100)]}'
                query_res, _ = collection_w.query(term_expr)
                assert len(query_res) == 1

                search_res, _ = collection_w.search(
                    cf.gen_vectors(1, ct.default_dim),
                    ct.default_float_vec_field_name, ct.default_search_params,
                    ct.default_limit)
                log.debug(search_res[0].ids)
                assert len(search_res[0].ids) == ct.default_limit

        except Exception as e:
            raise Exception(str(e))

        finally:
            chaos_res.delete(meta_name)

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_datanode(self, chaos_yaml):
        """
        target: test inject memory stress into dataNode
        method: 1.Deploy milvus and limit datanode memory resource
                2.Create collection and insert some data
                3.Inject memory stress chaos
                4.Continue to insert data
        expected:
        """
        # init collection and insert 250 nb
        nb = 25000
        dim = 512
        c_name = cf.gen_unique_str('chaos_memory')
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(
            name=c_name, schema=cf.gen_default_collection_schema(dim=dim))
        for i in range(10):
            t0 = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb
            log.info(
                f'After {i + 1} insert, num_entities: {collection_w.num_entities}'
            )
            tt = datetime.datetime.now() - t0
            log.info(f"{i} insert and flush data cost: {tt}")

        # inject memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")

        # Continue to insert data
        collection_w.insert(df)
        log.info(f'Total num entities: {collection_w.num_entities}')

        # delete chaos
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(metadata_name=meta_name)

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_indexnode(self, connection, chaos_yaml):
        """
        target: test inject memory stress into indexnode
        method: 1.Deploy milvus and limit indexnode memory resource 3 / 4Gi
                2.Create collection and insert some data
                3.Inject memory stress chaos 512Mi
                4.Create index
        expected:
        """
        # init collection and insert
        nb = 256000  # vector size: 512*4*nb about 512Mi and create index need 2.8Gi memory
        dim = 512
        # c_name = cf.gen_unique_str('chaos_memory')
        c_name = 'chaos_memory_gKs8aSUu'
        index_params = {
            "index_type": "IVF_SQ8",
            "metric_type": "L2",
            "params": {
                "nlist": 128
            }
        }

        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(
            name=c_name,
            schema=cf.gen_default_collection_schema(dim=dim),
            shards_num=1)

        # insert 256000 512 dim entities, size 512Mi
        for i in range(2):
            t0_insert = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb // 2, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb // 2
            # log.info(f'After {i + 1} insert, num_entities: {collection_w.num_entities}')
            tt_insert = datetime.datetime.now() - t0_insert
            log.info(f"{i} insert data cost: {tt_insert}")

        # flush
        t0_flush = datetime.datetime.now()
        assert collection_w.num_entities == nb
        tt_flush = datetime.datetime.now() - t0_flush
        log.info(f'flush {nb * 10} entities cost: {tt_flush}')

        log.info(collection_w.indexes[0].params)
        if collection_w.has_index()[0]:
            collection_w.drop_index()

        # indexNode start build index, inject chaos memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("inject chaos")

        # create index
        t0_index = datetime.datetime.now()
        index, _ = collection_w.create_index(
            field_name=ct.default_float_vec_field_name,
            index_params=index_params)
        tt_index = datetime.datetime.now() - t0_index

        log.info(f"create index cost: {tt_index}")
        log.info(collection_w.indexes[0].params)

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', cc.get_chaos_yamls())
    def test_chaos_memory_stress_etcd(self, chaos_yaml):
        """
        target: test inject memory stress into all etcd pods
        method: 1.Deploy milvus and limit etcd memory resource 1Gi witl all mode
                2.Continuously and concurrently do milvus operations
                3.Inject memory stress chaos 51024Mi
                4.After duration, delete chaos stress
        expected: Verify milvus operation succ rate
        """
        mic_checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertFlushChecker(),
            Op.flush: InsertFlushChecker(flush=True),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker()
        }
        # start thread keep running milvus op
        start_monitor_threads(mic_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        # duration = chaos_config["spec"]["duration"]
        meta_name = chaos_config.get('metadata').get('name')
        duration = chaos_config.get('spec').get('duration')

        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("Chaos injected")

        # convert string duration time to an int number in seconds
        if isinstance(duration, str):
            duration = duration.replace('h', '*3600+').replace(
                'm', '*60+').replace('s', '*1+') + '+0'
        else:
            log.error("Duration must be string type")

        # Delete experiment after it's over
        timer = threading.Timer(interval=eval(duration),
                                function=chaos_res.delete,
                                args=(meta_name, False))
        timer.start()
        timer.join()

        # output milvus op succ rate
        for k, ch in mic_checkers.items():
            log.debug(f'Succ rate of {k.value}: {ch.succ_rate()}')
            assert ch.succ_rate() == 1.0
コード例 #5
0
class TestChaosData:
    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        connections.add_connection(default={"host": host, "port": port})
        conn = connections.connect(alias='default')
        if conn is None:
            raise Exception("no connections")
        return conn

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_querynode(self, connection, chaos_yaml):
        """
        target: explore querynode behavior after memory stress chaos injected and recovered
        method: 1. create a collection, insert some data
                2. inject memory stress chaos
                3. load collection and search, query
                4. todo (verify querynode response)
                5. delete chaos or chaos finished
                6. release and reload collection, verify search and query is available
        expected: after chaos deleted, load, search and query qre both available
        """
        c_name = cf.gen_unique_str('chaos_memory')
        collection_w = construct_from_data(c_name)
        log.debug(collection_w.schema)

        # apply memory stress
        apply_memory_stress(chaos_yaml)

        # wait memory stress
        sleep(constants.WAIT_PER_OP * 2)

        # query
        collection_w.release()
        collection_w.load()
        term_expr = f'{ct.default_int64_field_name} in [0, 1, 999, 99]'
        t0 = datetime.datetime.now()
        query_res, _ = collection_w.query(term_expr)
        tt = datetime.datetime.now() - t0
        log.info(f"assert query: {tt}")
        assert len(query_res) == 4

        sleep(constants.WAIT_PER_OP * 5)

        # query
        collection_w.release()
        collection_w.load()
        term_expr = f'{ct.default_int64_field_name} in [0, 1, 999, 99]'
        t0 = datetime.datetime.now()
        query_res, _ = collection_w.query(term_expr)
        tt = datetime.datetime.now() - t0
        log.info(f"assert query: {tt}")
        assert len(query_res) == 4

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_datanode(self, chaos_yaml):
        """
        target: test inject memory stress into dataNode
        method: 1.Deploy milvus and limit datanode memory resource
                2.Create collection and insert some data
                3.Inject memory stress chaos
                4.Continue to insert data
        expected:
        """
        # init collection and insert 250 nb
        nb = 25000
        dim = 512
        c_name = cf.gen_unique_str('chaos_memory')
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(
            name=c_name, schema=cf.gen_default_collection_schema(dim=dim))
        for i in range(10):
            t0 = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb
            log.info(
                f'After {i+1} insert, num_entities: {collection_w.num_entities}'
            )
            tt = datetime.datetime.now() - t0
            log.info(f"{i} insert and flush data cost: {tt}")

        # inject memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")

        # Continue to insert data
        collection_w.insert(df)
        log.info(f'Total num entities: {collection_w.num_entities}')

        # delete chaos
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(metadata_name=meta_name)

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_indexnode(self, connection, chaos_yaml):
        """
        target: test inject memory stress into indexnode
        method: 1.Deploy milvus and limit indexnode memory resource 1Gi
                2.Create collection and insert some data
                3.Create index
                4.Inject memory stress chaos 512Mi
        expected:
        """
        # init collection and insert 250 nb
        nb = 50000  # vector size: 512*4*nb about 100Mi and create index need 600Mi memory
        dim = 512
        c_name = cf.gen_unique_str('chaos_memory')
        index_params = {
            "index_type": "IVF_SQ8",
            "metric_type": "L2",
            "params": {
                "nlist": 128
            }
        }

        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(
            name=c_name,
            schema=cf.gen_default_collection_schema(dim=dim),
            shards_num=1)

        # insert 256000 512 dim entities 512Mi
        for i in range(2):
            t0_insert = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb // 2, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb // 2
            # log.info(f'After {i + 1} insert, num_entities: {collection_w.num_entities}')
            tt_insert = datetime.datetime.now() - t0_insert
            log.info(f"{i} insert data cost: {tt_insert}")

        # flush
        t0_flush = datetime.datetime.now()
        assert collection_w.num_entities == nb
        tt_flush = datetime.datetime.now() - t0_flush
        log.info(f'flush {nb * 10} entities cost: {tt_flush}')

        # create index
        # index
        t0_index = datetime.datetime.now()
        index, _ = collection_w.create_index(
            field_name=ct.default_float_vec_field_name,
            index_params=index_params)
        tt_index = datetime.datetime.now() - t0_index

        log.info(f"create index cost: {tt_index}")
        log.info(collection_w.indexes)

        # indexNode start build index, inject chaos memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("inject chaos")
コード例 #6
0
ファイル: test_chaos.py プロジェクト: symphony233/milvus
class TestChaos(TestChaosBase):
    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        connections.add_connection(default={"host": host, "port": port})
        conn = connections.connect(alias='default')
        if conn is None:
            raise Exception("no connections")
        self.host = host
        self.port = port
        return conn

    @pytest.fixture(scope="function", autouse=True)
    def init_health_checkers(self, connection):
        checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertFlushChecker(),
            Op.flush: InsertFlushChecker(flush=True),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker()
        }
        self.health_checkers = checkers

    def teardown(self):
        chaos_res = CusResource(kind=self._chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        meta_name = self._chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(meta_name, raise_ex=False)
        for k, ch in self.health_checkers.items():
            ch.terminate()
            log.info(f"tear down: checker {k} terminated")
        sleep(2)
        for k, t in self.checker_threads.items():
            log.info(f"Thread {k} is_alive(): {t.is_alive()}")

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', cc.get_chaos_yamls())
    def test_chaos(self, chaos_yaml):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        self.checker_threads = cc.start_monitor_threads(self.health_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")

        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False
        # init report
        meta_name = chaos_config.get('metadata', None).get('name', None)
        dir_name = "./reports"
        file_name = f"./reports/{meta_name}.log"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)

        # assert statistic:all ops 100% succ
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("1st assert before chaos: ")
            f.write(f"{self.health_checkers}\n")
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        log.info(f"chaos information: {chaos_res.get(meta_name)}")
        sleep(constants.WAIT_PER_OP * 2.1)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # wait 40s
        sleep(constants.CHAOS_DURATION)

        for k, t in self.checker_threads.items():
            log.info(f"10s later: Thread {k} is_alive(): {t.is_alive()}")

        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={
                             Op.create: self.expect_create,
                             Op.insert: self.expect_insert,
                             Op.flush: self.expect_flush,
                             Op.index: self.expect_index,
                             Op.search: self.expect_search,
                             Op.query: self.expect_query
                         })
        with open(file_name, "a+") as f:
            f.write("2nd assert after chaos injected:")
            f.write(f"{self.health_checkers}\n")
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        for k, t in self.checker_threads.items():
            log.info(f"Thread {k} is_alive(): {t.is_alive()}")
        sleep(2)

        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')

        # reset counting again
        cc.reset_counting(self.health_checkers)

        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)

        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("3rd assert after chaos deleted:")
            f.write(f"{self.health_checkers}\n")
        # assert all expectations
        assert_expectations()

        log.info(
            "*********************Chaos Test Completed**********************")