def test_chaos(self, chaos_yaml): # start the monitor threads to check the milvus ops log.info("*********************Chaos Test Start**********************") log.info(connections.get_connection_addr('default')) cc.start_monitor_threads(self.health_checkers) # parse chaos object chaos_config = cc.gen_experiment_config(chaos_yaml) meta_name = chaos_config.get('metadata', None).get('name', None) release_name = meta_name chaos_config_str = json.dumps(chaos_config) chaos_config_str = chaos_config_str.replace("milvus-chaos", release_name) chaos_config = json.loads(chaos_config_str) self._chaos_config = chaos_config # cache the chaos config for tear down log.info(f"chaos_config: {chaos_config}") # parse the test expectations in testcases.yaml if self.parser_testcase_config(chaos_yaml, chaos_config) is False: log.error("Fail to get the testcase info in testcases.yaml") assert False # init report dir_name = "./reports" file_name = f"./reports/{meta_name}.log" if not os.path.exists(dir_name): os.makedirs(dir_name) # wait 20s sleep(constants.WAIT_PER_OP * 2) # assert statistic:all ops 100% succ log.info("******1st assert before chaos: ") assert_statistic(self.health_checkers) with open(file_name, "a+") as f: ts = time.strftime("%Y-%m-%d %H:%M:%S") f.write(f"{meta_name}-{ts}\n") f.write("1st assert before chaos:\n") f.write(record_results(self.health_checkers)) # apply chaos object chaos_res = CusResource(kind=chaos_config['kind'], group=constants.CHAOS_GROUP, version=constants.CHAOS_VERSION, namespace=constants.CHAOS_NAMESPACE) chaos_res.create(chaos_config) log.info("chaos injected") log.info(f"chaos information: {chaos_res.get(meta_name)}") sleep(constants.WAIT_PER_OP * 2) # reset counting cc.reset_counting(self.health_checkers) # wait 40s sleep(constants.CHAOS_DURATION) log.info(f'Alive threads: {threading.enumerate()}') # assert statistic log.info("******2nd assert after chaos injected: ") assert_statistic(self.health_checkers, expectations={Op.create: self.expect_create, Op.insert: self.expect_insert, Op.flush: self.expect_flush, Op.index: self.expect_index, Op.search: self.expect_search, Op.query: self.expect_query }) with open(file_name, "a+") as f: f.write("2nd assert after chaos injected:\n") f.write(record_results(self.health_checkers)) # delete chaos chaos_res.delete(meta_name) log.info("chaos deleted") log.info(f'Alive threads: {threading.enumerate()}') sleep(2) # wait all pods ready log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}") wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}") log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}") wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}") log.info("all pods are ready") # reconnect if needed sleep(constants.WAIT_PER_OP * 2) cc.reconnect(connections, alias='default') # reset counting again cc.reset_counting(self.health_checkers) # wait 50s (varies by feature) sleep(constants.WAIT_PER_OP * 5) # assert statistic: all ops success again log.info("******3rd assert after chaos deleted: ") assert_statistic(self.health_checkers) with open(file_name, "a+") as f: f.write("3rd assert after chaos deleted:\n") f.write(record_results(self.health_checkers)) # assert all expectations assert_expectations() log.info("*********************Chaos Test Completed**********************")
def test_chaos(self, chaos_yaml): # start the monitor threads to check the milvus ops log.info("*********************Chaos Test Start**********************") log.info(connections.get_connection_addr('default')) self.checker_threads = cc.start_monitor_threads(self.health_checkers) # parse chaos object chaos_config = cc.gen_experiment_config(chaos_yaml) self._chaos_config = chaos_config # cache the chaos config for tear down log.info(f"chaos_config: {chaos_config}") # parse the test expectations in testcases.yaml if self.parser_testcase_config(chaos_yaml) is False: log.error("Fail to get the testcase info in testcases.yaml") assert False # init report meta_name = chaos_config.get('metadata', None).get('name', None) dir_name = "./reports" file_name = f"./reports/{meta_name}.log" if not os.path.exists(dir_name): os.makedirs(dir_name) # wait 20s sleep(constants.WAIT_PER_OP * 2) # assert statistic:all ops 100% succ log.info("******1st assert before chaos: ") assert_statistic(self.health_checkers) with open(file_name, "a+") as f: f.write("1st assert before chaos: ") f.write(f"{self.health_checkers}\n") # apply chaos object chaos_res = CusResource(kind=chaos_config['kind'], group=constants.CHAOS_GROUP, version=constants.CHAOS_VERSION, namespace=constants.CHAOS_NAMESPACE) chaos_res.create(chaos_config) log.info("chaos injected") log.info(f"chaos information: {chaos_res.get(meta_name)}") sleep(constants.WAIT_PER_OP * 2.1) # reset counting cc.reset_counting(self.health_checkers) # wait 40s sleep(constants.CHAOS_DURATION) for k, t in self.checker_threads.items(): log.info(f"10s later: Thread {k} is_alive(): {t.is_alive()}") # assert statistic log.info("******2nd assert after chaos injected: ") assert_statistic(self.health_checkers, expectations={ Op.create: self.expect_create, Op.insert: self.expect_insert, Op.flush: self.expect_flush, Op.index: self.expect_index, Op.search: self.expect_search, Op.query: self.expect_query }) with open(file_name, "a+") as f: f.write("2nd assert after chaos injected:") f.write(f"{self.health_checkers}\n") # delete chaos chaos_res.delete(meta_name) log.info("chaos deleted") for k, t in self.checker_threads.items(): log.info(f"Thread {k} is_alive(): {t.is_alive()}") sleep(2) # reconnect if needed sleep(constants.WAIT_PER_OP * 2) cc.reconnect(connections, alias='default') # reset counting again cc.reset_counting(self.health_checkers) # wait 50s (varies by feature) sleep(constants.WAIT_PER_OP * 5) # assert statistic: all ops success again log.info("******3rd assert after chaos deleted: ") assert_statistic(self.health_checkers) with open(file_name, "a+") as f: f.write("3rd assert after chaos deleted:") f.write(f"{self.health_checkers}\n") # assert all expectations assert_expectations() log.info( "*********************Chaos Test Completed**********************")
def test_chaos_data_consist(self, connection, chaos_yaml): """ target: verify data consistence after chaos injected and recovered method: 1. create a collection, insert some data, search and query 2. inject a chaos object 3. reconnect to service 4. verify a) data entities persists, index persists, b) search and query results persist expected: collection data and results persist """ c_name = cf.gen_unique_str('chaos_collection_') nb = 5000 i_name = cf.gen_unique_str('chaos_index_') index_params = { "index_type": "IVF_SQ8", "metric_type": "L2", "params": { "nlist": 64 } } # create t0 = datetime.datetime.now() collection_w = ApiCollectionWrapper() collection_w.init_collection(name=c_name, schema=cf.gen_default_collection_schema()) tt = datetime.datetime.now() - t0 log.info(f"assert create: {tt}") assert collection_w.name == c_name # insert data = cf.gen_default_list_data(nb=nb) t0 = datetime.datetime.now() _, res = collection_w.insert(data) tt = datetime.datetime.now() - t0 log.info(f"assert insert: {tt}") assert res # flush t0 = datetime.datetime.now() assert collection_w.num_entities == nb tt = datetime.datetime.now() - t0 log.info(f"assert flush: {tt}") # search collection_w.load() search_vectors = cf.gen_vectors(1, ct.default_dim) t0 = datetime.datetime.now() search_params = {"metric_type": "L2", "params": {"nprobe": 16}} search_res, _ = collection_w.search( data=search_vectors, anns_field=ct.default_float_vec_field_name, param=search_params, limit=1) tt = datetime.datetime.now() - t0 log.info(f"assert search: {tt}") assert len(search_res) == 1 # index t0 = datetime.datetime.now() index, _ = collection_w.create_index( field_name=ct.default_float_vec_field_name, index_params=index_params, name=i_name) tt = datetime.datetime.now() - t0 log.info(f"assert index: {tt}") assert len(collection_w.indexes) == 1 # query term_expr = f'{ct.default_int64_field_name} in [1001,1201,999,99]' t0 = datetime.datetime.now() query_res, _ = collection_w.query(term_expr) tt = datetime.datetime.now() - t0 log.info(f"assert query: {tt}") assert len(query_res) == 4 # reboot a pod reboot_pod(chaos_yaml) # parse chaos object chaos_config = cc.gen_experiment_config(chaos_yaml) meta_name = chaos_config.get('metadata', None).get('name', None) # wait all pods ready log.info( f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}" ) wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}") log.info( f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}" ) wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}") log.info("all pods are ready") # reconnect if needed sleep(constants.WAIT_PER_OP * 3) reconnect(connections, alias='default') # verify collection persists assert utility.has_collection(c_name) log.info("assert collection persists") collection_w2 = ApiCollectionWrapper() collection_w2.init_collection(c_name) # verify data persist assert collection_w2.num_entities == nb log.info("assert data persists") # verify index persists assert collection_w2.has_index(i_name) log.info("assert index persists") # verify search results persist collection_w2.load() search_res, _ = collection_w.search( data=search_vectors, anns_field=ct.default_float_vec_field_name, param=search_params, limit=1) tt = datetime.datetime.now() - t0 log.info(f"assert search: {tt}") assert len(search_res) == 1 # verify query results persist query_res2, _ = collection_w2.query(term_expr) assert len(query_res2) == len(query_res) log.info("assert query result persists")
def test_bulk_load(self, chaos_type, target_component): # start the monitor threads to check the milvus ops log.info("*********************Chaos Test Start**********************") log.info(connections.get_connection_addr('default')) release_name = self.instance_name cc.start_monitor_threads(self.health_checkers) chaos_config = cc.gen_experiment_config( f"{str(Path(__file__).absolute().parent)}/chaos_objects/{chaos_type}/chaos_{target_component}_{chaos_type}.yaml" ) chaos_config['metadata']['name'] = f"test-bulk-load-{int(time.time())}" kind = chaos_config['kind'] meta_name = chaos_config.get('metadata', None).get('name', None) update_key_value(chaos_config, "release", release_name) update_key_value(chaos_config, "app.kubernetes.io/instance", release_name) self._chaos_config = chaos_config # cache the chaos config for tear down log.info(f"chaos_config: {chaos_config}") # wait 20s sleep(constants.WAIT_PER_OP * 10) # assert statistic:all ops 100% succ log.info("******1st assert before chaos: ") assert_statistic(self.health_checkers) # apply chaos object chaos_res = CusResource(kind=chaos_config['kind'], group=constants.CHAOS_GROUP, version=constants.CHAOS_VERSION, namespace=constants.CHAOS_NAMESPACE) chaos_res.create(chaos_config) log.info("chaos injected") sleep(constants.WAIT_PER_OP * 10) # reset counting cc.reset_counting(self.health_checkers) # wait 120s sleep(constants.CHAOS_DURATION) log.info(f'Alive threads: {threading.enumerate()}') # assert statistic log.info("******2nd assert after chaos injected: ") assert_statistic(self.health_checkers, expectations={ Op.bulk_load: constants.FAIL, }) # delete chaos chaos_res.delete(meta_name) log.info("chaos deleted") sleep(2) # wait all pods ready log.info( f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={release_name}" ) wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={release_name}") log.info( f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={release_name}" ) wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={release_name}") log.info("all pods are ready") # reconnect if needed sleep(constants.WAIT_PER_OP * 2) cc.reconnect(connections, alias='default') # recheck failed tasks in third assert self.health_checkers[Op.bulk_load].recheck_failed_task = True # reset counting again cc.reset_counting(self.health_checkers) # wait 50s (varies by feature) sleep(constants.WAIT_PER_OP * 10) # assert statistic: all ops success again log.info("******3rd assert after chaos deleted: ") assert_statistic(self.health_checkers) # assert all expectations assert_expectations() log.info( "*********************Chaos Test Completed**********************")