Example #1
def export_pod_logs(namespace, label_selector, release_name=None):
    """
    export pod logs selected by label_selector to '/tmp/milvus_logs'

    :param namespace: the namespace where the release is deployed
    :type namespace: str

    :param label_selector: labels to restrict which pods logs to export
    :type label_selector: str

    :param release_name: use the release name as the server logs directory name
    :type release_name: str (optional)

    :example:
            >>> export_pod_logs("chaos-testing", "app.kubernetes.io/instance=mic-milvus")
    """
    if release_name is not None:
        if not isinstance(release_name, str):
            raise TypeError("Got an unexpected non-string release_name")
        if len(release_name.strip()) == 0:
            raise ValueError("Got an unexpected empty release_name")
    pod_log_path = '/tmp/milvus_logs' if release_name is None else f'/tmp/milvus_logs/{release_name}'

    os.makedirs(pod_log_path, exist_ok=True)

    # get pods and export logs
    items = get_pod_list(namespace, label_selector=label_selector)
    try:
        for item in items:
            pod_name = item.metadata.name
            os.system(
                f'kubectl logs {pod_name} -n {namespace} > {pod_log_path}/{pod_name}.log 2>&1'
            )
    except Exception as e:
        log.error(f"Exception when exporting pod logs: {e}")
        raise Exception(str(e))
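A note on the shell call above: os.system swallows failures silently. A sketch of the same loop using subprocess instead (my substitution, not the original code):

import subprocess

for item in items:
    pod_name = item.metadata.name
    with open(f'{pod_log_path}/{pod_name}.log', 'w') as f:
        # kubectl must be on PATH and pointed at the right cluster
        subprocess.run(['kubectl', 'logs', pod_name, '-n', namespace],
                       stdout=f, stderr=subprocess.STDOUT, check=False)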
Example #2
    def _update_configs(configs, template=None):
        """
        Method: update the template with customized configs
        Params:
            configs: a dict of configurations describing the properties of the milvus deployment
            template: optional; pass the template file location if there is a template to apply
        Return: a dict of customized configs
        """
        if not isinstance(configs, dict):
            log.error("customize configurations must be in dict type")
            return None

        if template is None:
            d_configs = benedict()
            d_configs['apiVersion'] = f'{MILVUS_GRP}/{MILVUS_VER}'
            d_configs['kind'] = MILVUS_KIND
        else:
            d_configs = benedict.from_yaml(template)

        # apply the customized keys; benedict expands dotted keypaths into nested dicts
        for key, value in configs.items():
            d_configs[key] = value

        # return the underlying plain dict when benedict exposes one
        return d_configs._dict if d_configs._dict is not None else d_configs
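A minimal usage sketch, assuming python-benedict is installed, the MILVUS_* constants are defined as in the original module, and the helper is a staticmethod that can be called directly; the keys are illustrative:

configs = {
    'metadata.name': 'my-release',               # benedict expands dotted keypaths
    'spec.components.dataNode.replicas': 2,
}
customized = _update_configs(configs)
# customized['metadata'] == {'name': 'my-release'}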
Example #3
    def check_query_results(query_res, func_name, check_items):
        """
        Check the actual query result returned from func_name against check_items.

        :param query_res: a list that contains all query results
        :type query_res: list

        :param func_name: query API name
        :type func_name: str

        :param check_items: the items expected to be checked, including exp_res and with_vec.
                            The exp_res value has the same type as query_res.
                            The with_vec value is a bool: True means the vector field is
                            checked as well, False means it is ignored.
        :type check_items: dict
        """
        if func_name != 'query':
            log.warning("The function name is {} rather than {}".format(
                func_name, "query"))
        if not isinstance(query_res, list):
            raise Exception("The query result to check isn't a list")
        if len(check_items) == 0:
            raise Exception("No expected values found in the check task")
        exp_res = check_items.get("exp_res", None)
        with_vec = check_items.get("with_vec", False)
        primary_field = check_items.get("primary_field", None)
        if exp_res is not None:
            assert pc.equal_entities_list(exp=exp_res,
                                          actual=query_res,
                                          primary_field=primary_field,
                                          with_vec=with_vec)
            return True
        log.warning("No expected query result (exp_res) provided; nothing to compare")
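A hypothetical check task for this helper (it is presumably a staticmethod on a checker class, called directly here for illustration; the field names and values are made up):

query_res = [{"int64": 0}, {"int64": 1}]
check_items = {"exp_res": [{"int64": 0}, {"int64": 1}],
               "primary_field": "int64",
               "with_vec": False}
assert check_query_results(query_res, "query", check_items)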
Example #4
def read_pod_log(namespace, label_selector, release_name):
    init_k8s_client_config()
    items = get_pod_list(namespace, label_selector=label_selector)

    try:
        # export logs to /tmp/milvus_logs/<release_name>
        pod_log_path = f'/tmp/milvus_logs/{release_name}'
        if not os.path.isdir(pod_log_path):
            os.makedirs(pod_log_path)

        api_instance = client.CoreV1Api()

        for item in items:
            pod = item.metadata.name
            log.debug(f'Start to read {pod} log')
            # async_req=True returns a future-like object; .get() blocks until the log arrives
            logs = api_instance.read_namespaced_pod_log(name=pod,
                                                        namespace=namespace,
                                                        async_req=True)
            with open(f'{pod_log_path}/{pod}.log', "w") as f:
                f.write(logs.get())

    except ApiException as e:
        log.error(f"Exception when reading pod logs: {e}")
        raise Exception(str(e))
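A hypothetical invocation; the namespace, label, and release name are placeholders:

read_pod_log(namespace="chaos-testing",
             label_selector="app.kubernetes.io/instance=my-release",
             release_name="my-release")
# logs are written to /tmp/milvus_logs/my-release/<pod>.log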
Example #5
 def check_search_results(search_res, func_name, check_items):
     """
     target: check the search results
     method: 1. check the query number
             2. check the limit(topK) and ids
             3. check the distance
     expected: check the search is ok
     """
     log.info("search_results_check: checking the searching results")
     if func_name != 'search':
         log.warning("The function name is {} rather than {}".format(
             func_name, "search"))
     if len(check_items) == 0:
         raise Exception("No expect values found in the check task")
     if check_items.get("_async", None):
         if check_items["_async"]:
             search_res.done()
             search_res = search_res.result()
     if len(search_res) != check_items["nq"]:
         log.error("search_results_check: Numbers of query searched (%d) "
                   "is not equal with expected (%d)" %
                   (len(search_res), check_items["nq"]))
         assert len(search_res) == check_items["nq"]
     else:
         log.info(
             "search_results_check: number of queries searched is correct")
     for hits in search_res:
         if (len(hits) != check_items["limit"]) \
                 or (len(hits.ids) != check_items["limit"]):
             log.error("search_results_check: limit(topK) searched (%d) "
                       "is not equal with expected (%d)" %
                       (len(hits), check_items["limit"]))
             assert len(hits) == check_items["limit"]
             assert len(hits.ids) == check_items["limit"]
         else:
             if check_items.get("ids", None) is not None:
                 ids_match = pc.list_contain_check(list(hits.ids),
                                                   list(check_items["ids"]))
                 if not ids_match:
                     log.error(
                         "search_results_check: ids searched not match")
                     assert ids_match
             else:
                 pass  # only nq and topK are checked; no specific ids to verify
     log.info("search_results_check: limit (topK) and "
              "ids searched for %d queries are correct" % len(search_res))
     return True
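A hypothetical check task; search_res stands for the object returned by collection.search(...), and the values shown are illustrative:

check_items = {"nq": 1, "limit": 10, "_async": False}
assert check_search_results(search_res, "search", check_items)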
Example #6
def ip_check(ip):
    if ip == "localhost":
        return True

    if not isinstance(ip, str):
        log.error("[IP_CHECK] IP(%s) is not a string." % ip)
        return False

    octets = ip.split('.')
    if len(octets) != 4:
        log.error("[IP_CHECK] IP(%s) is wrong, please check manually." % ip)
        return False

    for octet in octets:
        # each octet must be a decimal number in the range [0, 255]
        if not octet.isdigit() or int(octet) > 255:
            log.error("[IP_CHECK] IP(%s) is wrong, please check manually." % ip)
            return False

    return True
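A few illustrative calls (assuming the module-level log object is configured):

assert ip_check("localhost")
assert ip_check("10.0.0.1")
assert not ip_check("10.0.0")        # too few octets
assert not ip_check("10.0.0.999")    # octet out of range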
Example #7
def check_content(request):
    log.error("^" * 50)
    log.error("check_content")
    return request.config.getoption("--check_content")
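For --check_content to be a recognized option, the suite's conftest.py has to register it first; a minimal sketch (the help text is an assumption):

def pytest_addoption(parser):
    parser.addoption("--check_content", action="store", default=None,
                     help="expected content for content checks")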
Example #8
def restart_server(helm_release_name):
    res = True
    timeout = 120
    from kubernetes import client, config
    import logging
    client.rest.logger.setLevel(logging.WARNING)

    config.load_kube_config()
    v1 = client.CoreV1Api()
    pod_name = None
    # NOTE: `namespace` is assumed to be defined at module scope in the original file
    pods = v1.list_namespaced_pod(namespace)
    for i in pods.items:
        if i.metadata.name.find(
                helm_release_name) != -1 and i.metadata.name.find(
                    "mysql") == -1:
            pod_name = i.metadata.name
            break
    log.debug("Pod name: %s" % pod_name)
    if pod_name is not None:
        try:
            v1.delete_namespaced_pod(pod_name, namespace)
        except Exception as e:
            log.error(str(e))
            log.error(
                "Exception when calling CoreV1Api->delete_namespaced_pod")
            res = False
            return res
        log.error("Sleep 10s after pod deleted")
        time.sleep(10)
        # check if restart successfully
        pods = v1.list_namespaced_pod(namespace)
        for i in pods.items:
            pod_name_tmp = i.metadata.name
            log.debug(pod_name_tmp)
            if pod_name_tmp == pod_name:
                continue
            elif pod_name_tmp.find(helm_release_name
                                   ) == -1 or pod_name_tmp.find("mysql") != -1:
                continue
            else:
                status_res = v1.read_namespaced_pod_status(pod_name_tmp,
                                                           namespace,
                                                           pretty='true')
                log.debug(status_res.status.phase)
                start_time = time.time()
                ready_break = False
                while time.time() - start_time <= timeout:
                    log.debug(time.time())
                    status_res = v1.read_namespaced_pod_status(pod_name_tmp,
                                                               namespace,
                                                               pretty='true')
                    if status_res.status.phase == "Running":
                        log.error("Already running")
                        ready_break = True
                        time.sleep(10)
                        break
                    else:
                        time.sleep(1)
                if time.time() - start_time > timeout:
                    log.error("Restart pod: %s timeout" % pod_name_tmp)
                    res = False
                    return res
                if ready_break:
                    break
    else:
        raise Exception("Pod: %s not found" % pod_name)
    return res
Example #9
    def test_scale_data_node(self):
        """
        target: test scale dataNode
        method: 1.deploy milvus cluster with 2 dataNode
                2.create collection with shards_num=5
                3.continuously insert new data (daemon thread)
                4.expand dataNode from 2 to 5
                5.create new collection with shards_num=2
                6.continuously insert new data into the new collection (daemon thread)
                7.shrink dataNode from 5 to 3
        expected: verify milvus remains healthy and insert/flush succeed during scale;
                  observe average dataNode memory usage
        """
        release_name = "scale-data"
        image_tag = get_latest_tag()
        image = f'{constants.IMAGE_REPOSITORY}:{image_tag}'
        fail_count = 0

        data_config = {
            'metadata.namespace': constants.NAMESPACE,
            'metadata.name': release_name,
            'spec.components.image': image,
            'spec.components.proxy.serviceType': 'LoadBalancer',
            'spec.components.dataNode.replicas': 2,
            'spec.config.dataCoord.enableCompaction': True,
            'spec.config.dataCoord.enableGarbageCollection': True
        }
        mic = MilvusOperator()
        mic.install(data_config)
        if mic.wait_for_healthy(release_name,
                                constants.NAMESPACE,
                                timeout=1200):
            host = mic.endpoint(release_name,
                                constants.NAMESPACE).split(':')[0]
        else:
            raise Exception('Milvus healthy timeout 1200s')

        try:
            # connect
            connections.add_connection(default={"host": host, "port": 19530})
            connections.connect(alias='default')

            # create
            c_name = cf.gen_unique_str("scale_query")
            collection_w = ApiCollectionWrapper()
            collection_w.init_collection(
                name=c_name,
                schema=cf.gen_default_collection_schema(),
                shards_num=5)

            tmp_nb = 10000

            def do_insert():
                while True:
                    tmp_df = cf.gen_default_dataframe_data(tmp_nb)
                    collection_w.insert(tmp_df)
                    log.debug(collection_w.num_entities)

            t_insert = threading.Thread(target=do_insert, args=(), daemon=True)
            t_insert.start()

            # scale dataNode to 5
            mic.upgrade(release_name, {'spec.components.dataNode.replicas': 5},
                        constants.NAMESPACE)
            mic.wait_for_healthy(release_name, constants.NAMESPACE)
            wait_pods_ready(constants.NAMESPACE,
                            f"app.kubernetes.io/instance={release_name}")
            log.debug("Expand dataNode test finished")

            # create new collection and insert
            new_c_name = cf.gen_unique_str("scale_query")
            collection_w_new = ApiCollectionWrapper()
            collection_w_new.init_collection(
                name=new_c_name,
                schema=cf.gen_default_collection_schema(),
                shards_num=2)

            def do_new_insert():
                while True:
                    tmp_df = cf.gen_default_dataframe_data(tmp_nb)
                    collection_w_new.insert(tmp_df)
                    log.debug(collection_w_new.num_entities)

            t_insert_new = threading.Thread(target=do_new_insert,
                                            args=(),
                                            daemon=True)
            t_insert_new.start()

            # scale dataNode to 3
            mic.upgrade(release_name, {'spec.components.dataNode.replicas': 3},
                        constants.NAMESPACE)
            mic.wait_for_healthy(release_name, constants.NAMESPACE)
            wait_pods_ready(constants.NAMESPACE,
                            f"app.kubernetes.io/instance={release_name}")

            log.debug(collection_w.num_entities)
            time.sleep(300)
            log.debug("Shrink dataNode test finished")

        except Exception as e:
            log.error(str(e))
            fail_count += 1
            # raise Exception(str(e))

        finally:
            log.info(f'Test finished with {fail_count} failed requests')
            assert fail_count <= 1
            label = f"app.kubernetes.io/instance={release_name}"
            log.info('Start to export milvus pod logs')
            read_pod_log(namespace=constants.NAMESPACE,
                         label_selector=label,
                         release_name=release_name)

            mic.uninstall(release_name, namespace=constants.NAMESPACE)
Example #10
    def test_scale_query_node(self):
        """
        target: test scale queryNode
        method: 1.deploy milvus cluster with 1 queryNode
                2.prepare work (connect, create, insert, index and load)
                3.continuously search (daemon thread)
                4.expand queryNode from 1 to 5
                5.continuously insert new data (daemon thread)
                6.shrink queryNode from 5 to 3
        expected: verify milvus remains healthy and search succeeds during scale
        """
        fail_count = 0
        release_name = "scale-query"
        image_tag = get_latest_tag()
        image = f'{constants.IMAGE_REPOSITORY}:{image_tag}'
        query_config = {
            'metadata.namespace': constants.NAMESPACE,
            'metadata.name': release_name,
            'spec.components.image': image,
            'spec.components.proxy.serviceType': 'LoadBalancer',
            'spec.components.queryNode.replicas': 1,
            'spec.config.dataCoord.enableCompaction': True,
            'spec.config.dataCoord.enableGarbageCollection': True
        }
        mic = MilvusOperator()
        mic.install(query_config)
        if mic.wait_for_healthy(release_name, constants.NAMESPACE, timeout=1200):
            host = mic.endpoint(release_name, constants.NAMESPACE).split(':')[0]
        else:
            raise Exception('Milvus healthy timeout 1200s')

        try:
            # connect
            connections.add_connection(default={"host": host, "port": 19530})
            connections.connect(alias='default')

            # create
            c_name = cf.gen_unique_str("scale_query")
            collection_w = ApiCollectionWrapper()
            collection_w.init_collection(name=c_name, schema=cf.gen_default_collection_schema(), shards_num=2)

            # insert data multiple times to generate multiple segments
            for i in range(3):
                df = cf.gen_default_dataframe_data(nb)
                collection_w.insert(df)
                log.debug(collection_w.num_entities)

            # create index
            collection_w.create_index(ct.default_float_vec_field_name, default_index_params)
            assert collection_w.has_index()[0]
            assert collection_w.index()[0] == Index(collection_w.collection, ct.default_float_vec_field_name,
                                                    default_index_params)

            # load
            collection_w.load()

            # scale queryNode to 5
            mic.upgrade(release_name, {'spec.components.queryNode.replicas': 5}, constants.NAMESPACE)

            # continuously search
            def do_search():
                while True:
                    search_res, _ = collection_w.search(cf.gen_vectors(1, ct.default_dim),
                                                        ct.default_float_vec_field_name,
                                                        ct.default_search_params, ct.default_limit)
                    log.debug(search_res[0].ids)
                    assert len(search_res[0].ids) == ct.default_limit

            t_search = threading.Thread(target=do_search, args=(), daemon=True)
            t_search.start()

            # wait new QN running, continuously insert
            mic.wait_for_healthy(release_name, constants.NAMESPACE)
            wait_pods_ready(constants.NAMESPACE, f"app.kubernetes.io/instance={release_name}")

            def do_insert():
                while True:
                    tmp_df = cf.gen_default_dataframe_data(1000)
                    collection_w.insert(tmp_df)

            t_insert = threading.Thread(target=do_insert, args=(), daemon=True)
            t_insert.start()

            log.debug(collection_w.num_entities)
            time.sleep(20)
            log.debug("Expand querynode test finished")

            mic.upgrade(release_name, {'spec.components.queryNode.replicas': 3}, constants.NAMESPACE)
            mic.wait_for_healthy(release_name, constants.NAMESPACE)
            wait_pods_ready(constants.NAMESPACE, f"app.kubernetes.io/instance={release_name}")

            log.debug(collection_w.num_entities)
            time.sleep(60)
            log.debug("Shrink querynode test finished")

        except Exception as e:
            log.error(str(e))
            fail_count += 1
            # raise Exception(str(e))

        finally:
            log.info(f'Test finished with {fail_count} failed requests')
            assert fail_count <= 1
            label = f"app.kubernetes.io/instance={release_name}"
            log.info('Start to export milvus pod logs')
            read_pod_log(namespace=constants.NAMESPACE, label_selector=label, release_name=release_name)
            mic.uninstall(release_name, namespace=constants.NAMESPACE)
Example #11
    def test_scale_proxy(self):
        """
        target: test milvus operation after proxy expand
        method: 1.deploy with 1 proxy replica
                2.milvus e2e test in parallel
                3.expand proxy pod from 1 to 5
                4.milvus e2e test
                5.shrink proxy from 5 to 2
        expected: 1.verify data is consistent and functions work
        """
        # deploy milvus cluster with one proxy
        fail_count = 0
        release_name = "scale-proxy"
        image_tag = get_latest_tag()
        image = f'{constants.IMAGE_REPOSITORY}:{image_tag}'
        data_config = {
            'metadata.namespace': constants.NAMESPACE,
            'metadata.name': release_name,
            'spec.mode': 'cluster',
            'spec.components.image': image,
            'spec.components.proxy.serviceType': 'LoadBalancer',
            'spec.components.proxy.replicas': 1,
            'spec.components.dataNode.replicas': 2,
            'spec.config.common.retentionDuration': 60
        }
        mic = MilvusOperator()
        mic.install(data_config)
        if mic.wait_for_healthy(release_name, constants.NAMESPACE, timeout=1800):
            host = mic.endpoint(release_name, constants.NAMESPACE).split(':')[0]
        else:
            raise MilvusException(message='Milvus healthy timeout 1800s')

        try:
            c_name = cf.gen_unique_str("proxy_scale")
            e2e_milvus_parallel(2, host, c_name)
            log.info('Milvus test before expand')

            # expand proxy replicas from 1 to 5
            mic.upgrade(release_name, {'spec.components.proxy.replicas': 5}, constants.NAMESPACE)
            mic.wait_for_healthy(release_name, constants.NAMESPACE)
            wait_pods_ready(constants.NAMESPACE, f"app.kubernetes.io/instance={release_name}")

            e2e_milvus_parallel(5, host, c_name)
            log.info('Milvus test after expand')

            # shrink proxy replicas from 5 to 2
            mic.upgrade(release_name, {'spec.components.proxy.replicas': 2}, constants.NAMESPACE)
            mic.wait_for_healthy(release_name, constants.NAMESPACE)
            wait_pods_ready(constants.NAMESPACE, f"app.kubernetes.io/instance={release_name}")

            e2e_milvus_parallel(2, host, c_name)
            log.info('Milvus test after shrink')

            connections.connect('default', host=host, port=19530)
            collection_w = ApiCollectionWrapper()
            collection_w.init_collection(name=c_name)
            """
            total start 2+5+2 process to run e2e, each time insert default_nb data, But one of the 2 processes started
            for the first time did not insert due to collection creation exception. So actually insert eight times
            """
            assert collection_w.num_entities == 8 * default_nb

        except Exception as e:
            log.error(str(e))
            fail_count += 1
            # raise Exception(str(e))

        finally:
            log.info(f'Test finished with {fail_count} failed requests')
            assert fail_count <= 1
            label = f"app.kubernetes.io/instance={release_name}"
            log.info('Start to export milvus pod logs')
            read_pod_log(namespace=constants.NAMESPACE, label_selector=label, release_name=release_name)
            mic.uninstall(release_name, namespace=constants.NAMESPACE)
Example #12
def dict_equal_check(dict1, dict2):
    if not isinstance(dict1, dict) or not isinstance(dict2, dict):
        log.error("[DICT_EQUAL_CHECK] Type of dict(%s) or dict(%s) is not a dict." % (str(dict1), str(dict2)))
        return False
    return dict1 == dict2
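A few illustrative calls:

assert dict_equal_check({"a": 1, "b": 2}, {"b": 2, "a": 1})   # key order does not matter
assert not dict_equal_check({"a": 1}, {"a": 2})
assert not dict_equal_check([("a", 1)], {"a": 1})             # non-dict input logs an error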
Example #13
    def test_chaos(self, chaos_yaml):
        # start the monitor threads to check the milvus ops
        log.debug("*********************Chaos Test Start**********************")
        log.debug(connections.get_connection_addr('default'))
        self.checker_threads = start_monitor_threads(self.health_checkers)

        # parse chaos object
        chaos_config = gen_experiment_config(chaos_yaml)
        self._chaos_config = chaos_config   # cache the chaos config for tear down
        log.debug(chaos_config)

        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False

        # wait 120s
        sleep(constants.WAIT_PER_OP*2)

        # assert statistics: all ops 100% success
        log.debug("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)

        # apply chaos object
        chaos_opt = ChaosOpt(chaos_config['kind'])
        chaos_opt.create_chaos_object(chaos_config)
        log.debug("chaos injected")
        sleep(constants.WAIT_PER_OP * 2.1)
        # reset counting
        reset_counting(self.health_checkers)

        # wait 240s
        sleep(constants.WAIT_PER_OP*4)

        for k, t in self.checker_threads.items():
            log.debug(f"Thread {k} is_alive(): {t.is_alive()}")

        # assert statistic
        log.debug("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={Op.create: self.expect_create,
                                       Op.insert: self.expect_insert,
                                       Op.flush: self.expect_flush,
                                       Op.index: self.expect_index,
                                       Op.search: self.expect_search,
                                       Op.query: self.expect_query
                                       })

        # delete chaos
        meta_name = chaos_config.get('metadata', {}).get('name', None)
        chaos_opt.delete_chaos_object(meta_name)
        log.debug("chaos deleted")
        for k, t in self.checker_threads.items():
            log.debug(f"Thread {k} is_alive(): {t.is_alive()}")
        sleep(2)
        # reconnect if needed
        sleep(constants.WAIT_PER_OP*2)
        reconnect(connections, self.host, self.port)

        # reset counting again
        reset_counting(self.health_checkers)

        # wait 300s (varies by feature)
        sleep(constants.WAIT_PER_OP*5)

        # assert statistic: all ops success again
        log.debug("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)

        # assert all expectations
        assert_expectations()

        log.debug("*********************Chaos Test Completed**********************")
Example #14
    def test_scale_proxy(self):
        """
        target: test milvus operation after proxy expand
        method: 1.deploy with 1 proxy replica
                2.milvus e2e test in parallel
                3.expand proxy pod from 1 to 5
                4.milvus e2e test
                5.shrink proxy from 5 to 2
        expected: 1.verify data is consistent and functions work
        """
        # deploy milvus cluster with one proxy
        fail_count = 0
        release_name = "scale-proxy"
        image_tag = get_latest_tag()
        image = f'{constants.IMAGE_REPOSITORY}:{image_tag}'
        data_config = {
            'metadata.namespace': constants.NAMESPACE,
            'metadata.name': release_name,
            'spec.components.image': image,
            'spec.components.proxy.serviceType': 'LoadBalancer',
            'spec.components.proxy.replicas': 1,
            'spec.components.dataNode.replicas': 2,
            'spec.config.dataCoord.enableCompaction': True,
            'spec.config.dataCoord.enableGarbageCollection': True
        }
        mic = MilvusOperator()
        mic.install(data_config)
        if mic.wait_for_healthy(release_name,
                                constants.NAMESPACE,
                                timeout=1200):
            host = mic.endpoint(release_name,
                                constants.NAMESPACE).split(':')[0]
        else:
            raise Exception('Milvus healthy timeout 1200s')

        try:
            c_name = cf.gen_unique_str(prefix)
            self.e2e_milvus_parallel(5, host, c_name)
            log.info('Milvus test before expand')

            # expand proxy replicas from 1 to 5
            mic.upgrade(release_name, {'spec.components.proxy.replicas': 5},
                        constants.NAMESPACE)
            mic.wait_for_healthy(release_name, constants.NAMESPACE)
            wait_pods_ready(constants.NAMESPACE,
                            f"app.kubernetes.io/instance={release_name}")

            self.e2e_milvus_parallel(5, host, c_name)
            log.info('Milvus test after expand')

            # shrink proxy replicas from 5 to 2
            mic.upgrade(release_name, {'spec.components.proxy.replicas': 2},
                        constants.NAMESPACE)
            mic.wait_for_healthy(release_name, constants.NAMESPACE)
            wait_pods_ready(constants.NAMESPACE,
                            f"app.kubernetes.io/instance={release_name}")

            self.e2e_milvus_parallel(2, host, c_name)
            log.info('Milvus test after shrink')

        except Exception as e:
            log.error(str(e))
            fail_count += 1
            # raise Exception(str(e))

        finally:
            log.info(f'Test finished with {fail_count} failed requests')
            assert fail_count <= 1
            label = f"app.kubernetes.io/instance={release_name}"
            log.info('Start to export milvus pod logs')
            read_pod_log(namespace=constants.NAMESPACE,
                         label_selector=label,
                         release_name=release_name)
            mic.uninstall(release_name, namespace=constants.NAMESPACE)
Example #15
    def test_task_all(self, index_type, is_compacted,
                      segment_status, is_vector_indexed, is_string_indexed, replica_number, is_deleted, data_size):
        """
        before reinstall: create collection and insert data, load and search
        """
        name = ""
        for k, v in locals().items():
            if k in ["self", "name"]:
                continue
            name += f"_{k}_{v}"
        name = prefix + name
        self._connect()
        ms = MilvusSys()
        if len(ms.query_nodes) < replica_number:
            # this step is to make sure this testcase can run on standalone mode
            # or cluster mode which has only one querynode
            pytest.skip("skip test, not enough nodes")

        log.info(f"collection name: {name}, replica_number: {replica_number}, is_compacted: {is_compacted},"
                 f"is_deleted: {is_deleted}, is_vector_indexed: {is_vector_indexed}, is_string_indexed: {is_string_indexed},"
                 f"segment_status: {segment_status}, index_type: {index_type}")

        is_binary = "BIN" in index_type

        # params for search and query
        if is_binary:
            _, vectors_to_search = cf.gen_binary_vectors(
                default_nb, default_dim)
            default_search_field = ct.default_binary_vec_field_name
        else:
            vectors_to_search = cf.gen_vectors(default_nb, default_dim)
            default_search_field = ct.default_float_vec_field_name
        search_params = gen_search_param(index_type)[0]

        # init collection and insert with small size data without flush to get growing segment
        collection_w = self.init_collection_general(insert_data=True, is_binary=is_binary, nb=3000,
                                                    is_flush=False, is_index=True, name=name)[0]
        # load for growing segment
        if replica_number >= 1:
            try:
                collection_w.release()
            except Exception as e:
                log.error(
                    f"release collection failed: {e}; maybe the collection is not loaded")
            collection_w.load(replica_number=replica_number)

        # delete data for growing segment
        delete_expr = f"{ct.default_int64_field_name} in [0,1,2,3,4,5,6,7,8,9]"
        if is_deleted == "is_deleted":
            collection_w.delete(expr=delete_expr)

        # search and query for growing segment
        if replica_number >= 1:
            collection_w.search(vectors_to_search[:default_nq], default_search_field,
                                search_params, default_limit,
                                default_search_exp,
                                check_task=CheckTasks.check_search_results,
                                check_items={"nq": default_nq,
                                             "limit": default_limit})
            output_fields = [ct.default_int64_field_name]
            collection_w.query(default_term_expr, output_fields=output_fields,
                               check_task=CheckTasks.check_query_not_empty)

        # skip subsequent operations when segment_status is set to only_growing
        if segment_status == "only_growing":
            pytest.skip(
                "already get growing segment, skip subsequent operations")

        # insert with flush multiple times to generate multiple sealed segments
        for i in range(2):
            self.init_collection_general(insert_data=True, is_binary=is_binary, nb=data_size,
                                         is_flush=False, is_index=True, name=name)
            collection_w.flush()


        # params for creating index
        if is_binary:
            default_index_field = ct.default_binary_vec_field_name
        else:
            default_index_field = ct.default_float_vec_field_name

        # create index for vector
        if is_vector_indexed == "is_vector_indexed":
            default_index_param = gen_index_param(index_type)
            collection_w.create_index(default_index_field, default_index_param)

        # create index for string
        if is_string_indexed == "is_string_indexed":
            default_string_index_params = {}
            default_string_index_name = "_default_string_idx"
            collection_w.create_index(
                default_string_field_name, default_string_index_params, index_name=default_string_index_name)

        # delete data for sealed segment
        delete_expr = f"{ct.default_int64_field_name} in [10,11,12,13,14,15,16,17,18,19]"
        if is_deleted == "is_deleted":
            collection_w.delete(expr=delete_expr)
        if is_compacted == "is_compacted":
            collection_w.compact()
        if segment_status == "all":
            self.init_collection_general(insert_data=True, is_binary=is_binary, nb=3000,
                                         is_flush=False, is_index=True, name=name)
        # reload after flush and creating index
        if replica_number > 0:
            collection_w.release()
            collection_w.load(replica_number=replica_number)

        # insert data to get growing segment
        if segment_status == "all":
            self.init_collection_general(insert_data=True, is_binary=is_binary, nb=3000,
                                         is_flush=False, is_index=True, name=name)
        
        # search and query for sealed and growing segments
        if replica_number > 0:
            collection_w.search(vectors_to_search[:default_nq], default_search_field,
                                search_params, default_limit,
                                default_search_exp,
                                check_task=CheckTasks.check_search_results,
                                check_items={"nq": default_nq,
                                             "limit": default_limit})
            output_fields = [ct.default_int64_field_name]
            collection_w.query(default_term_expr, output_fields=output_fields,
                               check_task=CheckTasks.check_query_not_empty)
Example #16
 def inner_wrapper(*args, **kwargs):
     try:
         return func(*args, **kwargs), True
     except Exception as e:
         log.error("[ClientRequest API Exception]%s: %s" % (str(func), str(e)))
         return e, False
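This inner function is the body of a decorator; a plausible outer wrapper, with an illustrative name that is not from the original source, would look like:

def api_request_catch(func):
    def inner_wrapper(*args, **kwargs):
        try:
            # on success, return the result together with a True flag
            return func(*args, **kwargs), True
        except Exception as e:
            log.error("[ClientRequest API Exception]%s: %s" % (str(func), str(e)))
            return e, False
    return inner_wrapper

Decorated API calls then return a (result, succeeded) pair instead of raising, which lets callers count failures without crashing.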
Example #17
    def test_chaos(self, chaos_yaml):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        self.checker_threads = cc.start_monitor_threads(self.health_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")

        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False
        # init report
        meta_name = chaos_config.get('metadata', {}).get('name', None)
        dir_name = "./reports"
        file_name = f"./reports/{meta_name}.log"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)

        # assert statistics: all ops 100% success
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("1st assert before chaos: ")
            f.write(f"{self.health_checkers}\n")
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        log.info(f"chaos information: {chaos_res.get(meta_name)}")
        sleep(constants.WAIT_PER_OP * 2.1)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # wait for the configured chaos duration
        sleep(constants.CHAOS_DURATION)

        for k, t in self.checker_threads.items():
            log.info(f"Thread {k} is_alive(): {t.is_alive()}")

        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={
                             Op.create: self.expect_create,
                             Op.insert: self.expect_insert,
                             Op.flush: self.expect_flush,
                             Op.index: self.expect_index,
                             Op.search: self.expect_search,
                             Op.query: self.expect_query
                         })
        with open(file_name, "a+") as f:
            f.write("2nd assert after chaos injected:")
            f.write(f"{self.health_checkers}\n")
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        for k, t in self.checker_threads.items():
            log.info(f"Thread {k} is_alive(): {t.is_alive()}")
        sleep(2)

        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')

        # reset counting again
        cc.reset_counting(self.health_checkers)

        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)

        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("3rd assert after chaos deleted:")
            f.write(f"{self.health_checkers}\n")
        # assert all expectations
        assert_expectations()

        log.info(
            "*********************Chaos Test Completed**********************")
Example #18
    def test_chaos(self, chaos_yaml):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        cc.start_monitor_threads(self.health_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        meta_name = chaos_config.get('metadata', {}).get('name', None)
        release_name = meta_name
        # substitute the release name into the chaos config via a json round-trip
        chaos_config_str = json.dumps(chaos_config)
        chaos_config_str = chaos_config_str.replace("milvus-chaos", release_name)
        chaos_config = json.loads(chaos_config_str)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")
        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml, chaos_config) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False

        # init report
        dir_name = "./reports"
        file_name = f"./reports/{meta_name}.log"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)

        # assert statistics: all ops 100% success
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            ts = time.strftime("%Y-%m-%d %H:%M:%S")
            f.write(f"{meta_name}-{ts}\n")
            f.write("1st assert before chaos:\n")
            f.write(record_results(self.health_checkers))
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        log.info(f"chaos information: {chaos_res.get(meta_name)}")
        sleep(constants.WAIT_PER_OP * 2)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # wait for the configured chaos duration
        sleep(constants.CHAOS_DURATION)

        log.info(f'Alive threads: {threading.enumerate()}')

        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={Op.create: self.expect_create,
                                       Op.insert: self.expect_insert,
                                       Op.flush: self.expect_flush,
                                       Op.index: self.expect_index,
                                       Op.search: self.expect_search,
                                       Op.query: self.expect_query
                                       })
        with open(file_name, "a+") as f:
            f.write("2nd assert after chaos injected:\n")
            f.write(record_results(self.health_checkers))
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        log.info(f'Alive threads: {threading.enumerate()}')
        sleep(2)
        # wait all pods ready
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}")
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
        log.info("all pods are ready")
        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')
        # reset counting again
        cc.reset_counting(self.health_checkers)
        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)
        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("3rd assert after chaos deleted:\n")
            f.write(record_results(self.health_checkers))
        # assert all expectations
        assert_expectations()

        log.info("*********************Chaos Test Completed**********************")
Example #19
    def test_scale_data_node(self):
        """
        target: test scale dataNode
        method: 1.deploy milvus cluster with 2 dataNode
                2.create collection with shards_num=4
                3.continuously insert new data (daemon thread)
                4.expand dataNode from 2 to 5
                5.create new collection with shards_num=3
                6.continuously insert new data into the new collection (daemon thread)
                7.shrink dataNode from 5 to 3
        expected: verify milvus remains healthy and insert/flush succeed during scale;
                  observe average dataNode memory usage
        """
        release_name = "scale-data"
        image_tag = get_latest_tag()
        image = f'{constants.IMAGE_REPOSITORY}:{image_tag}'

        data_config = {
            'metadata.namespace': constants.NAMESPACE,
            'spec.mode': 'cluster',
            'metadata.name': release_name,
            'spec.components.image': image,
            'spec.components.proxy.serviceType': 'LoadBalancer',
            'spec.components.dataNode.replicas': 2,
            'spec.config.common.retentionDuration': 60
        }
        mic = MilvusOperator()
        mic.install(data_config)
        if mic.wait_for_healthy(release_name,
                                constants.NAMESPACE,
                                timeout=1800):
            host = mic.endpoint(release_name,
                                constants.NAMESPACE).split(':')[0]
        else:
            raise MilvusException(message='Milvus healthy timeout 1800s')

        try:
            # connect
            connections.add_connection(default={"host": host, "port": 19530})
            connections.connect(alias='default')

            # create
            c_name = cf.gen_unique_str("scale_data")
            collection_w = ApiCollectionWrapper()
            collection_w.init_collection(
                name=c_name,
                schema=cf.gen_default_collection_schema(),
                shards_num=4)

            tmp_nb = 10000

            @counter
            def do_insert():
                """ do insert and flush """
                insert_res, is_succ = collection_w.insert(
                    cf.gen_default_dataframe_data(tmp_nb))
                log.debug(collection_w.num_entities)
                return insert_res, is_succ

            def loop_insert():
                """ loop do insert """
                while True:
                    do_insert()

            threading.Thread(target=loop_insert, args=(), daemon=True).start()

            # scale dataNode to 5
            mic.upgrade(release_name, {'spec.components.dataNode.replicas': 5},
                        constants.NAMESPACE)
            mic.wait_for_healthy(release_name, constants.NAMESPACE)
            wait_pods_ready(constants.NAMESPACE,
                            f"app.kubernetes.io/instance={release_name}")
            log.debug("Expand dataNode test finished")

            # create new collection and insert
            new_c_name = cf.gen_unique_str("scale_data")
            collection_w_new = ApiCollectionWrapper()
            collection_w_new.init_collection(
                name=new_c_name,
                schema=cf.gen_default_collection_schema(),
                shards_num=3)

            @counter
            def do_new_insert():
                """ do new insert """
                insert_res, is_succ = collection_w_new.insert(
                    cf.gen_default_dataframe_data(tmp_nb))
                log.debug(collection_w_new.num_entities)
                return insert_res, is_succ

            def loop_new_insert():
                """ loop new insert """
                while True:
                    do_new_insert()

            threading.Thread(target=loop_new_insert, args=(),
                             daemon=True).start()

            # scale dataNode to 3
            mic.upgrade(release_name, {'spec.components.dataNode.replicas': 3},
                        constants.NAMESPACE)
            mic.wait_for_healthy(release_name, constants.NAMESPACE)
            wait_pods_ready(constants.NAMESPACE,
                            f"app.kubernetes.io/instance={release_name}")

            log.debug(collection_w.num_entities)
            time.sleep(300)
            scale_common.check_succ_rate(do_insert)
            scale_common.check_succ_rate(do_new_insert)
            log.debug("Shrink dataNode test finished")

        except Exception as e:
            log.error(str(e))
            # raise Exception(str(e))

        finally:
            label = f"app.kubernetes.io/instance={release_name}"
            log.info('Start to export milvus pod logs')
            read_pod_log(namespace=constants.NAMESPACE,
                         label_selector=label,
                         release_name=release_name)

            mic.uninstall(release_name, namespace=constants.NAMESPACE)
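The @counter decorator used in this test is not shown; a plausible implementation that would let scale_common.check_succ_rate inspect success counters (names are illustrative, the real helper lives in the suite's utilities):

import functools

def counter(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # track total calls and how many reported success
        wrapper.total += 1
        res, is_succ = func(*args, **kwargs)
        if is_succ:
            wrapper.succ += 1
        return res, is_succ
    wrapper.total = 0
    wrapper.succ = 0
    return wrapper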