Beispiel #1
0
def main():
    sentences = word2vec.Text8Corpus("text8")  # 加载语料
    model = word2vec.Word2Vec(sentences, size=200, window=5,
                              min_count=5)  # 训练模型
    word_set = model.wv.index2word  # 单词集合
    word_vec = model.wv.vectors  # word2vec结果向量集合
    milvus = Milvus()
    milvus.connect(host='localhost', port='19530')
    param = {
        'collection_name': 'word2vec',
        'dimension': 200,
        'index_file_size': 1024,
        'metric_type': MetricType.L2
    }
    milvus.create_collection(param)

    status, ids = milvus.insert(collection_name='word2vec', records=word_vec)

    # 单词分类
    ivf_param = {'nlist': 100}  # 分成100类
    milvus.create_index('word2vec', IndexType.IVF_FLAT, ivf_param)  # 增加索引
    status, index = milvus.describe_index(
        'word2vec')  # 相当于将word分成100个类别 做了聚类算法

    # 查找相似度最高的单词
    res = milvus.search(collection_name='word2vec',
                        query_records=[list(word_vec[word_set.index('king')])],
                        top_k=10,
                        params={'nprobe': 16})
    for i in range(10):
        id = res[1][0][i].id
        print(word_set[ids.index(id)])

    print(1)
Beispiel #2
0
    def _add_collection(self, collection: str, milvus: mv.Milvus, order_num: int = 0, remark: str = '') -> None:
        """
        添加collection

        @param {str} collection - 问题分类
        @param {mv.Milvus} milvus - Milvus连接对象
        @param {int} order_num=0 - 顺序号, 越大越优先
        @param {str} remark='' - 备注信息
        """
        if collection in self.sorted_collection:
            # 无需再添加
            self._log_debug('collection is exists: %s' % collection)
            return

        # 添加Milvus服务中的分类
        _status, _exists = milvus.has_collection(collection)
        self.confirm_milvus_status(_status, 'has_collection')

        if not _exists:
            # 创建分类
            _param = {
                'collection_name': collection,
                'dimension': self.dimension,
                'index_file_size': self.index_file_size,
                'metric_type': self.metric_type,
            }
            self.confirm_milvus_status(
                milvus.create_collection(_param), 'create_collection'
            )

            self._log_debug('added Milvus collection [%s]' % collection)

            # 创建索引
            _index_param = {'nlist': self.nlist}
            self.confirm_milvus_status(
                milvus.create_index(collection, mv.IndexType.IVF_SQ8, _index_param),
                'create_index'
            )

            self._log_debug('added Milvus collection [%s] index' % collection)

        # 添加AnswerDB数据
        _order_num_match = (CollectionOrder.select()
                            .where(CollectionOrder.order_num == order_num)
                            .count())
        if _order_num_match > 0:
            # 如果顺序号有被占用的情况原来的记录大于当前序号的都要加1
            (CollectionOrder
             .update(order_num=CollectionOrder.order_num + 1)
             .where(CollectionOrder.order_num >= order_num)
             .execute())

        CollectionOrder.create(collection=collection, order_num=order_num, remark=remark)

        self._log_debug('insert collection [%s] to AnswerDB' % collection)

        # 将分类加入到内存排序队列中
        self.sorted_collection.append(collection)
Beispiel #3
0
    def test_not_connect(self):
        client = Milvus()

        with pytest.raises(NotConnectError):
            client.create_collection({})

        with pytest.raises(NotConnectError):
            client.has_collection("a")

        with pytest.raises(NotConnectError):
            client.describe_collection("a")

        with pytest.raises(NotConnectError):
            client.drop_collection("a")

        with pytest.raises(NotConnectError):
            client.create_index("a")

        with pytest.raises(NotConnectError):
            client.insert("a", [], None)

        with pytest.raises(NotConnectError):
            client.count_collection("a")

        with pytest.raises(NotConnectError):
            client.show_collections()

        with pytest.raises(NotConnectError):
            client.search("a", 1, 2, [], None)

        with pytest.raises(NotConnectError):
            client.search_in_files("a", [], [], 2, 1, None)

        with pytest.raises(NotConnectError):
            client._cmd("")

        with pytest.raises(NotConnectError):
            client.preload_collection("a")

        with pytest.raises(NotConnectError):
            client.describe_index("a")

        with pytest.raises(NotConnectError):
            client.drop_index("")
def create():
    _HOST = 'localhost'
    _PORT = '19530'
    _collection_name = 'chs_stars_faces_512'
    _DIM = 512  # dimension of vector
    _INDEX_FILE_SIZE = 256  # max file size of stored index

    milvus = Milvus(_HOST, _PORT)
    param = {
        'collection_name': _collection_name,
        'dimension': _DIM,
        'index_file_size': _INDEX_FILE_SIZE,  # optional
        'metric_type': MetricType.IP  # optional
    }


    milvus.create_collection(param)
    index_param = {
        'nlist': 2048  # 推荐 4 * sqrt(n)
    }

    status = milvus.create_index(_collection_name, IndexType.IVF_SQ8, index_param)

    # with open("chs_stars_features_pca.pickle", "rb") as f:
    #     pca = pickle.load(f)
    #
    # with open("../chs_stars_features_pca.csv", "w") as fw, open("../chs_stars_features.csv", "r") as fr:
    #     reader = csv.reader(fr)
    #     writer = csv.writer(fw)
    #     for index, line in enumerate(tqdm(reader)):
    #         star, fname, features = line
    #         features = np.array(json.loads(features))
    #         features = np.resize(features, (1, 512))
    #         features = normalize(features)
    #         features = pca.transform(features).squeeze()
    #         status, ids = milvus.insert(collection_name=_collection_name, records=[features.tolist()], ids=[index])
    #         if not status.OK():
    #             print(status)
    #             continue
    #         writer.writerow([index, star, fname, features])

    with open("../chs_stars_labels.csv", "w") as fw, open("../chs_stars_features.csv", "r") as fr:
        reader = csv.reader(fr)
        writer = csv.writer(fw)
        for index, line in enumerate(tqdm(reader)):
            star, fname, features = line
            # features = np.array(json.loads(features))
            # features = np.resize(features, (1, 512))
            #features = normalize(features)
            features = json.loads(features)
            status, ids = milvus.insert(collection_name=_collection_name, records=[features], ids=[index])
            if not status.OK():
                print(status)
                continue
            writer.writerow([index, star, fname])
Beispiel #5
0
def main():
    milvus = Milvus(uri=uri)
    param = {
        'collection_name': collection_name,
        'dimension': _DIM,
        'index_file_size': 32,
        #'metric_type': MetricType.IP
        'metric_type': MetricType.L2
    }
    # show collections in Milvus server
    _, collections = milvus.list_collections()

    # 创建 collection
    milvus.create_collection(param)
    # 创建 collection partion
    milvus.create_partition(collection_name, partition_tag)

    print(f'collections in Milvus: {collections}')
    # Describe demo_collection
    _, collection = milvus.get_collection_info(collection_name)
    print(f'descript demo_collection: {collection}')

    # build fake vectors
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(10)]
    vectors1 = [[random.random() for _ in range(_DIM)] for _ in range(10)]

    status, id = milvus.insert(collection_name=collection_name,
                               records=vectors,
                               ids=list(range(10)),
                               partition_tag=partition_tag)
    print(f'status: {status} | id: {id}')
    if not status.OK():
        print(f"insert failded: {status}")

    status1, id1 = milvus.insert(collection_name=collection_name,
                                 records=vectors1,
                                 ids=list(range(10, 20)),
                                 partition_tag=partition_tag)
    print(f'status1: {status1} | id1: {id1}')

    ids_deleted = list(range(10))

    status_delete = milvus.delete_entity_by_id(collection_name=collection_name,
                                               id_array=ids_deleted)
    if status_delete.OK():
        print(f'delete successful')

    # Flush collection insered data to disk
    milvus.flush([collection_name])
    # Get demo_collection row count
    status, result = milvus.count_entities(collection_name)
    print(f"demo_collection row count: {result}")

    # Obtain raw vectors by providing vector ids
    status, result_vectors = milvus.get_entity_by_id(collection_name,
                                                     list(range(10, 20)))

    # create index of vectors, search more repidly
    index_param = {'nlist': 2}

    # create ivflat index in demo_collection
    status = milvus.create_index(collection_name, IndexType.IVF_FLAT,
                                 index_param)
    if status.OK():
        print(f"create index ivf_flat succeeed")

    # use the top 10 vectors for similarity search
    query_vectors = vectors1[0:2]

    # execute vector similariy search
    search_param = {"nprobe": 16}

    param = {
        'collection_name': collection_name,
        'query_records': query_vectors,
        'top_k': 1,
        'params': search_param
    }

    status, results = milvus.search(**param)
    if status.OK():
        if results[0][0].distance == 0.0:
            print('query result is correct')
        else:
            print('not correct')
        print(results)
    else:
        print(f'search failed: {status}')

    # 清除已经存在的collection
    milvus.drop_collection(collection_name=collection_name)

    milvus.close()
Beispiel #6
0
class MilvusClient(object):
    def __init__(self,
                 collection_name=None,
                 host=None,
                 port=None,
                 timeout=180):
        self._collection_name = collection_name
        start_time = time.time()
        if not host:
            host = SERVER_HOST_DEFAULT
        if not port:
            port = SERVER_PORT_DEFAULT
        logger.debug(host)
        logger.debug(port)
        # retry connect remote server
        i = 0
        while time.time() < start_time + timeout:
            try:
                self._milvus = Milvus(host=host,
                                      port=port,
                                      try_connect=False,
                                      pre_ping=False)
                break
            except Exception as e:
                logger.error(str(e))
                logger.error("Milvus connect failed: %d times" % i)
                i = i + 1
                time.sleep(i)

        if time.time() > start_time + timeout:
            raise Exception("Server connect timeout")
        # self._metric_type = None

    def __str__(self):
        return 'Milvus collection %s' % self._collection_name

    def check_status(self, status):
        if not status.OK():
            logger.error(status.message)
            logger.error(self._milvus.server_status())
            logger.error(self.count())
            raise Exception("Status not ok")

    def check_result_ids(self, result):
        for index, item in enumerate(result):
            if item[0].distance >= epsilon:
                logger.error(index)
                logger.error(item[0].distance)
                raise Exception("Distance wrong")

    # only support the given field name
    def create_collection(self,
                          dimension,
                          data_type=DataType.FLOAT_VECTOR,
                          auto_id=False,
                          collection_name=None,
                          other_fields=None):
        self._dimension = dimension
        if not collection_name:
            collection_name = self._collection_name
        vec_field_name = utils.get_default_field_name(data_type)
        fields = [{
            "name": vec_field_name,
            "type": data_type,
            "params": {
                "dim": dimension
            }
        }]
        if other_fields:
            other_fields = other_fields.split(",")
            if "int" in other_fields:
                fields.append({
                    "name": utils.DEFAULT_INT_FIELD_NAME,
                    "type": DataType.INT64
                })
            if "float" in other_fields:
                fields.append({
                    "name": utils.DEFAULT_FLOAT_FIELD_NAME,
                    "type": DataType.FLOAT
                })
        create_param = {"fields": fields, "auto_id": auto_id}
        try:
            self._milvus.create_collection(collection_name, create_param)
            logger.info("Create collection: <%s> successfully" %
                        collection_name)
        except Exception as e:
            logger.error(str(e))
            raise

    def create_partition(self, tag, collection_name=None):
        if not collection_name:
            collection_name = self._collection_name
        self._milvus.create_partition(collection_name, tag)

    def generate_values(self, data_type, vectors, ids):
        values = None
        if data_type in [DataType.INT32, DataType.INT64]:
            values = ids
        elif data_type in [DataType.FLOAT, DataType.DOUBLE]:
            values = [(i + 0.0) for i in ids]
        elif data_type in [DataType.FLOAT_VECTOR, DataType.BINARY_VECTOR]:
            values = vectors
        return values

    def generate_entities(self, vectors, ids=None, collection_name=None):
        entities = []
        if collection_name is None:
            collection_name = self._collection_name
        info = self.get_info(collection_name)
        for field in info["fields"]:
            field_type = field["type"]
            entities.append({
                "name":
                field["name"],
                "type":
                field_type,
                "values":
                self.generate_values(field_type, vectors, ids)
            })
        return entities

    @time_wrapper
    def insert(self, entities, ids=None, collection_name=None):
        tmp_collection_name = self._collection_name if collection_name is None else collection_name
        try:
            insert_ids = self._milvus.insert(tmp_collection_name,
                                             entities,
                                             ids=ids)
            return insert_ids
        except Exception as e:
            logger.error(str(e))

    def get_dimension(self):
        info = self.get_info()
        for field in info["fields"]:
            if field["type"] in [
                    DataType.FLOAT_VECTOR, DataType.BINARY_VECTOR
            ]:
                return field["params"]["dim"]

    def get_rand_ids(self, length):
        segment_ids = []
        while True:
            stats = self.get_stats()
            segments = stats["partitions"][0]["segments"]
            # random choice one segment
            segment = random.choice(segments)
            try:
                segment_ids = self._milvus.list_id_in_segment(
                    self._collection_name, segment["id"])
            except Exception as e:
                logger.error(str(e))
            if not len(segment_ids):
                continue
            elif len(segment_ids) > length:
                return random.sample(segment_ids, length)
            else:
                logger.debug("Reset length: %d" % len(segment_ids))
                return segment_ids

    # def get_rand_ids_each_segment(self, length):
    #     res = []
    #     status, stats = self._milvus.get_collection_stats(self._collection_name)
    #     self.check_status(status)
    #     segments = stats["partitions"][0]["segments"]
    #     segments_num = len(segments)
    #     # random choice from each segment
    #     for segment in segments:
    #         status, segment_ids = self._milvus.list_id_in_segment(self._collection_name, segment["name"])
    #         self.check_status(status)
    #         res.extend(segment_ids[:length])
    #     return segments_num, res

    # def get_rand_entities(self, length):
    #     ids = self.get_rand_ids(length)
    #     status, get_res = self._milvus.get_entity_by_id(self._collection_name, ids)
    #     self.check_status(status)
    #     return ids, get_res

    def get(self):
        get_ids = random.randint(1, 1000000)
        self._milvus.get_entity_by_id(self._collection_name, [get_ids])

    @time_wrapper
    def get_entities(self, get_ids):
        get_res = self._milvus.get_entity_by_id(self._collection_name, get_ids)
        return get_res

    @time_wrapper
    def delete(self, ids, collection_name=None):
        tmp_collection_name = self._collection_name if collection_name is None else collection_name
        self._milvus.delete_entity_by_id(tmp_collection_name, ids)

    def delete_rand(self):
        delete_id_length = random.randint(1, 100)
        count_before = self.count()
        logger.debug("%s: length to delete: %d" %
                     (self._collection_name, delete_id_length))
        delete_ids = self.get_rand_ids(delete_id_length)
        self.delete(delete_ids)
        self.flush()
        logger.info("%s: count after delete: %d" %
                    (self._collection_name, self.count()))
        get_res = self._milvus.get_entity_by_id(self._collection_name,
                                                delete_ids)
        for item in get_res:
            assert not item
        # if count_before - len(delete_ids) < self.count():
        #     logger.error(delete_ids)
        #     raise Exception("Error occured")

    @time_wrapper
    def flush(self, _async=False, collection_name=None):
        tmp_collection_name = self._collection_name if collection_name is None else collection_name
        self._milvus.flush([tmp_collection_name], _async=_async)

    @time_wrapper
    def compact(self, collection_name=None):
        tmp_collection_name = self._collection_name if collection_name is None else collection_name
        status = self._milvus.compact(tmp_collection_name)
        self.check_status(status)

    @time_wrapper
    def create_index(self,
                     field_name,
                     index_type,
                     metric_type,
                     _async=False,
                     index_param=None):
        index_type = INDEX_MAP[index_type]
        metric_type = utils.metric_type_trans(metric_type)
        logger.info(
            "Building index start, collection_name: %s, index_type: %s, metric_type: %s"
            % (self._collection_name, index_type, metric_type))
        if index_param:
            logger.info(index_param)
        index_params = {
            "index_type": index_type,
            "metric_type": metric_type,
            "params": index_param
        }
        self._milvus.create_index(self._collection_name,
                                  field_name,
                                  index_params,
                                  _async=_async)

    # TODO: need to check
    def describe_index(self, field_name):
        # stats = self.get_stats()
        info = self._milvus.describe_index(self._collection_name, field_name)
        index_info = {"index_type": "flat", "index_param": None}
        for field in info["fields"]:
            for index in field['indexes']:
                if not index or "index_type" not in index:
                    continue
                else:
                    for k, v in INDEX_MAP.items():
                        if index['index_type'] == v:
                            index_info['index_type'] = k
                            index_info['index_param'] = index['params']
                            return index_info
        return index_info

    def drop_index(self, field_name):
        logger.info("Drop index: %s" % self._collection_name)
        return self._milvus.drop_index(self._collection_name, field_name)

    @time_wrapper
    def query(self, vector_query, filter_query=None, collection_name=None):
        tmp_collection_name = self._collection_name if collection_name is None else collection_name
        must_params = [vector_query]
        if filter_query:
            must_params.extend(filter_query)
        query = {"bool": {"must": must_params}}
        result = self._milvus.search(tmp_collection_name, query)
        return result

    @time_wrapper
    def load_and_query(self,
                       vector_query,
                       filter_query=None,
                       collection_name=None):
        tmp_collection_name = self._collection_name if collection_name is None else collection_name
        must_params = [vector_query]
        if filter_query:
            must_params.extend(filter_query)
        query = {"bool": {"must": must_params}}
        self.load_collection(tmp_collection_name)
        result = self._milvus.search(tmp_collection_name, query)
        return result

    def get_ids(self, result):
        idss = result._entities.ids
        ids = []
        len_idss = len(idss)
        len_r = len(result)
        top_k = len_idss // len_r
        for offset in range(0, len_idss, top_k):
            ids.append(idss[offset:min(offset + top_k, len_idss)])
        return ids

    def query_rand(self, nq_max=100):
        # for ivf search
        dimension = 128
        top_k = random.randint(1, 100)
        nq = random.randint(1, nq_max)
        nprobe = random.randint(1, 100)
        search_param = {"nprobe": nprobe}
        query_vectors = [[random.random() for _ in range(dimension)]
                         for _ in range(nq)]
        metric_type = random.choice(["l2", "ip"])
        logger.info("%s, Search nq: %d, top_k: %d, nprobe: %d" %
                    (self._collection_name, nq, top_k, nprobe))
        vec_field_name = utils.get_default_field_name()
        vector_query = {
            "vector": {
                vec_field_name: {
                    "topk": top_k,
                    "query": query_vectors,
                    "metric_type": utils.metric_type_trans(metric_type),
                    "params": search_param
                }
            }
        }
        self.query(vector_query)

    def load_query_rand(self, nq_max=100):
        # for ivf search
        dimension = 128
        top_k = random.randint(1, 100)
        nq = random.randint(1, nq_max)
        nprobe = random.randint(1, 100)
        search_param = {"nprobe": nprobe}
        query_vectors = [[random.random() for _ in range(dimension)]
                         for _ in range(nq)]
        metric_type = random.choice(["l2", "ip"])
        logger.info("%s, Search nq: %d, top_k: %d, nprobe: %d" %
                    (self._collection_name, nq, top_k, nprobe))
        vec_field_name = utils.get_default_field_name()
        vector_query = {
            "vector": {
                vec_field_name: {
                    "topk": top_k,
                    "query": query_vectors,
                    "metric_type": utils.metric_type_trans(metric_type),
                    "params": search_param
                }
            }
        }
        self.load_and_query(vector_query)

    # TODO: need to check
    def count(self, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        row_count = self._milvus.get_collection_stats(
            collection_name)["row_count"]
        logger.debug("Row count: %d in collection: <%s>" %
                     (row_count, collection_name))
        return row_count

    def drop(self, timeout=120, collection_name=None):
        timeout = int(timeout)
        if collection_name is None:
            collection_name = self._collection_name
        logger.info("Start delete collection: %s" % collection_name)
        self._milvus.drop_collection(collection_name)
        i = 0
        while i < timeout:
            try:
                row_count = self.count(collection_name=collection_name)
                if row_count:
                    time.sleep(1)
                    i = i + 1
                    continue
                else:
                    break
            except Exception as e:
                logger.debug(str(e))
                break
        if i >= timeout:
            logger.error("Delete collection timeout")

    def get_stats(self):
        return self._milvus.get_collection_stats(self._collection_name)

    def get_info(self, collection_name=None):
        # pdb.set_trace()
        if collection_name is None:
            collection_name = self._collection_name
        return self._milvus.get_collection_info(collection_name)

    def show_collections(self):
        return self._milvus.list_collections()

    def exists_collection(self, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        res = self._milvus.has_collection(collection_name)
        return res

    def clean_db(self):
        collection_names = self.show_collections()
        for name in collection_names:
            self.drop(collection_name=name)

    @time_wrapper
    def load_collection(self, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        return self._milvus.load_collection(collection_name, timeout=3000)

    @time_wrapper
    def release_collection(self, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        return self._milvus.release_collection(collection_name, timeout=3000)

    @time_wrapper
    def load_partitions(self, tag_names, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        return self._milvus.load_partitions(collection_name,
                                            tag_names,
                                            timeout=3000)

    @time_wrapper
    def release_partitions(self, tag_names, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        return self._milvus.release_partitions(collection_name,
                                               tag_names,
                                               timeout=3000)
Beispiel #7
0
# Basic create index:
#     Now that we have inserted all the films into Milvus, we are going to build index with these data.
#
#     While building index, we have to indicate which `field` to build index for, the `index_type`,
#     `metric_type` and params for the specific index type. In our case, we want to build a `IVF_FLAT`
#     index, so the specific params are "nlist". See pymilvus documentation
#     (https://milvus-io.github.io/milvus-sdk-python/pythondoc/v0.3.0/index.html) for `index_type` we
#     support and the params accordingly.
#
#     If there are already index for a collection and you call `create_index` with different params, the
#     older index will be replaced by new one.
# ------
client.create_index(collection_name, "embedding", {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {
        "nlist": 100
    }
})

# ------
# Basic create index:
#     We can get the detail of the index  by `get_collection_info`.
# ------
info = client.get_collection_info(collection_name)
pprint(info)

# ------
# Basic hybrid search entities:
#     If we want to use index, the specific index params need to be provided, in our case, the "params"
#     should be "nprobe", if no "params" given, Milvus will complain about it and raise a exception.
Beispiel #8
0
class MilvusHelper:
    def __init__(self):
        try:
            self.client = Milvus(host=MILVUS_HOST, port=MILVUS_PORT)
            LOGGER.debug(
                "Successfully connect to Milvus with IP:{} and PORT:{}".format(
                    MILVUS_HOST, MILVUS_PORT))
        except Exception as e:
            LOGGER.error("Failed to connect Milvus: {}".format(e))
            sys.exit(1)

    # Return if Milvus has the collection
    def has_collection(self, collection_name):
        try:
            status = self.client.has_collection(collection_name)[1]
            return status
        except Exception as e:
            LOGGER.error("Failed to load data to Milvus: {}".format(e))
            sys.exit(1)

    # Create milvus collection if not exists
    def create_colllection(self, collection_name):
        try:
            if not self.has_collection(collection_name):
                collection_param = {
                    'collection_name': collection_name,
                    'dimension': VECTOR_DIMENSION,
                    'index_file_size': INDEX_FILE_SIZE,
                    'metric_type': METRIC_TYPE
                }
                status = self.client.create_collection(collection_param)
                if status.code != 0:
                    raise Exception(status.message)
                LOGGER.debug(
                    "Create Milvus collection: {}".format(collection_name))
        except Exception as e:
            LOGGER.error("Failed to load data to Milvus: {}".format(e))
            sys.exit(1)

    # Batch insert vectors to milvus collection
    def insert(self, collection_name, vectors):
        try:
            self.create_colllection(collection_name)
            status, ids = self.client.insert(collection_name=collection_name,
                                             records=vectors)
            if not status.code:
                LOGGER.debug(
                    "Insert vectors to Milvus in collection: {} with {} rows".
                    format(collection_name, len(vectors)))
                return ids
            else:
                raise Exception(status.message)
        except Exception as e:
            LOGGER.error("Failed to load data to Milvus: {}".format(e))
            sys.exit(1)

    # Create IVF_FLAT index on milvus collection
    def create_index(self, collection_name):
        try:
            index_param = {'nlist': 16384}
            status = self.client.create_index(collection_name,
                                              IndexType.IVF_FLAT, index_param)
            if not status.code:
                LOGGER.debug(
                    "Successfully create index in collection:{} with param:{}".
                    format(collection_name, index_param))
                return status
            else:
                raise Exception(status.message)
        except Exception as e:
            LOGGER.error("Failed to create index: {}".format(e))
            sys.exit(1)

    # Delete Milvus collection
    def delete_collection(self, collection_name):
        try:
            status = self.client.drop_collection(
                collection_name=collection_name)
            if not status.code:
                LOGGER.debug(
                    "Successfully drop collection: {}".format(collection_name))
                return status
            else:
                raise Exception(status.message)
        except Exception as e:
            LOGGER.error("Failed to drop collection: {}".format(e))
            sys.exit(1)

    # Search vector in milvus collection
    def search_vectors(self, collection_name, vectors, top_k):
        try:
            search_param = {'nprobe': 16}
            status, result = self.client.search(
                collection_name=collection_name,
                query_records=vectors,
                top_k=top_k,
                params=search_param)
            if not status.code:
                LOGGER.debug("Successfully search in collection: {}".format(
                    collection_name))
                return result
            else:
                raise Exception(status.message)
        except Exception as e:
            LOGGER.error("Failed to search vectors in Milvus: {}".format(e))
            sys.exit(1)

    # Get the number of milvus collection
    def count(self, collection_name):
        try:
            status, num = self.client.count_entities(
                collection_name=collection_name)
            if not status.code:
                LOGGER.debug(
                    "Successfully get the num:{} of the collection:{}".format(
                        num, collection_name))
                return num
            else:
                raise Exception(status.message)
        except Exception as e:
            LOGGER.error("Failed to count vectors in Milvus: {}".format(e))
            sys.exit(1)
Beispiel #9
0
def main():
    # Specify server addr when create milvus client instance
    milvus = Milvus(_HOST, _PORT)

    # Create collection demo_collection if it dosen't exist.
    collection_name = 'example_async_collection_'

    status, ok = milvus.has_collection(collection_name)
    if not ok:
        param = {
            'collection_name': collection_name,
            'dimension': _DIM,
            'index_file_size': 128,  # optional
            'metric_type': MetricType.L2  # optional
        }

        status = milvus.create_collection(param)
        if not status.OK():
            print("Create collection failed: {}".format(status.message),
                  file=sys.stderr)
            print("exiting ...", file=sys.stderr)
            sys.exit(1)

    # Show collections in Milvus server
    _, collections = milvus.list_collections()

    # Describe demo_collection
    _, collection = milvus.get_collection_info(collection_name)
    print(collection)

    # 10000 vectors with 16 dimension
    # element per dimension is float32 type
    # vectors should be a 2-D array
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(100000)]

    # You can also use numpy to generate random vectors:
    #     `vectors = np.random.rand(10000, 16).astype(np.float32)`

    def _insert_callback(status, ids):
        if status.OK():
            print("Insert successfully")
        else:
            print("Insert failed.", status.message)

    # Insert vectors into demo_collection, adding callback function
    insert_future = milvus.insert(collection_name=collection_name,
                                  records=vectors,
                                  _async=True,
                                  _callback=_insert_callback)
    # Or invoke result() to get results:
    #   insert_future = milvus.insert(collection_name=collection_name, records=vectors, _async=True)
    #   status, ids = insert_future.result()
    insert_future.done()

    # Flush collection  inserted data to disk.
    def _flush_callback(status):
        if status.OK():
            print("Flush successfully")
        else:
            print("Flush failed.", status.message)

    flush_future = milvus.flush([collection_name],
                                _async=True,
                                _callback=_flush_callback)
    # Or invoke result() to get results:
    #   flush_future = milvus.flush([collection_name], _async=True)
    #   status = flush_future.result()
    flush_future.done()

    def _compact_callback(status):
        if status.OK():
            print("Compact successfully")
        else:
            print("Compact failed.", status.message)

    compact_furure = milvus.compact(collection_name,
                                    _async=True,
                                    _cakkback=_compact_callback)
    # Or invoke result() to get results:
    #   compact_future = milvus.compact(collection_name, _async=True)
    #   status = compact_future.result()
    compact_furure.done()

    # Get demo_collection row count
    status, result = milvus.count_entities(collection_name)

    # present collection info
    _, info = milvus.get_collection_stats(collection_name)
    print(info)

    # create index of vectors, search more rapidly
    index_param = {'nlist': 2048}

    def _index_callback(status):
        if status.OK():
            print("Create index successfully")
        else:
            print("Create index failed.", status.message)

    # Create ivflat index in demo_collection
    # You can search vectors without creating index. however, Creating index help to
    # search faster
    print("Creating index: {}".format(index_param))
    index_future = milvus.create_index(collection_name,
                                       IndexType.IVF_FLAT,
                                       index_param,
                                       _async=True,
                                       _callback=_index_callback)
    # Or invoke result() to get results:
    #   index_future = milvus.create_index(collection_name, IndexType.IVF_FLAT, index_param, _async=True)
    #   status = index_future.result()
    index_future.done()

    # describe index, get information of index
    status, index = milvus.get_index_info(collection_name)
    print(index)

    # Use the top 10 vectors for similarity search
    query_vectors = vectors[0:10]

    # execute vector similarity search
    search_param = {"nprobe": 16}

    print("Searching ... ")

    def _search_callback(status, results):
        # if status.OK():
        #     print("Search successfully")
        # else:
        #     print("Search failed.", status.message)
        if status.OK():
            # indicate search result
            # also use by:
            #   `results.distance_array[0][0] == 0.0 or results.id_array[0][0] == ids[0]`
            if results[0][0].distance == 0.0:  # or results[0][0].id == ids[0]:
                print('Query result is correct')
            else:
                print('Query result isn\'t correct')

            # print results
            print(results)
        else:
            print("Search failed. ", status)

    param = {
        'collection_name': collection_name,
        'query_records': query_vectors,
        'top_k': 1,
        'params': search_param,
        "_async": True,
        "_callback": _search_callback
    }
    search_future = milvus.search(**param)
    # Or invoke result() to get results:
    #
    #   param = {
    #       'collection_name': collection_name,
    #       'query_records': query_vectors,
    #       'top_k': 1,
    #       'params': search_param,
    #       "_async": True,
    #   }
    #   search_future = milvus.search(param)
    #   status, results = index_future.result()

    search_future.done()

    # Delete demo_collection
    status = milvus.drop_collection(collection_name)
Beispiel #10
0
class MilvusClient(object):
    def __init__(self, table_name=None, ip=None, port=None):
        self._milvus = Milvus()
        self._table_name = table_name
        try:
            if not ip:
                self._milvus.connect(
                    host = SERVER_HOST_DEFAULT,
                    port = SERVER_PORT_DEFAULT)
            else:
                self._milvus.connect(
                    host = ip,
                    port = port)
        except Exception as e:
            raise e

    def __str__(self):
        return 'Milvus table %s' % self._table_name

    def check_status(self, status):
        if not status.OK():
            logger.error(status.message)
            raise Exception("Status not ok")

    def create_table(self, table_name, dimension, index_file_size, metric_type):
        if not self._table_name:
            self._table_name = table_name
        if metric_type == "l2":
            metric_type = MetricType.L2
        elif metric_type == "ip":
            metric_type = MetricType.IP
        else:
            logger.error("Not supported metric_type: %s" % metric_type)
        create_param = {'table_name': table_name,
                 'dimension': dimension,
                 'index_file_size': index_file_size, 
                 "metric_type": metric_type}
        status = self._milvus.create_table(create_param)
        self.check_status(status)

    @time_wrapper
    def insert(self, X, ids=None):
        status, result = self._milvus.add_vectors(self._table_name, X, ids)
        self.check_status(status)
        return status, result

    @time_wrapper
    def create_index(self, index_type, nlist):
        if index_type == "flat":
            index_type = IndexType.FLAT
        elif index_type == "ivf_flat":
            index_type = IndexType.IVFLAT
        elif index_type == "ivf_sq8":
            index_type = IndexType.IVF_SQ8
        elif index_type == "nsg":
            index_type = IndexType.NSG
        elif index_type == "ivf_sq8h":
            index_type = IndexType.IVF_SQ8H
        elif index_type == "ivf_pq":
            index_type = IndexType.IVF_PQ
        index_params = {
            "index_type": index_type,
            "nlist": nlist,
        }
        logger.info("Building index start, table_name: %s, index_params: %s" % (self._table_name, json.dumps(index_params)))
        status = self._milvus.create_index(self._table_name, index=index_params)
        self.check_status(status)

    def describe_index(self):
        return self._milvus.describe_index(self._table_name)

    def drop_index(self):
        logger.info("Drop index: %s" % self._table_name)
        return self._milvus.drop_index(self._table_name)

    @time_wrapper
    def query(self, X, top_k, nprobe):
        status, result = self._milvus.search_vectors(self._table_name, top_k, nprobe, X)
        self.check_status(status)
        return status, result

    def count(self):
        return self._milvus.get_table_row_count(self._table_name)[1]

    def delete(self, timeout=60):
        logger.info("Start delete table: %s" % self._table_name)
        self._milvus.delete_table(self._table_name)
        i = 0
        while i < timeout:
            if self.count():
                time.sleep(1)
                i = i + 1
                continue
            else:
                break
        if i < timeout:
            logger.error("Delete table timeout")

    def describe(self):
        return self._milvus.describe_table(self._table_name)

    def show_tables(self):
        return self._milvus.show_tables()

    def exists_table(self):
        status, res = self._milvus.has_table(self._table_name)
        self.check_status(status)
        return res

    @time_wrapper
    def preload_table(self):
        return self._milvus.preload_table(self._table_name, timeout=3000)
Beispiel #11
0
milvus = Milvus(host='localhost', port='19530')

dim = 512
col_name = 'faces'
param = {
    'collection_name': col_name,
    'dimension': dim,
    'index_file_size': 1024,
    'metric_type': MetricType.L2
}
milvus.create_collection(param=param)

ivf_param = {'nlist': 16384}
milvus.create_index(collection_name=col_name,
                    index_type=IndexType.IVF_FLAT,
                    params=ivf_param)

vectors = [[random.random() for _ in range(dim)] for _ in range(2000)]
vector_ids = list(range(2000))
_, ids = milvus.insert(collection_name=col_name,
                       records=vectors,
                       ids=vector_ids)
# print(ids)

time.sleep(1)
search_param = {'nprobe': 16}
q_records = [[random.random() for _ in range(dim)] for _ in range(5)]
_, result = milvus.search(collection_name=col_name,
                          query_records=q_records,
                          top_k=2,
Beispiel #12
0
class Indexer:
    '''
    索引器。
    '''
    def __init__(self, name, host='127.0.0.1', port='19531'):
        '''
        初始化。
        '''
        self.client = Milvus(host=host, port=port)
        self.collection = name

    def init(self, lenient=False):
        '''
        创建集合。
        '''
        if lenient:
            status, result = self.client.has_collection(
                collection_name=self.collection)
            if status.code != 0:
                raise ExertMilvusException(status)
            if result:
                return

        status = self.client.create_collection({
            'collection_name': self.collection,
            'dimension': 512,
            'index_file_size': 1024,
            'metric_type': MetricType.L2
        })
        if status.code != 0 and not (lenient and status.code == 9):
            raise ExertMilvusException(status)

        # 创建索引。
        status = self.client.create_index(collection_name=self.collection,
                                          index_type=IndexType.IVF_FLAT,
                                          params={'nlist': 16384})
        if status.code != 0:
            raise ExertMilvusException(status)

        return status

    def drop(self):
        '''
        删除集合。
        '''
        status = self.client.drop_collection(collection_name=self.collection)
        if status.code != 0:
            raise ExertMilvusException(status)

    def flush(self):
        '''
        写入到硬盘。
        '''
        status = self.client.flush([self.collection])
        if status.code != 0:
            raise ExertMilvusException(status)

    def compact(self):
        '''
        压缩集合。
        '''
        status = self.client.compact(collection_name=self.collection)
        if status.code != 0:
            raise ExertMilvusException(status)

    def close(self):
        '''
        关闭链接。
        '''
        self.client.close()

    def new_tag(self, tag):
        '''
        建分块标签。
        '''
        status = self.client.create_partition(collection_name=self.collection,
                                              partition_tag=tag)
        if status.code != 0:
            raise ExertMilvusException(status)

    def list_tag(self):
        '''
        列举分块标签。
        '''
        status, result = self.client.list_partitions(
            collection_name=self.collection)
        if status.code != 0:
            raise ExertMilvusException(status)
        return result

    def drop_tag(self, tag):
        '''
        删除分块标签。
        '''
        status = self.client.drop_partition(collection_name=self.collection,
                                            partition_tag=tag)
        if status.code != 0:
            raise ExertMilvusException(status)

    def index(self, vectors, tag=None, ids=None):
        '''
        添加索引
        '''
        params = {}
        if tag != None:
            params['tag'] = tag
        if ids != None:
            params['ids'] = ids
        status, result = self.client.insert(collection_name=self.collection,
                                            records=vectors,
                                            **params)
        if status.code != 0:
            raise ExertMilvusException(status)

        return result

    def listing(self, ids):
        '''
        列举信息。
        '''
        status, result = self.client.get_entity_by_id(
            collection_name=self.collection, ids=ids)
        if status.code != 0:
            raise ExertMilvusException(status)
        return result

    def counting(self):
        '''
        计算索引数。
        '''
        status, result = self.client.count_entities(
            collection_name=self.collection)
        if status.code != 0:
            raise ExertMilvusException(status)
        return result

    def unindex(self, ids):
        '''
        去掉索引。
        '''
        status = self.client.delete_entity_by_id(
            collection_name=self.collection, id_array=ids)
        if status.code != 0:
            raise ExertMilvusException(status)

    def search(self, vectors, top_count=100, tags=None):
        '''
        搜索。
        '''
        params = {'params': {'nprobe': 16}}
        if tags != None:
            params['partition_tags'] = tags
        status, results = self.client.search(collection_name=self.collection,
                                             query_records=vectors,
                                             top_k=top_count,
                                             **params)
        if status.code != 0:
            raise ExertMilvusException(status)
        return results
Beispiel #13
0
class MilvusClient(object):
    def __init__(self, collection_name=None, host=None, port=None, timeout=60):
        """
        Milvus client wrapper for python-sdk.

        Default timeout set 60s
        """
        self._collection_name = collection_name
        try:
            start_time = time.time()
            if not host:
                host = SERVER_HOST_DEFAULT
            if not port:
                port = SERVER_PORT_DEFAULT
            logger.debug(host)
            logger.debug(port)
            # retry connect for remote server
            i = 0
            while time.time() < start_time + timeout:
                try:
                    self._milvus = Milvus(host=host, port=port, try_connect=False, pre_ping=False)
                    if self._milvus.server_status():
                        logger.debug("Try connect times: %d, %s" % (i, round(time.time() - start_time, 2)))
                        break
                except Exception as e:
                    logger.debug("Milvus connect failed: %d times" % i)
                    i = i + 1

            if time.time() > start_time + timeout:
                raise Exception("Server connect timeout")

        except Exception as e:
            raise e
        self._metric_type = None
        if self._collection_name and self.exists_collection():
            self._metric_type = metric_type_to_str(self.describe()[1].metric_type)
            self._dimension = self.describe()[1].dimension

    def __str__(self):
        return 'Milvus collection %s' % self._collection_name

    def set_collection(self, name):
        self._collection_name = name

    def check_status(self, status):
        if not status.OK():
            logger.error(self._collection_name)
            logger.error(status.message)
            logger.error(self._milvus.server_status())
            logger.error(self.count())
            raise Exception("Status not ok")

    def check_result_ids(self, result):
        for index, item in enumerate(result):
            if item[0].distance >= epsilon:
                logger.error(index)
                logger.error(item[0].distance)
                raise Exception("Distance wrong")

    def create_collection(self, collection_name, dimension, index_file_size, metric_type):
        if not self._collection_name:
            self._collection_name = collection_name
        if metric_type not in METRIC_MAP.keys():
            raise Exception("Not supported metric_type: %s" % metric_type)
        metric_type = METRIC_MAP[metric_type]
        create_param = {'collection_name': collection_name,
                 'dimension': dimension,
                 'index_file_size': index_file_size, 
                 "metric_type": metric_type}
        status = self._milvus.create_collection(create_param)
        self.check_status(status)

    def create_partition(self, tag_name):
        status = self._milvus.create_partition(self._collection_name, tag_name)
        self.check_status(status)

    def drop_partition(self, tag_name):
        status = self._milvus.drop_partition(self._collection_name, tag_name)
        self.check_status(status)

    def list_partitions(self):
        status, tags = self._milvus.list_partitions(self._collection_name)
        self.check_status(status)
        return tags

    @time_wrapper
    def insert(self, X, ids=None, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        status, result = self._milvus.insert(collection_name, X, ids)
        self.check_status(status)
        return status, result

    def insert_rand(self):
        insert_xb = random.randint(1, 100)
        X = [[random.random() for _ in range(self._dimension)] for _ in range(insert_xb)]
        X = utils.normalize(self._metric_type, X)
        count_before = self.count()
        status, _ = self.insert(X)
        self.check_status(status)
        self.flush()
        if count_before + insert_xb != self.count():
            raise Exception("Assert failed after inserting")

    def get_rand_ids(self, length):
        while True:
            status, stats = self._milvus.get_collection_stats(self._collection_name)
            self.check_status(status)
            segments = stats["partitions"][0]["segments"]
            # random choice one segment
            segment = random.choice(segments)
            status, segment_ids = self._milvus.list_id_in_segment(self._collection_name, segment["name"])
            if not status.OK():
                logger.error(status.message)
                continue
            if len(segment_ids):
                break
        if length >= len(segment_ids):
            logger.debug("Reset length: %d" % len(segment_ids))
            return segment_ids
        return random.sample(segment_ids, length)

    def get_rand_ids_each_segment(self, length):
        res = []
        status, stats = self._milvus.get_collection_stats(self._collection_name)
        self.check_status(status)
        segments = stats["partitions"][0]["segments"]
        segments_num = len(segments)
        # random choice from each segment
        for segment in segments:
            status, segment_ids = self._milvus.list_id_in_segment(self._collection_name, segment["name"])
            self.check_status(status)
            res.extend(segment_ids[:length])
        return segments_num, res

    def get_rand_entities(self, length):
        ids = self.get_rand_ids(length)
        status, get_res = self._milvus.get_entity_by_id(self._collection_name, ids)
        self.check_status(status)
        return ids, get_res

    @time_wrapper
    def get_entities(self, get_ids):
        status, get_res = self._milvus.get_entity_by_id(self._collection_name, get_ids)
        self.check_status(status)
        return get_res

    @time_wrapper
    def delete(self, ids, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        status = self._milvus.delete_entity_by_id(collection_name, ids)
        self.check_status(status)

    def delete_rand(self):
        delete_id_length = random.randint(1, 100)
        count_before = self.count()
        logger.info("%s: length to delete: %d" % (self._collection_name, delete_id_length))
        delete_ids = self.get_rand_ids(delete_id_length)
        self.delete(delete_ids)
        self.flush()
        logger.info("%s: count after delete: %d" % (self._collection_name, self.count()))
        status, get_res = self._milvus.get_entity_by_id(self._collection_name, delete_ids)
        self.check_status(status)
        for item in get_res:
            if item:
                raise Exception("Assert failed after delete")
        if count_before - len(delete_ids) != self.count():
            raise Exception("Assert failed after delete")

    @time_wrapper
    def flush(self, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        status = self._milvus.flush([collection_name])
        self.check_status(status)

    @time_wrapper
    def compact(self, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        status = self._milvus.compact(collection_name)
        self.check_status(status)

    @time_wrapper
    def create_index(self, index_type, index_param=None):
        index_type = INDEX_MAP[index_type]
        logger.info("Building index start, collection_name: %s, index_type: %s" % (self._collection_name, index_type))
        if index_param:
            logger.info(index_param)
        status = self._milvus.create_index(self._collection_name, index_type, index_param)
        self.check_status(status)

    def describe_index(self):
        status, result = self._milvus.get_index_info(self._collection_name)
        self.check_status(status)
        index_type = None
        for k, v in INDEX_MAP.items():
            if result._index_type == v:
                index_type = k
                break
        return {"index_type": index_type, "index_param": result._params}

    def drop_index(self):
        logger.info("Drop index: %s" % self._collection_name)
        return self._milvus.drop_index(self._collection_name)

    def query(self, X, top_k, search_param=None, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        status, result = self._milvus.search(collection_name, top_k, query_records=X, params=search_param)
        self.check_status(status)
        return result

    def query_rand(self):
        top_k = random.randint(1, 100)
        nq = random.randint(1, 100)
        nprobe = random.randint(1, 100)
        search_param = {"nprobe": nprobe}
        _, X = self.get_rand_entities(nq)
        logger.info("%s, Search nq: %d, top_k: %d, nprobe: %d" % (self._collection_name, nq, top_k, nprobe))
        status, _ = self._milvus.search(self._collection_name, top_k, query_records=X, params=search_param)
        self.check_status(status)
        # for i, item in enumerate(search_res):
        #     if item[0].id != ids[i]:
        #         logger.warning("The index of search result: %d" % i)
        #         raise Exception("Query failed")

    # @time_wrapper
    # def query_ids(self, top_k, ids, search_param=None):
    #     status, result = self._milvus.search_by_id(self._collection_name, ids, top_k, params=search_param)
    #     self.check_result_ids(result)
    #     return result

    def count(self, name=None):
        if name is None:
            name = self._collection_name
        logger.debug(self._milvus.count_entities(name))
        row_count = self._milvus.count_entities(name)[1]
        if not row_count:
            row_count = 0
        logger.debug("Row count: %d in collection: <%s>" % (row_count, name))
        return row_count

    def drop(self, timeout=120, name=None):
        timeout = int(timeout)
        if name is None:
            name = self._collection_name
        logger.info("Start delete collection: %s" % name)
        status = self._milvus.drop_collection(name)
        self.check_status(status)
        i = 0
        while i < timeout:
            if self.count(name=name):
                time.sleep(1)
                i = i + 1
                continue
            else:
                break
        if i >= timeout:
            logger.error("Delete collection timeout")

    def describe(self):
        # logger.info(self._milvus.get_collection_info(self._collection_name))
        return self._milvus.get_collection_info(self._collection_name)

    def show_collections(self):
        return self._milvus.list_collections()

    def exists_collection(self, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        _, res = self._milvus.has_collection(collection_name)
        # self.check_status(status)
        return res

    def clean_db(self):
        collection_names = self.show_collections()[1]
        for name in collection_names:
            logger.debug(name)
            self.drop(name=name)

    @time_wrapper
    def preload_collection(self):
        status = self._milvus.load_collection(self._collection_name, timeout=3000)
        self.check_status(status)
        return status

    def get_server_version(self):
        _, res = self._milvus.server_version()
        return res

    def get_server_mode(self):
        return self.cmd("mode")

    def get_server_commit(self):
        return self.cmd("build_commit_id")

    def get_server_config(self):
        return json.loads(self.cmd("get_config *"))

    def get_mem_info(self):
        result = json.loads(self.cmd("get_system_info"))
        result_human = {
            # unit: Gb
            "memory_used": round(int(result["memory_used"]) / (1024*1024*1024), 2)
        }
        return result_human

    def cmd(self, command):
        status, res = self._milvus._cmd(command)
        logger.info("Server command: %s, result: %s" % (command, res))
        self.check_status(status)
        return res
Beispiel #14
0
class MilvusClient:
    def __init__(self, host, port, collection_name, vector_dim):
        self.client = Milvus(host, port)
        self.collection_name_prefix = collection_name
        self.vector_dim = vector_dim

    def save_vectors(self, vectors, botid):
        collection_name = self.decide_collection_name(botid)
        self.create_collection_if_need(collection_name)
        status, ids = self.client.insert(collection_name=collection_name,
                                         records=vectors)
        self.check_status(status)
        LogService.info('%d vectors saved into collection %s', len(vectors),
                        collection_name)
        return ids

    def search_vectors(self, vectors, botid):
        search_param = {'nprobe': 16}
        collection_name = self.decide_collection_name(botid)
        LogService.info('search vector in collection %s', collection_name)
        status, results = self.client.search(collection_name=collection_name,
                                             query_records=vectors,
                                             top_k=1,
                                             params=search_param)
        self.check_status(status)
        if len(results) > 0:
            vid = results[0][0].id
            LogService.info('vector found with id %d in collection %s', vid,
                            collection_name)
            return vid
        else:
            LogService.info('vector not found in collection %s',
                            collection_name)
            return None

    def empty_vectors(self, botid):
        collection_name = self.decide_collection_name(botid)
        status = self.client.drop_collection(collection_name)
        self.check_status(status)
        LogService.info('collection %s dropped', collection_name)

    def create_collection_if_need(self, collection_name):
        LogService.info('prepare collection %s', collection_name)
        status, exists = self.client.has_collection(collection_name)
        self.check_status(status)
        if not exists:
            LogService.info('collection %s not exists, create it.',
                            collection_name)
            create_param = {
                'collection_name': collection_name,
                'dimension': self.vector_dim,
                'index_file_size': 1024,
                'metric_type': MetricType.L2
            }
            status = self.client.create_collection(create_param)
            self.check_status(status)
            LogService.info('collection %s created', collection_name)

            status = self.client.create_index(collection_name,
                                              IndexType.IVF_FLAT,
                                              {'nlist': 16384})
            self.check_status(status)
            LogService.info('index for collection %s created', collection_name)
        else:
            LogService.info('collection %s already exists', collection_name)

    def drop_all_collections(self):
        LogService.info('drop all collections')
        status, collection_names = self.client.show_collections()
        self.check_status(status)
        if len(collection_names) > 0:
            for collection_name in collection_names:
                status = self.client.drop_collection(collection_name)
                self.check_status(status)
                LogService.info('%s dropped', collection_name)

    @classmethod
    def check_status(cls, status):
        if status.code != 0:
            raise Exception(status.message)

    def decide_collection_name(self, botid):
        return '%s%s' % (self.collection_name_prefix, botid)
Beispiel #15
0
def main():
    # Specify server addr when create milvus client instance
    # milvus client instance maintain a connection pool, param
    # `pool_size` specify the max connection num.
    milvus = Milvus(_HOST, _PORT)

    # Create collection demo_collection if it dosen't exist.
    collection_name = 'example_collection'

    ok = milvus.has_collection(collection_name)
    field_name = 'example_field'
    if not ok:
        fields = {
            "fields": [{
                "name": field_name,
                "type": DataType.FLOAT_VECTOR,
                "metric_type": "L2",
                "params": {
                    "dim": _DIM
                },
                "indexes": [{
                    "metric_type": "L2"
                }]
            }]
        }

        milvus.create_collection(collection_name=collection_name,
                                 fields=fields)
    else:
        milvus.drop_collection(collection_name=collection_name)

    # Show collections in Milvus server
    collections = milvus.list_collections()
    print(collections)

    # Describe demo_collection
    stats = milvus.get_collection_stats(collection_name)
    print(stats)

    # 10000 vectors with 128 dimension
    # element per dimension is float32 type
    # vectors should be a 2-D array
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(10)]
    print(vectors)
    # You can also use numpy to generate random vectors:
    #   vectors = np.random.rand(10000, _DIM).astype(np.float32)

    # Insert vectors into demo_collection, return status and vectors id list
    entities = [{
        "name": field_name,
        "type": DataType.FLOAT_VECTOR,
        "values": vectors
    }]

    res_ids = milvus.insert(collection_name=collection_name, entities=entities)
    print("ids:", res_ids)

    # Flush collection  inserted data to disk.
    milvus.flush([collection_name])

    # present collection statistics info
    stats = milvus.get_collection_stats(collection_name)
    print(stats)

    # create index of vectors, search more rapidly
    index_param = {
        "metric_type": "L2",
        "index_type": "IVF_FLAT",
        "params": {
            "nlist": 1024
        }
    }

    # Create ivflat index in demo_collection
    # You can search vectors without creating index. however, Creating index help to
    # search faster
    print("Creating index: {}".format(index_param))
    status = milvus.create_index(collection_name, field_name, index_param)

    # execute vector similarity search

    print("Searching ... ")

    dsl = {
        "bool": {
            "must": [{
                "vector": {
                    field_name: {
                        "metric_type": "L2",
                        "query": vectors,
                        "topk": 10,
                        "params": {
                            "nprobe": 16
                        }
                    }
                }
            }]
        }
    }

    milvus.load_collection(collection_name)
    results = milvus.search(collection_name, dsl)
    # indicate search result
    # also use by:
    #   `results.distance_array[0][0] == 0.0 or results.id_array[0][0] == ids[0]`
    if results[0][0].distance == 0.0 or results[0][0].id == ids[0]:
        print('Query result is correct')
    else:
        print('Query result isn\'t correct')

    milvus.drop_index(collection_name, field_name)
    milvus.release_collection(collection_name)

    # Delete demo_collection
    status = milvus.drop_collection(collection_name)
Beispiel #16
0
class MilvusDocumentStore(SQLDocumentStore):
    """
    Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized on storing and processing vectors.
    Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR).
    In contrast to FAISS, Milvus ...
     - runs as a separate service (e.g. a Docker container) and can scale easily in a distributed environment
     - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index)
     - encapsulates multiple ANN libraries (FAISS, ANNOY ...)

    This class uses Milvus for all vector related storage, processing and querying.
    The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus
    does not allow these data types (yet).

    Usage:
    1. Start a Milvus server (see https://milvus.io/docs/v1.0.0/install_milvus.md)
    2. Init a MilvusDocumentStore in Haystack
    """
    def __init__(
        self,
        sql_url: str = "sqlite:///",
        milvus_url: str = "tcp://localhost:19530",
        connection_pool: str = "SingletonThread",
        index: str = "document",
        vector_dim: int = 768,
        index_file_size: int = 1024,
        similarity: str = "dot_product",
        index_type: IndexType = IndexType.FLAT,
        index_param: Optional[Dict[str, Any]] = None,
        search_param: Optional[Dict[str, Any]] = None,
        update_existing_documents: bool = False,
        return_embedding: bool = False,
        embedding_field: str = "embedding",
        progress_bar: bool = True,
        **kwargs,
    ):
        """
        :param sql_url: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale
                        deployment, Postgres is recommended. If using MySQL then same server can also be used for
                        Milvus metadata. For more details see https://milvus.io/docs/v1.0.0/data_manage.md.
        :param milvus_url: Milvus server connection URL for storing and processing vectors.
                           Protocol, host and port will automatically be inferred from the URL.
                           See https://milvus.io/docs/v1.0.0/install_milvus.md for instructions to start a Milvus instance.
        :param connection_pool: Connection pool type to connect with Milvus server. Default: "SingletonThread".
        :param index: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name").
        :param vector_dim: The embedding vector size. Default: 768.
        :param index_file_size: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB.
         When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment.
         Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one.
         As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048.
         Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory.
         (From https://milvus.io/docs/v1.0.0/performance_faq.md#How-can-I-get-the-best-performance-from-Milvus-through-setting-index_file_size)
        :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings.
                           'cosine' is recommended for Sentence Transformers, but is not directly supported by Milvus.
                           However, you can normalize your embeddings and use `dot_product` to get the same results.
                           See https://milvus.io/docs/v1.0.0/metric.md?Inner-product-(IP)#floating.
        :param index_type: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy.
                           Some popular options:
                           - FLAT (default): Exact method, slow
                           - IVF_FLAT, inverted file based heuristic, fast
                           - HSNW: Graph based, fast
                           - ANNOY: Tree based, fast
                           See: https://milvus.io/docs/v1.0.0/index.md
        :param index_param: Configuration parameters for the chose index_type needed at indexing time.
                            For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT.
                            See https://milvus.io/docs/v1.0.0/index.md
        :param search_param: Configuration parameters for the chose index_type needed at query time
                             For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT.
                             See https://milvus.io/docs/v1.0.0/index.md
        :param update_existing_documents: Whether to update any existing documents with the same ID when adding
                                          documents. When set as True, any document with an existing ID gets updated.
                                          If set to False, an error is raised if the document ID of the document being
                                          added already exists.
        :param return_embedding: To return document embedding.
        :param embedding_field: Name of field containing an embedding vector.
        :param progress_bar: Whether to show a tqdm progress bar or not.
                             Can be helpful to disable in production deployments to keep the logs clean.
        """
        self.milvus_server = Milvus(uri=milvus_url, pool=connection_pool)
        self.vector_dim = vector_dim
        self.index_file_size = index_file_size

        if similarity == "dot_product":
            self.metric_type = MetricType.IP
            self.similarity = similarity
        else:
            raise ValueError(
                "The Milvus document store can currently only support dot_product similarity. "
                "Please set similarity=\"dot_product\"")

        self.index_type = index_type
        self.index_param = index_param or {"nlist": 16384}
        self.search_param = search_param or {"nprobe": 10}
        self.index = index
        self._create_collection_and_index_if_not_exist(self.index)
        self.return_embedding = return_embedding
        self.embedding_field = embedding_field
        self.progress_bar = progress_bar

        super().__init__(url=sql_url,
                         update_existing_documents=update_existing_documents,
                         index=index)

    def __del__(self):
        return self.milvus_server.close()

    def _create_collection_and_index_if_not_exist(
            self,
            index: Optional[str] = None,
            index_param: Optional[Dict[str, Any]] = None):
        index = index or self.index
        index_param = index_param or self.index_param

        status, ok = self.milvus_server.has_collection(collection_name=index)
        if not ok:
            collection_param = {
                'collection_name': index,
                'dimension': self.vector_dim,
                'index_file_size': self.index_file_size,
                'metric_type': self.metric_type
            }

            status = self.milvus_server.create_collection(collection_param)
            if status.code != Status.SUCCESS:
                raise RuntimeError(
                    f'Collection creation on Milvus server failed: {status}')

            status = self.milvus_server.create_index(index, self.index_type,
                                                     index_param)
            if status.code != Status.SUCCESS:
                raise RuntimeError(
                    f'Index creation on Milvus server failed: {status}')

    def _create_document_field_map(self) -> Dict:
        return {
            self.index: self.embedding_field,
        }

    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None,
                        batch_size: int = 10_000):
        """
        Add new documents to the DocumentStore.

        :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
                                  them right away in Milvus. If not, you can later call update_embeddings() to create & index them.
        :param index: (SQL) index name for storing the docs and metadata
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return:
        """
        index = index or self.index
        self._create_collection_and_index_if_not_exist(index)
        field_map = self._create_document_field_map()

        if len(documents) == 0:
            logger.warning(
                "Calling DocumentStore.write_documents() with empty list")
            return

        document_objects = [
            Document.from_dict(d, field_map=field_map)
            if isinstance(d, dict) else d for d in documents
        ]

        add_vectors = False if document_objects[0].embedding is None else True

        batched_documents = get_batches_from_generator(document_objects,
                                                       batch_size)
        with tqdm(total=len(document_objects),
                  disable=not self.progress_bar) as progress_bar:
            for document_batch in batched_documents:
                vector_ids = []
                if add_vectors:
                    doc_ids = []
                    embeddings = []
                    for doc in document_batch:
                        doc_ids.append(doc.id)
                        if isinstance(doc.embedding, np.ndarray):
                            embeddings.append(doc.embedding.tolist())
                        elif isinstance(doc.embedding, list):
                            embeddings.append(doc.embedding)
                        else:
                            raise AttributeError(
                                f'Format of supplied document embedding {type(doc.embedding)} is not '
                                f'supported. Please use list or numpy.ndarray')

                    if self.update_existing_documents:
                        existing_docs = super().get_documents_by_id(
                            ids=doc_ids, index=index)
                        self._delete_vector_ids_from_milvus(
                            documents=existing_docs, index=index)

                    status, vector_ids = self.milvus_server.insert(
                        collection_name=index, records=embeddings)
                    if status.code != Status.SUCCESS:
                        raise RuntimeError(
                            f'Vector embedding insertion failed: {status}')

                docs_to_write_in_sql = []
                for idx, doc in enumerate(document_batch):
                    meta = doc.meta
                    if add_vectors:
                        meta["vector_id"] = vector_ids[idx]
                    docs_to_write_in_sql.append(doc)

                super().write_documents(docs_to_write_in_sql, index=index)
                progress_bar.update(batch_size)
        progress_bar.close()

        self.milvus_server.flush([index])
        if self.update_existing_documents:
            self.milvus_server.compact(collection_name=index)

    def update_embeddings(
        self,
        retriever: BaseRetriever,
        index: Optional[str] = None,
        batch_size: int = 10_000,
        update_existing_embeddings: bool = True,
        filters: Optional[Dict[str, List[str]]] = None,
    ):
Beispiel #17
0
#print(np.shape(np.array(vectors)))
#vector_ids = [1,2,3]
vectors = []
vector_ids = []
id_query = {}
with open("embed.txt", "r") as f:
    for line in f:
        line_lst = line.strip().split("\t")
        vectors.append(list(map(float, (line_lst[2].split()))))
        vector_ids.append(int(line_lst[1]))
        id_query[int(line_lst[1])] = line_lst[0]

milvus.insert(collection_name='test01', records=vectors, ids=vector_ids)

ivf_param = {'nlist': 16384}
milvus.create_index('test01', IndexType.IVF_FLAT, ivf_param)

search_param = {'nprobe': 16}
q_records = vectors[:1]
t = time.time()
result = milvus.search(collection_name='test01',
                       query_records=q_records,
                       top_k=10,
                       params=search_param)[1]
print(result)
print(time.time() - t)

t = time.time()
result = milvus.search(collection_name='test01',
                       query_records=q_records,
                       top_k=10,
Beispiel #18
0
class MilvusDocumentStore(SQLDocumentStore):
    """
    Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized on storing and processing vectors.
    Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR).
    In contrast to FAISS, Milvus ...
     - runs as a separate service (e.g. a Docker container) and can scale easily in a distributed environment
     - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index)
     - encapsulates multiple ANN libraries (FAISS, ANNOY ...)

    This class uses Milvus for all vector related storage, processing and querying.
    The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus
    does not allow these data types (yet).

    Usage:
    1. Start a Milvus server (see https://milvus.io/docs/v1.0.0/install_milvus.md)
    2. Init a MilvusDocumentStore in Haystack
    """
    def __init__(
        self,
        sql_url: str = "sqlite:///",
        milvus_url: str = "tcp://localhost:19530",
        connection_pool: str = "SingletonThread",
        index: str = "document",
        vector_dim: int = 768,
        index_file_size: int = 1024,
        similarity: str = "dot_product",
        index_type: IndexType = IndexType.FLAT,
        index_param: Optional[Dict[str, Any]] = None,
        search_param: Optional[Dict[str, Any]] = None,
        return_embedding: bool = False,
        embedding_field: str = "embedding",
        progress_bar: bool = True,
        duplicate_documents: str = 'overwrite',
        **kwargs,
    ):
        """
        :param sql_url: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale
                        deployment, Postgres is recommended. If using MySQL then same server can also be used for
                        Milvus metadata. For more details see https://milvus.io/docs/v1.0.0/data_manage.md.
        :param milvus_url: Milvus server connection URL for storing and processing vectors.
                           Protocol, host and port will automatically be inferred from the URL.
                           See https://milvus.io/docs/v1.0.0/install_milvus.md for instructions to start a Milvus instance.
        :param connection_pool: Connection pool type to connect with Milvus server. Default: "SingletonThread".
        :param index: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name").
        :param vector_dim: The embedding vector size. Default: 768.
        :param index_file_size: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB.
         When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment.
         Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one.
         As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048.
         Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory.
         (From https://milvus.io/docs/v1.0.0/performance_faq.md#How-can-I-get-the-best-performance-from-Milvus-through-setting-index_file_size)
        :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings.
                           'cosine' is recommended for Sentence Transformers, but is not directly supported by Milvus.
                           However, you can normalize your embeddings and use `dot_product` to get the same results.
                           See https://milvus.io/docs/v1.0.0/metric.md?Inner-product-(IP)#floating.
        :param index_type: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy.
                           Some popular options:
                           - FLAT (default): Exact method, slow
                           - IVF_FLAT, inverted file based heuristic, fast
                           - HSNW: Graph based, fast
                           - ANNOY: Tree based, fast
                           See: https://milvus.io/docs/v1.0.0/index.md
        :param index_param: Configuration parameters for the chose index_type needed at indexing time.
                            For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT.
                            See https://milvus.io/docs/v1.0.0/index.md
        :param search_param: Configuration parameters for the chose index_type needed at query time
                             For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT.
                             See https://milvus.io/docs/v1.0.0/index.md
        :param return_embedding: To return document embedding.
        :param embedding_field: Name of field containing an embedding vector.
        :param progress_bar: Whether to show a tqdm progress bar or not.
                             Can be helpful to disable in production deployments to keep the logs clean.
        :param duplicate_documents: Handle duplicates document based on parameter options.
                                    Parameter options : ( 'skip','overwrite','fail')
                                    skip: Ignore the duplicates documents
                                    overwrite: Update any existing documents with the same ID when adding documents.
                                    fail: an error is raised if the document ID of the document being added already
                                    exists.
        """

        # save init parameters to enable export of component config as YAML
        self.set_config(
            sql_url=sql_url,
            milvus_url=milvus_url,
            connection_pool=connection_pool,
            index=index,
            vector_dim=vector_dim,
            index_file_size=index_file_size,
            similarity=similarity,
            index_type=index_type,
            index_param=index_param,
            search_param=search_param,
            duplicate_documents=duplicate_documents,
            return_embedding=return_embedding,
            embedding_field=embedding_field,
            progress_bar=progress_bar,
        )

        self.milvus_server = Milvus(uri=milvus_url, pool=connection_pool)
        self.vector_dim = vector_dim
        self.index_file_size = index_file_size

        if similarity == "dot_product":
            self.metric_type = MetricType.IP
            self.similarity = similarity
        elif similarity == "l2":
            self.metric_type = MetricType.L2
            self.similarity = similarity
        else:
            raise ValueError(
                "The Milvus document store can currently only support dot_product and L2 similarity. "
                "Please set similarity=\"dot_product\" or \"l2\"")

        self.index_type = index_type
        self.index_param = index_param or {"nlist": 16384}
        self.search_param = search_param or {"nprobe": 10}
        self.index = index
        self._create_collection_and_index_if_not_exist(self.index)
        self.return_embedding = return_embedding
        self.embedding_field = embedding_field
        self.progress_bar = progress_bar
        self.duplicate_documents = duplicate_documents

        super().__init__(url=sql_url, index=index)

    def __del__(self):
        return self.milvus_server.close()

    def _create_collection_and_index_if_not_exist(
            self,
            index: Optional[str] = None,
            index_param: Optional[Dict[str, Any]] = None):
        index = index or self.index
        index_param = index_param or self.index_param

        status, ok = self.milvus_server.has_collection(collection_name=index)
        if not ok:
            collection_param = {
                'collection_name': index,
                'dimension': self.vector_dim,
                'index_file_size': self.index_file_size,
                'metric_type': self.metric_type
            }

            status = self.milvus_server.create_collection(collection_param)
            if status.code != Status.SUCCESS:
                raise RuntimeError(
                    f'Collection creation on Milvus server failed: {status}')

            status = self.milvus_server.create_index(index, self.index_type,
                                                     index_param)
            if status.code != Status.SUCCESS:
                raise RuntimeError(
                    f'Index creation on Milvus server failed: {status}')

    def _create_document_field_map(self) -> Dict:
        return {
            self.index: self.embedding_field,
        }

    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None,
                        batch_size: int = 10_000,
                        duplicate_documents: Optional[str] = None,
                        index_param: Optional[Dict[str, Any]] = None):
Beispiel #19
0
    client = Milvus()

    # create collection: dimension is 128, metric type is L2
    gen_collection(client, collection_name, 128, MetricType.L2)

    # insert 10000 vectors into collection
    _, _, vectors = insert_data(client, collection_name, 128, 10000)

    # flush data into disk for persistent storage
    client.flush([collection_name])

    # specify index param
    index_param = {"m": 16, "nlist": 1024}

    # create `IVF_PQ` index
    status = client.create_index(collection_name, IndexType.IVF_PQ,
                                 index_param)
    if status.OK():
        print("Create index IVF_PQ successfully\n")
    else:
        print("Create index fail: ", status)

    # select top 10 vectors from inserted as query vectors
    query_vectors = vectors[:10]

    # specify search param
    search_param = {"nprobe": 10}

    # specify topk is 1, search approximate nearest 1 neighbor
    status, result = client.search(collection_name,
                                   2,
                                   query_vectors,
def main():
    # Specify server addr when create milvus client instance
    # milvus client instance maintain a connection pool, param
    # `pool_size` specify the max connection num.
    milvus = Milvus(_HOST, _PORT)

    # Create collection demo_collection if it dosen't exist.
    collection_name = 'example_collection_'

    status, ok = milvus.has_collection(collection_name)
    if not ok:
        param = {
            'collection_name': collection_name,
            'dimension': _DIM,
            'index_file_size': _INDEX_FILE_SIZE,  # optional
            'metric_type': MetricType.L2  # optional
        }

        milvus.create_collection(param)

    # Show collections in Milvus server
    _, collections = milvus.list_collections()

    # Describe demo_collection
    _, collection = milvus.get_collection_info(collection_name)
    print(collection)

    # element per dimension is float32 type
    # vectors should be a 2-D array
    vectors = text2vec(index_sentences)
    print(vectors)

    # Insert vectors into demo_collection, return status and vectors id list
    status, ids = milvus.insert(collection_name=collection_name,
                                records=vectors)
    if not status.OK():
        print("Insert failed: {}".format(status))
    else:
        print(ids)
    #create a quick lookup table to easily access the indexed text/sentences given the ids
    look_up = {}
    for ID, sentences in zip(ids, index_sentences):
        look_up[ID] = sentences

    for k in look_up:
        print(k, look_up[k])

    # Flush collection  inserted data to disk.
    milvus.flush([collection_name])
    # Get demo_collection row count
    status, result = milvus.count_entities(collection_name)

    # present collection statistics info
    _, info = milvus.get_collection_stats(collection_name)
    print(info)

    # Obtain raw vectors by providing vector ids
    status, result_vectors = milvus.get_entity_by_id(collection_name, ids)

    # create index of vectors, search more rapidly
    index_param = {'nlist': 2048}

    # Create ivflat index in demo_collection
    # You can search vectors without creating index. however, Creating index help to
    # search faster
    print("Creating index: {}".format(index_param))
    status = milvus.create_index(collection_name, IndexType.IVF_FLAT,
                                 index_param)

    # describe index, get information of index
    status, index = milvus.get_index_info(collection_name)
    print(index)

    # Use the query sentences for similarity search
    query_vectors = text2vec(query_sentences)

    # execute vector similarity search
    search_param = {"nprobe": 16}

    print("Searching ... ")

    param = {
        'collection_name': collection_name,
        'query_records': query_vectors,
        'top_k': 1,
        'params': search_param,
    }

    status, results = milvus.search(**param)
    if status.OK():
        # indicate search result
        # also use by:
        #   `results.distance_array[0][0] == 0.0 or results.id_array[0][0] == ids[0]`
        if results[0][0].distance == 0.0 or results[0][0].id == ids[0]:
            print('Query result is correct')
        else:
            print('Query result isn\'t correct')

        # print results
        for res in results:
            for ele in res:
                print('id:{}, text:{}, distance: {}'.format(
                    ele.id, look_up[ele.id], ele.distance))

    else:
        print("Search failed. ", status)

    # Delete demo_collection
    status = milvus.drop_collection(collection_name)
Beispiel #21
0
def main():
    # Specify server addr when create milvus client instance
    # milvus client instance maintain a connection pool, param
    # `pool_size` specify the max connection num.
    # 获取服务端的连接
    milvus = Milvus(_HOST, _PORT)

    # Create collection demo_collection if it dosen't exist.
    # 创建collection
    collection_name = 'example_collection_'
    # 看是否有这个collection
    status, ok = milvus.has_collection(collection_name)
    # 如果没有则创建
    if not ok:
        param = {
            'collection_name': collection_name,
            'dimension': _DIM,
            'index_file_size': _INDEX_FILE_SIZE,  # optional
            'metric_type': MetricType.L2  # optional
        }
        # 创建collection
        milvus.create_collection(param)

    # Show collections in Milvus server
    # 查看所有的collection
    _, collections = milvus.list_collections()
    print(collections)

    # Describe demo_collection
    # 得到当前的collection
    _, collection = milvus.get_collection_info(collection_name)
    print(collection)

    # 10000 vectors with 128 dimension
    # element per dimension is float32 type
    # vectors should be a 2-D array
    # 创建10个长度为8的向量
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(10)]
    print(vectors)
    # You can also use numpy to generate random vectors:
    #   vectors = np.random.rand(10000, _DIM).astype(np.float32)

    # Insert vectors into demo_collection, return status and vectors id list
    # 把这10个向量都插入milvus
    status, ids = milvus.insert(collection_name=collection_name,
                                records=vectors)
    if not status.OK():
        print("Insert failed: {}".format(status))
    print(ids)

    # Flush collection  inserted data to disk.
    # 数据落盘
    milvus.flush([collection_name])
    # Get demo_collection row count
    # 得到当前row的数量
    status, result = milvus.count_entities(collection_name)
    print(status)
    print(result)

    # present collection statistics info
    # 查看collection的统计数据
    _, info = milvus.get_collection_stats(collection_name)
    print(info)

    # Obtain raw vectors by providing vector ids
    # 得到前十个数据
    status, result_vectors = milvus.get_entity_by_id(collection_name, ids[:10])
    print(result_vectors)

    # create index of vectors, search more rapidly
    # 创建索引
    index_param = {'nlist': 2048}

    # Create ivflat index in demo_collection
    # You can search vectors without creating index. however, Creating index help to
    # search faster
    # 创建ivf_flat
    print("Creating index: {}".format(index_param))
    status = milvus.create_index(collection_name, IndexType.IVF_FLAT,
                                 index_param)

    # describe index, get information of index
    # 得到索引的信息
    status, index = milvus.get_index_info(collection_name)
    print(index)

    # Use the top 10 vectors for similarity search
    # 对前10个数据进行query
    query_vectors = vectors[0:10]

    # execute vector similarity search
    # 索引的搜索的中心点数量
    search_param = {"nprobe": 16}

    print("Searching ... ")

    param = {
        'collection_name': collection_name,
        'query_records': query_vectors,
        'top_k': 1,
        'params': search_param,
    }
    # 进行搜索
    status, results = milvus.search(**param)
    if status.OK():
        print(results)
        # indicate search result
        # also use by:
        #   `results.distance_array[0][0] == 0.0 or results.id_array[0][0] == ids[0]`
        if results[0][0].distance == 0.0 or results[0][0].id == ids[0]:
            print('Query result is correct')
        else:
            print('Query result isn\'t correct')

        # print results
        print(results)
    else:
        print("Search failed. ", status)

    # Delete demo_collection
    # 删除掉collection
    status = milvus.drop_collection(collection_name)
Beispiel #22
0
class MilvusClient(object):
    def __init__(self, collection_name=None, ip=None, port=None, timeout=60):
        self._collection_name = collection_name
        try:
            i = 1
            start_time = time.time()
            if not ip:
                self._milvus = Milvus(host=SERVER_HOST_DEFAULT,
                                      port=SERVER_PORT_DEFAULT)
            else:
                # retry connect for remote server
                while time.time() < start_time + timeout:
                    try:
                        self._milvus = Milvus(host=ip, port=port)
                        if self._milvus.server_status():
                            logger.debug(
                                "Try connect times: %d, %s" %
                                (i, round(time.time() - start_time, 2)))
                            break
                    except Exception as e:
                        logger.debug("Milvus connect failed")
                        i = i + 1

        except Exception as e:
            raise e

    def __str__(self):
        return 'Milvus collection %s' % self._collection_name

    def check_status(self, status):
        if not status.OK():
            logger.error(status.message)
            raise Exception("Status not ok")

    def check_result_ids(self, result):
        for index, item in enumerate(result):
            if item[0].distance >= epsilon:
                logger.error(index)
                logger.error(item[0].distance)
                raise Exception("Distance wrong")

    def create_collection(self, collection_name, dimension, index_file_size,
                          metric_type):
        if not self._collection_name:
            self._collection_name = collection_name
        if metric_type == "l2":
            metric_type = MetricType.L2
        elif metric_type == "ip":
            metric_type = MetricType.IP
        elif metric_type == "jaccard":
            metric_type = MetricType.JACCARD
        elif metric_type == "hamming":
            metric_type = MetricType.HAMMING
        elif metric_type == "sub":
            metric_type = MetricType.SUBSTRUCTURE
        elif metric_type == "super":
            metric_type = MetricType.SUPERSTRUCTURE
        else:
            logger.error("Not supported metric_type: %s" % metric_type)
        create_param = {
            'collection_name': collection_name,
            'dimension': dimension,
            'index_file_size': index_file_size,
            "metric_type": metric_type
        }
        status = self._milvus.create_collection(create_param)
        self.check_status(status)

    @time_wrapper
    def insert(self, X, ids=None):
        status, result = self._milvus.add_vectors(self._collection_name, X,
                                                  ids)
        self.check_status(status)
        return status, result

    @time_wrapper
    def delete_vectors(self, ids):
        status = self._milvus.delete_by_id(self._collection_name, ids)
        self.check_status(status)

    @time_wrapper
    def flush(self):
        status = self._milvus.flush([self._collection_name])
        self.check_status(status)

    @time_wrapper
    def compact(self):
        status = self._milvus.compact(self._collection_name)
        self.check_status(status)

    @time_wrapper
    def create_index(self, index_type, index_param=None):
        index_type = INDEX_MAP[index_type]
        logger.info(
            "Building index start, collection_name: %s, index_type: %s" %
            (self._collection_name, index_type))
        if index_param:
            logger.info(index_param)
        status = self._milvus.create_index(self._collection_name, index_type,
                                           index_param)
        self.check_status(status)

    def describe_index(self):
        status, result = self._milvus.describe_index(self._collection_name)
        self.check_status(status)
        index_type = None
        for k, v in INDEX_MAP.items():
            if result._index_type == v:
                index_type = k
                break
        return {"index_type": index_type, "index_param": result._params}

    def drop_index(self):
        logger.info("Drop index: %s" % self._collection_name)
        return self._milvus.drop_index(self._collection_name)

    @time_wrapper
    def query(self, X, top_k, search_param=None):
        status, result = self._milvus.search_vectors(self._collection_name,
                                                     top_k,
                                                     query_records=X,
                                                     params=search_param)
        self.check_status(status)
        return result

    @time_wrapper
    def query_ids(self, top_k, ids, search_param=None):
        status, result = self._milvus.search_by_ids(self._collection_name,
                                                    ids,
                                                    top_k,
                                                    params=search_param)
        self.check_result_ids(result)
        return result

    def count(self):
        return self._milvus.count_collection(self._collection_name)[1]

    def delete(self, timeout=120):
        timeout = int(timeout)
        logger.info("Start delete collection: %s" % self._collection_name)
        self._milvus.drop_collection(self._collection_name)
        i = 0
        while i < timeout:
            if self.count():
                time.sleep(1)
                i = i + 1
                continue
            else:
                break
        if i >= timeout:
            logger.error("Delete collection timeout")

    def describe(self):
        return self._milvus.describe_collection(self._collection_name)

    def show_collections(self):
        return self._milvus.show_collections()

    def exists_collection(self, collection_name=None):
        if collection_name is None:
            collection_name = self._collection_name
        status, res = self._milvus.has_collection(collection_name)
        # self.check_status(status)
        return res

    @time_wrapper
    def preload_collection(self):
        status = self._milvus.preload_collection(self._collection_name,
                                                 timeout=3000)
        self.check_status(status)
        return status

    def get_server_version(self):
        status, res = self._milvus.server_version()
        return res

    def get_server_mode(self):
        return self.cmd("mode")

    def get_server_commit(self):
        return self.cmd("build_commit_id")

    def get_server_config(self):
        return json.loads(self.cmd("get_config *"))

    def get_mem_info(self):
        result = json.loads(self.cmd("get_system_info"))
        result_human = {
            # unit: Gb
            "memory_used":
            round(int(result["memory_used"]) / (1024 * 1024 * 1024), 2)
        }
        return result_human

    def cmd(self, command):
        status, res = self._milvus._cmd(command)
        logger.info("Server command: %s, result: %s" % (command, res))
        self.check_status(status)
        return res
Beispiel #23
0
def main():
    milvus = Milvus(handler="HTTP")

    # Connect to Milvus server
    # You may need to change _HOST and _PORT accordingly
    param = {'host': _HOST, 'port': _PORT}
    status = milvus.connect(**param)
    if status.OK():
        print("Server connected.")
    else:
        print("Server connect fail.")
        sys.exit(1)

    # Create table demo_table if it dosen't exist.
    table_name = 'demo_tables'

    status, ok = milvus.has_table(table_name)
    if not ok:
        param = {
            'table_name': table_name,
            'dimension': _DIM,
            'index_file_size': _INDEX_FILE_SIZE,  # optional
            'metric_type': MetricType.L2  # optional
        }

        milvus.create_table(param)

    # Show tables in Milvus server
    _, tables = milvus.show_tables()

    # Describe demo_table
    _, table = milvus.describe_table(table_name)
    print(table)

    # 10000 vectors with 16 dimension
    # element per dimension is float32 type
    # vectors should be a 2-D array
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(100000)]
    # You can also use numpy to generate random vectors:
    #     `vectors = np.random.rand(10000, 16).astype(np.float32).tolist()`

    # Insert vectors into demo_table, return status and vectors id list
    status, ids = milvus.insert(table_name=table_name, records=vectors)

    # Wait for 6 seconds, until Milvus server persist vector data.
    time.sleep(6)

    # Get demo_table row count
    status, result = milvus.count_table(table_name)

    # create index of vectors, search more rapidly
    index_param = {
        'index_type': IndexType.IVFLAT,  # choice ivflat index
        'nlist': 2048
    }

    # Create ivflat index in demo_table
    # You can search vectors without creating index. however, Creating index help to
    # search faster
    status = milvus.create_index(table_name, index_param)

    # describe index, get information of index
    status, index = milvus.describe_index(table_name)
    print(index)

    # Use the top 10 vectors for similarity search
    query_vectors = vectors[0:10]

    # execute vector similarity search
    param = {
        'table_name': table_name,
        'query_records': query_vectors,
        'top_k': 1,
        'nprobe': 16
    }
    status, results = milvus.search(**param)

    if status.OK():
        # indicate search result
        # also use by:
        #   `results.distance_array[0][0] == 0.0 or results.id_array[0][0] == ids[0]`
        if results[0][0].distance == 0.0 or results[0][0].id == ids[0]:
            print('Query result is correct')
        else:
            print('Query result isn\'t correct')

    # print results
    print(results)

    # Delete demo_table
    status = milvus.drop_table(table_name)

    # Disconnect from Milvus
    status = milvus.disconnect()
Beispiel #24
0
class MilvusHelper(BaseVectorSimilarityHelper):
    def __init__(self, _server_url, _server_port, _timeout=10):
        super().__init__()
        self.server_url = _server_url
        self.server_port = _server_port
        self.timeout = _timeout
        self.client = None
        self.metric_type_mapper = {
            VectorMetricType.L2: MetricType.L2,
            VectorMetricType.IP: MetricType.IP,
            VectorMetricType.JACCARD: MetricType.JACCARD,
            VectorMetricType.HAMMING: MetricType.HAMMING,
        }

        self.index_type_mapper = {
            VectorIndexType.FLAT: IndexType.FLAT,
            VectorIndexType.IVFLAT: IndexType.IVFLAT,
            VectorIndexType.IVF_SQ8: IndexType.IVF_SQ8,
            VectorIndexType.RNSG: IndexType.RNSG,
            VectorIndexType.IVF_SQ8H: IndexType.IVF_SQ8H,
            VectorIndexType.IVF_PQ: IndexType.IVF_PQ,
            VectorIndexType.HNSW: IndexType.HNSW,
            VectorIndexType.ANNOY: IndexType.ANNOY,
        }

    def init(self):
        if self.client is None:
            if not (self.server_url is None or self.server_url is None):
                try:
                    self.client = Milvus(host=self.server_url, port=self.server_port)
                except:
                    raise MilvusRuntimeException(f'cannot connect to {self.server_url}:{self.server_port}')
            else:
                raise MilvusRuntimeException('Milvus config is not correct')

    def insert(self, _database_name, _to_insert_vector, _partition_tag=None, _params=None):
        """
        向数据库中插入一系列的特征向量

        notes:如果用户有自己的id,建议使用insert_with_id函数

        ATTENTION!!!

        一个库中不能既调用insert_with_id还调用insert,只能调用一种,否则会报错

        Args:
            _database_name:     数据库名称
            _to_insert_vector:  待插入的特征向量的列表
            _partition_tag:     分区标签
            _params:    插入参数

        Returns:    插入后的id

        """
        self.init()
        status, ids = self.client.insert(_database_name, _to_insert_vector, partition_tag=_partition_tag,
                                         params=_params,
                                         timeout=self.timeout)
        self.flush(_database_name)
        if status.OK():
            return ids
        else:
            raise MilvusRuntimeException(status.message)

    def insert_with_id(self, _database_name, _to_insert_vector, _to_insert_ids, _partition_tag=None, _params=None):
        """
        向数据库中插入一系列的有固定id的特征向量

        ATTENTION!!!

        一个库中不能既调用insert_with_id还调用insert,只能调用一种,否则会报错

        Args:
            _database_name:     数据库名称
            _to_insert_vector:  待插入的特征向量的列表
            _to_insert_ids:  待插入的特征向量的id的列表,每个元素必须为正整数,且不越界
            _partition_tag:     分区标签
            _params:    插入参数

        Returns:    插入后的id

        """
        self.init()
        status, ids = self.client.insert(_database_name, _to_insert_vector,
                                         ids=_to_insert_ids, partition_tag=_partition_tag,
                                         params=_params,
                                         timeout=self.timeout)
        self.flush(_database_name)
        if status.OK():
            return ids
        else:
            raise MilvusRuntimeException(status.message)

    def delete(self, _database_name, _to_delete_ids):
        """
        删除特定id

        Args:
            _database_name:     数据库名称
            _to_delete_ids:     待删除的id

        Returns:    是否删除成功

        """
        self.init()
        status = self.client.delete_entity_by_id(_database_name, _to_delete_ids, self.timeout)
        self.flush(_database_name)
        if status.OK():
            return True
        else:
            raise MilvusRuntimeException(status.message)

    def database_exist(self, _database_name):
        """
        数据库是否存在

        Args:
            _database_name:     数据库名称

        Returns:    是否存在

        """
        self.init()
        status, is_exist = self.client.has_collection(_database_name, self.timeout)
        if status.OK():
            return is_exist
        else:
            raise MilvusRuntimeException(status.message)

    def create_database(self, _database_name, _dimension, _index_file_size, _metric_type):
        """
        创建数据库

        Args:
            _database_name:     数据库名称
            _dimension:     特征向量维度
            _index_file_size:   index的文件大小
            _metric_type:   度量类型

        Returns:    是否创建成功

        """
        self.init()
        if not self.database_exist(_database_name):
            assert _metric_type in self.metric_type_mapper, f'{_metric_type} not support in milvus'
            status = self.client.create_collection({
                'collection_name': _database_name,
                'dimension': _dimension,
                'index_file_size': _index_file_size,
                'metric_type': self.metric_type_mapper[_metric_type]
            })
            if status.OK():
                return True
            else:
                raise MilvusRuntimeException(status.message)
        else:
            return True

    def create_index(self, _database_name, _index_type):
        """
        创建index(索引)

        Args:
            _database_name:     数据库名称
            _index_type:    index类型

        Returns:    是否创建成功

        """
        self.init()
        if self.database_exist(_database_name):
            assert _index_type in self.index_type_mapper, f'{_index_type} not support in milvus'
            status = self.client.create_index(_database_name, self.index_type_mapper[_index_type], timeout=self.timeout)
            if status.OK():
                return True
            else:
                raise MilvusRuntimeException(status.message)

    def search(self, _database_name, _query_vector_list, _top_k, _partition_tag=None, _params=None):
        """
        检索用参数

        Args:
            _database_name:     数据库名称
            _query_vector_list:      检索用的特征向量列表
            _top_k:     top k
            _partition_tag:     分区标签
            _params:    检索参数

        Returns:    检索的结果,包含id和distance

        """
        self.init()
        if self.database_exist(_database_name):
            status, search_result = self.client.search(_database_name, _top_k, _query_vector_list,
                                                       partition_tags=_partition_tag,
                                                       params=_params,
                                                       timeout=self.timeout)
            if status.OK():
                return search_result
            else:
                raise MilvusRuntimeException(status.message)
        else:
            raise DatabaseNotExist(f'{_database_name} not exist')

    def flush(self, _database_name):
        """
        sink数据库

        Args:
            _database_name:     数据库名称

        Returns:    是否flush成功

        """
        self.init()
        status = self.client.flush([_database_name, ], self.timeout)
        if status.OK():
            return True
        else:
            raise MilvusRuntimeException(status.message)

    def __del__(self):
        if self.client is not None:
            self.client.close()
Beispiel #25
0
    gen_collection(client, collection_name, 128, MetricType.L2)

    # insert 10000 vectors into collection
    _, _, vectors = insert_data(client, collection_name, 128, 100000)

    # flush data into disk for persistent storage
    client.flush([collection_name])

    # specify index param
    index_param = {
        "n_trees": 50,
    }

    # create `ANNOY` index
    print("Start create index ......")
    status = client.create_index(collection_name, IndexType.ANNOY, index_param)
    if status.OK():
        print("Create index ANNOY successfully\n")
    else:
        print("Create index ANNOY fail")

    # select top 10 vectors from inserted as query vectors
    query_vectors = vectors[:10]

    # specify search param
    search_param = {"search_k": 10}

    # specify topk is 1, search approximate nearest 1 neighbor
    status, result = client.search(collection_name,
                                   1,
                                   query_vectors,
def main():
    # Connect to Milvus server
    # You may need to change _HOST and _PORT accordingly
    param = {'host': _HOST, 'port': _PORT}

    # You can create a instance specified server addr and
    # invoke rpc method directly
    client = Milvus(**param)
    # Create collection demo_collection if it dosen't exist.
    collection_name = 'demo_partition_collection'
    partition_tag = "random"

    status, ok = client.has_collection(collection_name)
    # if collection exists, then drop it
    if status.OK() and ok:
        client.drop_collection(collection_name)

    param = {
        'collection_name': collection_name,
        'dimension': _DIM,
        'index_file_size': _INDEX_FILE_SIZE,  # optional
        'metric_type': MetricType.L2  # optional
    }

    client.create_collection(param)

    # Show collections in Milvus server
    _, collections = client.show_collections()

    # Describe collection
    _, collection = client.describe_collection(collection_name)
    print(collection)

    # create partition
    client.create_partition(collection_name, partition_tag=partition_tag)
    # display partitions
    _, partitions = client.show_partitions(collection_name)

    # 10000 vectors with 16 dimension
    # element per dimension is float32 type
    # vectors should be a 2-D array
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(10000)]
    # You can also use numpy to generate random vectors:
    #     `vectors = np.random.rand(10000, 16).astype(np.float32).tolist()`

    # Insert vectors into partition of collection, return status and vectors id list
    status, ids = client.insert(collection_name=collection_name, records=vectors, partition_tag=partition_tag)

    # Wait for 6 seconds, until Milvus server persist vector data.
    time.sleep(6)

    # Get demo_collection row count
    status, num = client.count_collection(collection_name)

    # create index of vectors, search more rapidly
    index_param = {
        'nlist': 2048
    }

    # Create ivflat index in demo_collection
    # You can search vectors without creating index. however, Creating index help to
    # search faster
    status = client.create_index(collection_name, IndexType.IVF_FLAT, index_param)

    # describe index, get information of index
    status, index = client.describe_index(collection_name)
    print(index)

    # Use the top 10 vectors for similarity search
    query_vectors = vectors[0:10]

    # execute vector similarity search, search range in partition `partition1`
    search_param = {
        "nprobe": 10
    }

    param = {
        'collection_name': collection_name,
        'query_records': query_vectors,
        'top_k': 1,
        'partition_tags': ["random"],
        'params': search_param
    }
    status, results = client.search(**param)

    if status.OK():
        # indicate search result
        # also use by:
        #   `results.distance_array[0][0] == 0.0 or results.id_array[0][0] == ids[0]`
        if results[0][0].distance == 0.0 or results[0][0].id == ids[0]:
            print('Query result is correct')
        else:
            print('Query result isn\'t correct')

    # print results
    print(results)

    # Delete partition. You can also invoke `drop_collection()`, so that all of partitions belongs to
    # designated collections will be deleted.
    # status = client.drop_partition(collection_name, partition_tag)

    # Delete collection. All of partitions of this collection will be dropped.
    status = client.drop_collection(collection_name)
Beispiel #27
0
def main():
    milvus = Milvus()

    # Connect to Milvus server
    # You may need to change _HOST and _PORT accordingly
    param = {'host': _HOST, 'port': _PORT}
    status = milvus.connect(**param)
    if status.OK():
        print("Server connected.")
    else:
        print("Server connect fail.")
        sys.exit(1)

    # Create collection demo_collection if it dosen't exist.
    collection_name = 'example_collection'

    status, ok = milvus.has_collection(collection_name)
    if not ok:
        param = {
            'collection_name': collection_name,
            'dimension': _DIM,
            'index_file_size': _INDEX_FILE_SIZE,  # optional
            'metric_type': MetricType.L2  # optional
        }

        milvus.create_collection(param)

    # Show collections in Milvus server
    _, collections = milvus.show_collections()

    # present collection info
    _, info = milvus.collection_info(collection_name)
    print(info)

    # Describe demo_collection
    _, collection = milvus.describe_collection(collection_name)
    print(collection)

    # 10000 vectors with 16 dimension
    # element per dimension is float32 type
    # vectors should be a 2-D array
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(10000)]
    # You can also use numpy to generate random vectors:
    #     `vectors = np.random.rand(10000, 16).astype(np.float32)`

    # Insert vectors into demo_collection, return status and vectors id list
    status, ids = milvus.insert(collection_name=collection_name, records=vectors)

    # Flush collection  inserted data to disk.
    milvus.flush([collection_name])

    # Get demo_collection row count
    status, result = milvus.count_collection(collection_name)

    # create index of vectors, search more rapidly
    index_param = {
        'nlist': 2048
    }

    # Create ivflat index in demo_collection
    # You can search vectors without creating index. however, Creating index help to
    # search faster
    print("Creating index: {}".format(index_param))
    status = milvus.create_index(collection_name, IndexType.IVF_FLAT, index_param)

    # describe index, get information of index
    status, index = milvus.describe_index(collection_name)
    print(index)

    # Use the top 10 vectors for similarity search
    query_vectors = vectors[0:10]

    # execute vector similarity search
    search_param = {
        "nprobe": 16
    }
    param = {
        'collection_name': collection_name,
        'query_records': query_vectors,
        'top_k': 1,
        'params': search_param
    }
    print("Searching ... ")
    status, results = milvus.search(**param)

    if status.OK():
        # indicate search result
        # also use by:
        #   `results.distance_array[0][0] == 0.0 or results.id_array[0][0] == ids[0]`
        if results[0][0].distance == 0.0 or results[0][0].id == ids[0]:
            print('Query result is correct')
        else:
            print('Query result isn\'t correct')

    # print results
    print(results)

    # Delete demo_collection
    status = milvus.drop_collection(collection_name)

    # Disconnect from Milvus
    status = milvus.disconnect()
Beispiel #28
0
class MilvusDBHandler:
    """Milvus DB handler
        This class is intended to abstract the access and communication with external MilvusDB from Executors

        For more information about Milvus:
            - https://github.com/milvus-io/milvus/
    """

    @staticmethod
    def get_index_type(index_type):
        from milvus import IndexType

        return {
            'Flat': IndexType.FLAT,
            'IVF,Flat': IndexType.IVFLAT,
            'IVF,SQ8': IndexType.IVF_SQ8,
            'RNSG': IndexType.RNSG,
            'IVF,SQ8H': IndexType.IVF_SQ8H,
            'IVF,PQ': IndexType.IVF_PQ,
            'HNSW': IndexType.IVF_PQ,
            'Annoy': IndexType.ANNOY
        }.get(index_type, IndexType.FLAT)

    class MilvusDBInserter:
        """Milvus DB Inserter
            This class is an inner class and provides a context manager to insert vectors into Milvus while ensuring
            data is flushed.

            For more information about Milvus:
                - https://github.com/milvus-io/milvus/
        """

        def __init__(self, client, collection_name: str):
            self.logger = get_logger(self.__class__.__name__)
            self.client = client
            self.collection_name = collection_name

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            self.logger.info(f'Sending flush command to Milvus Server for collection: {self.collection_name}')
            self.client.flush([self.collection_name])

        def insert(self, keys: list, vectors: 'np.ndarray'):
            status, _ = self.client.insert(collection_name=self.collection_name, records=vectors, ids=keys)
            if not status.OK():
                self.logger.error('Insert failed: {}'.format(status))
                raise MilvusDBException(status.message)

    def __init__(self, host: str, port: int, collection_name: str):
        """
        Initialize an MilvusDBHandler

        :param host: Host of the Milvus Server
        :param port: Port to connect to the Milvus Server
        :param collection_name: Name of the collection where the Handler will insert and query vectors.
        """
        self.logger = get_logger(self.__class__.__name__)
        self.host = host
        self.port = str(port)
        self.collection_name = collection_name
        self.milvus_client = None

    def __enter__(self):
        return self.connect()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def connect(self):
        from milvus import Milvus
        if self.milvus_client is None or not self.milvus_client.server_status()[0].OK():
            self.logger.info(f'Setting connection to Milvus Server at {self.host}:{self.port}')
            self.milvus_client = Milvus(self.host, self.port)
        return self

    def close(self):
        self.logger.info(f'Closing connection to Milvus Server at {self.host}:{self.port}')
        self.milvus_client.close()

    def insert(self, keys: 'np.ndarray', vectors: 'np.ndarray'):
        with MilvusDBHandler.MilvusDBInserter(self.milvus_client, self.collection_name) as db:
            db.insert(reduce(operator.concat, keys.tolist()), vectors)

    def build_index(self, index_type: str, index_params: dict):
        type = self.get_index_type(index_type)

        self.logger.info(f'Creating index of type: {index_type} at'
                         f' Milvus Server. collection: {self.collection_name} with index params: {index_params}')
        status = self.milvus_client.create_index(self.collection_name, type, index_params)
        if not status.OK():
            self.logger.error('Creating index failed: {}'.format(status))
            raise MilvusDBException(status.message)

    def search(self, query_vectors: 'np.ndarray', top_k: int, search_params: dict = None):
        self.logger.info(f'Querying collection: {self.collection_name} with search params: {search_params}')
        status, results = self.milvus_client.search(collection_name=self.collection_name,
                                                    query_records=query_vectors, top_k=top_k, params=search_params)
        if not status.OK():
            self.logger.error('Querying index failed: {}'.format(status))
            raise MilvusDBException(status.message)
        else:
            return results.distance_array, results.id_array
Beispiel #29
0
        'table_name': table,
        'dimension': dim,
        'metric_type': MetricType.L2,
        'index_file_size': 10
    }
    status = connect.create_table(param)
    print(status)
    print(connect.get_table_row_count(table))
    # add vectors
    for i in range(10):
        status, ids = connect.add_vectors(table, vectors)
        print(status)
        print(ids[0])
    # print(ids[0])
    index_params = {"index_type": IndexType.IVFLAT, "nlist": 16384}
    status = connect.create_index(table, index_params)
    print(status)
    # sys.exit()
    query_vec = [vectors[0]]
    # print(numpy.inner(numpy.array(query_vec[0]), numpy.array(query_vec[0])))
    top_k = 12
    nprobe = 1
    for i in range(2):
        result = connect.search_vectors(table, top_k, nprobe, query_vec)
        print(result)
    sys.exit()

    table = gen_unique_str("test_add_vector_with_multiprocessing")
    uri = "tcp://%s:%s" % (args["ip"], args["port"])
    param = {
        'table_name': table,
Beispiel #30
0
def main():
    milvus = Milvus(_HOST, _PORT)

    # num = random.randint(1, 100000)
    num = 100000
    # Create collection demo_collection if it dosen't exist.
    collection_name = 'example_hybrid_collections_{}'.format(num)
    if milvus.has_collection(collection_name):
        milvus.drop_collection(collection_name)

    collection_param = {
        "fields": [{
            "field": "A",
            "type": DataType.INT32
        }, {
            "field": "B",
            "type": DataType.INT32
        }, {
            "field": "C",
            "type": DataType.INT64
        }, {
            "field": "Vec",
            "type": DataType.FLOAT_VECTOR,
            "params": {
                "dim": 128,
                "metric_type": "L2"
            }
        }],
        "segment_size":
        100
    }
    milvus.create_collection(collection_name, collection_param)

    milvus.compact(collection_name)

    # milvus.create_partition(collection_name, "p_01", timeout=1800)
    # pars = milvus.list_partitions(collection_name)
    # ok = milvus.has_partition(collection_name, "p_01", timeout=1800)
    # assert ok
    # ok = milvus.has_partition(collection_name, "p_02")
    # assert not ok
    # for p in pars:
    #     if p == "_default":
    #         continue
    #     milvus.drop_partition(collection_name, p)

    # milvus.drop_collection(collection_name)
    # sys.exit(0)

    A_list = [random.randint(0, 255) for _ in range(num)]
    vec = [[random.random() for _ in range(128)] for _ in range(num)]
    hybrid_entities = [{
        "field": "A",
        "values": A_list,
        "type": DataType.INT32
    }, {
        "field": "B",
        "values": A_list,
        "type": DataType.INT32
    }, {
        "field": "C",
        "values": A_list,
        "type": DataType.INT64
    }, {
        "field": "Vec",
        "values": vec,
        "type": DataType.FLOAT_VECTOR,
        "params": {
            "dim": 128
        }
    }]

    for slice_e in utils.entities_slice(hybrid_entities):
        ids = milvus.insert(collection_name, slice_e)
    milvus.flush([collection_name])
    print("Flush ... ")
    # time.sleep(3)
    count = milvus.count_entities(collection_name)

    milvus.delete_entity_by_id(collection_name, ids[:1])
    milvus.flush([collection_name])
    print("Get entity be id start ...... ")
    entities = milvus.get_entity_by_id(collection_name, ids[:1])
    et = entities.dict()
    milvus.delete_entity_by_id(collection_name, ids[1:2])
    milvus.flush([collection_name])

    print("Create index ......")
    milvus.create_index(collection_name, "Vec", {
        "index_type": "IVF_FLAT",
        "metric_type": "L2",
        "params": {
            "nlist": 100
        }
    })
    print("Create index done.")

    info = milvus.get_collection_info(collection_name)
    print(info)
    stats = milvus.get_collection_stats(collection_name)
    print("\nstats\n")
    print(stats)
    query_hybrid = \
    {
        "bool": {
            "must": [
                {
                    "term": {
                        "A": [1, 2, 5]
                    }
                },
                {
                    "range": {
                        "B": {"GT": 1, "LT": 100}
                    }
                },
                {
                    "vector": {
                        "Vec": {
                            "topk": 10, "query": vec[: 10000], "params": {"nprobe": 10}
                        }
                    }
                }
            ],
        },
    }

    # print("Start searach ..", flush=True)
    # results = milvus.search(collection_name, query_hybrid)
    # print(results)
    #
    # for r in list(results):
    #     print("ids", r.ids)
    #     print("distances", r.distances)

    t0 = time.time()
    count = 0
    results = milvus.search(collection_name, query_hybrid, fields=["B"])
    for r in list(results):
        # print("ids", r.ids)
        # print("distances", r.distances)
        for rr in r:
            count += 1
            # print(rr.entity.get("B"))

    print("Search cost {} s".format(time.time() - t0))

    # for result in results:
    #     for r in result:
    #         print(f"{r}")

    # itertor entity id
    # for result in results:
    #     for r in result:
    #         # get distance
    #         dis = r.distance
    #         id_ = r.id
    #         # obtain all field name
    #         fields = r.entity.fields
    #         for f in fields:
    #             # get field value by field name
    #             # fv = r.entity.
    #             fv = r.entity.value_of_field(f)
    #             print(fv)

    milvus.drop_collection(collection_name)