コード例 #1
0
ファイル: example.py プロジェクト: qianchen94/pymilvus
def main():
    # Specify server addr when create milvus client instance
    # milvus client instance maintain a connection pool, param
    # `pool_size` specify the max connection num.
    milvus = Milvus(_HOST, _PORT)

    # Create collection demo_collection if it dosen't exist.
    collection_name = 'example_collection_'

    ok = milvus.has_collection(collection_name)
    if not ok:
        param = {
            'collection_name': collection_name,
            'dimension': _DIM,
            'index_file_size': _INDEX_FILE_SIZE,  # optional
            'metric_type': MetricType.L2  # optional
        }

        milvus.create_collection(param)

    # Show collections in Milvus server
    _, collections = milvus.list_collections()

    # Describe demo_collection
    _, collection = milvus.get_collection_info(collection_name)
    print(collection)

    # 10000 vectors with 128 dimension
    # element per dimension is float32 type
    # vectors should be a 2-D array
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(10)]
    print(vectors)
    # You can also use numpy to generate random vectors:
    #   vectors = np.random.rand(10000, _DIM).astype(np.float32)

    # Insert vectors into demo_collection, return status and vectors id list
    status, ids = milvus.insert(collection_name=collection_name,
                                records=vectors)
    if not status.OK():
        print("Insert failed: {}".format(status))

    # Flush collection  inserted data to disk.
    milvus.flush([collection_name])
    # Get demo_collection row count
    status, result = milvus.count_entities(collection_name)

    # present collection statistics info
    _, info = milvus.get_collection_stats(collection_name)
    print(info)

    # Obtain raw vectors by providing vector ids
    status, result_vectors = milvus.get_entity_by_id(collection_name, ids[:10])

    # create index of vectors, search more rapidly
    index_param = {'nlist': 2048}

    # Create ivflat index in demo_collection
    # You can search vectors without creating index. however, Creating index help to
    # search faster
    print("Creating index: {}".format(index_param))
    status = milvus.create_index(collection_name, IndexType.IVF_FLAT,
                                 index_param)

    # describe index, get information of index
    status, index = milvus.get_index_info(collection_name)
    print(index)

    # Use the top 10 vectors for similarity search
    query_vectors = vectors[0:10]

    # execute vector similarity search
    search_param = {"nprobe": 16}

    print("Searching ... ")

    param = {
        'collection_name': collection_name,
        'query_records': query_vectors,
        'top_k': 1,
        'params': search_param,
    }

    status, results = milvus.search(**param)
    if status.OK():
        # indicate search result
        # also use by:
        #   `results.distance_array[0][0] == 0.0 or results.id_array[0][0] == ids[0]`
        if results[0][0].distance == 0.0 or results[0][0].id == ids[0]:
            print('Query result is correct')
        else:
            print('Query result isn\'t correct')

        # print results
        print(results)
    else:
        print("Search failed. ", status)

    # Delete demo_collection
    status = milvus.drop_collection(collection_name)
コード例 #2
0
#建立分区
milvus.create_partition('test01', 'tag01')

#nlist聚类为多少簇
ivf_param = {'nlist': 16}
milvus.create_index('test01', IndexType.IVF_FLAT, ivf_param) #可以不写这个,默认方式IndexType.FLAT
# 随机生成一批向量数据
vectors_array = np.random.rand(num_vec,vec_dim)
vectors_list = vectors_array.tolist()
vectors_list.append([1 for _ in range(vec_dim)])
ids_list = [i for i in range(len(vectors_list))]
#单次插入的数据量不能大于 256 MB,插入后存在缓存区,缓存区大小由参数index_file_size决定,默认1024M
milvus.insert(collection_name='test01', records=vectors_list, partition_tag="tag01",ids=ids_list)
#一些信息
print(milvus.list_collections())
print(milvus.get_collection_info('test01'))
print(milvus.get_index_info('test01'))
t2 = time.time()
print("create cost:",t2-t1)
# # 创建查询向量
query_vec_list = [[1 for _ in range(vec_dim)]]
# 进行查询, 注意这里的参数nprobe和建立索引时的参数nlist 会因为索引类型不同而影响到查询性能和查询准确率
#IVF_FLAT下查询多少个簇,不能超过nlist
search_param = {'nprobe': 10}
#现在数据还在内存,需要数据落盘,保存到数据库中去,不然查不到数据
milvus.flush(collection_name_array=['test01'])
results = milvus.search(collection_name='test01', query_records=query_vec_list, top_k=10,params=search_param)
print(results)
print("search cost:",time.time()-t2)
#
# sudo docker run -d --name milvus_cpu \
コード例 #3
0
def main():
    milvus = Milvus(uri=uri)
    param = {
        'collection_name': collection_name,
        'dimension': _DIM,
        'index_file_size': 32,
        #'metric_type': MetricType.IP
        'metric_type': MetricType.L2
    }
    # show collections in Milvus server
    _, collections = milvus.list_collections()

    # 创建 collection
    milvus.create_collection(param)
    # 创建 collection partion
    milvus.create_partition(collection_name, partition_tag)

    print(f'collections in Milvus: {collections}')
    # Describe demo_collection
    _, collection = milvus.get_collection_info(collection_name)
    print(f'descript demo_collection: {collection}')

    # build fake vectors
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(10)]
    vectors1 = [[random.random() for _ in range(_DIM)] for _ in range(10)]

    status, id = milvus.insert(collection_name=collection_name,
                               records=vectors,
                               ids=list(range(10)),
                               partition_tag=partition_tag)
    print(f'status: {status} | id: {id}')
    if not status.OK():
        print(f"insert failded: {status}")

    status1, id1 = milvus.insert(collection_name=collection_name,
                                 records=vectors1,
                                 ids=list(range(10, 20)),
                                 partition_tag=partition_tag)
    print(f'status1: {status1} | id1: {id1}')

    ids_deleted = list(range(10))

    status_delete = milvus.delete_entity_by_id(collection_name=collection_name,
                                               id_array=ids_deleted)
    if status_delete.OK():
        print(f'delete successful')

    # Flush collection insered data to disk
    milvus.flush([collection_name])
    # Get demo_collection row count
    status, result = milvus.count_entities(collection_name)
    print(f"demo_collection row count: {result}")

    # Obtain raw vectors by providing vector ids
    status, result_vectors = milvus.get_entity_by_id(collection_name,
                                                     list(range(10, 20)))

    # create index of vectors, search more repidly
    index_param = {'nlist': 2}

    # create ivflat index in demo_collection
    status = milvus.create_index(collection_name, IndexType.IVF_FLAT,
                                 index_param)
    if status.OK():
        print(f"create index ivf_flat succeeed")

    # use the top 10 vectors for similarity search
    query_vectors = vectors1[0:2]

    # execute vector similariy search
    search_param = {"nprobe": 16}

    param = {
        'collection_name': collection_name,
        'query_records': query_vectors,
        'top_k': 1,
        'params': search_param
    }

    status, results = milvus.search(**param)
    if status.OK():
        if results[0][0].distance == 0.0:
            print('query result is correct')
        else:
            print('not correct')
        print(results)
    else:
        print(f'search failed: {status}')

    # 清除已经存在的collection
    milvus.drop_collection(collection_name=collection_name)

    milvus.close()
コード例 #4
0
ファイル: to_milvus.py プロジェクト: zhoucc/PaddleRec
            "params": {
                "dim": 32
            }
        },
    ],
    "segment_row_limit":
    16384,
    "auto_id":
    False
}

client.create_collection(collection_name, collection_param)
client.create_partition(collection_name, "Movie")

print("--------get collection info--------")
collection = client.get_collection_info(collection_name)
pprint(collection)
partitions = client.list_partitions(collection_name)
print("\n----------list partitions----------")
pprint(partitions)

import codecs
with codecs.open("movie_vectors.txt", "r", encoding='utf-8',
                 errors='ignore') as f:
    lines = f.readlines()

ids = [int(line.split(":")[0]) for line in lines]
embeddings = []
for line in lines:
    line = line.strip().split(":")[1][1:-1]
    str_nums = line.split(",")
コード例 #5
0
def main():
    # Specify server addr when create milvus client instance
    milvus = Milvus(_HOST, _PORT)

    # Create collection demo_collection if it dosen't exist.
    collection_name = 'example_async_collection_'

    status, ok = milvus.has_collection(collection_name)
    if not ok:
        param = {
            'collection_name': collection_name,
            'dimension': _DIM,
            'index_file_size': 128,  # optional
            'metric_type': MetricType.L2  # optional
        }

        status = milvus.create_collection(param)
        if not status.OK():
            print("Create collection failed: {}".format(status.message), file=sys.stderr)
            print("exiting ...", file=sys.stderr)
            sys.exit(1)

    # Show collections in Milvus server
    _, collections = milvus.list_collections()

    # Describe demo_collection
    _, collection = milvus.get_collection_info(collection_name)
    print(collection)

    # 10000 vectors with 16 dimension
    # element per dimension is float32 type
    # vectors should be a 2-D array
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(100000)]

    # You can also use numpy to generate random vectors:
    #     `vectors = np.random.rand(10000, 16).astype(np.float32)`

    def _insert_callback(status, ids):
        if status.OK():
            print("Insert successfully")
        else:
            print("Insert failed.", status.message)

    # Insert vectors into demo_collection, adding callback function
    insert_future = milvus.insert(collection_name=collection_name, records=vectors, _async=True,
                                  _callback=_insert_callback)
    # Or invoke result() to get results:
    #   insert_future = milvus.insert(collection_name=collection_name, records=vectors, _async=True)
    #   status, ids = insert_future.result()

    insert_future.done()

    # Flush collection  inserted data to disk.
    def _flush_callback(status):
        if status.OK():
            print("Flush successfully")
        else:
            print("Flush failed.", status.message)

    flush_future = milvus.flush([collection_name], _async=True, _callback=_flush_callback)
    # Or invoke result() to get results:
    #   flush_future = milvus.flush([collection_name], _async=True)
    #   status = flush_future.result()
    flush_future.done()

    def _compact_callback(status):
        if status.OK():
            print("Compact successfully")
        else:
            print("Compact failed.", status.message)

    compact_furure = milvus.compact(collection_name, _async=True, _cakkback=_compact_callback)
    # Or invoke result() to get results:
    #   compact_future = milvus.compact(collection_name, _async=True)
    #   status = compact_future.result()
    compact_furure.done()

    # Get demo_collection row count
    status, result = milvus.count_entities(collection_name)

    # present collection info
    _, info = milvus.get_collection_stats(collection_name)
    print(info)

    # create index of vectors, search more rapidly
    index_param = {
        'nlist': 2048
    }

    def _index_callback(status):
        if status.OK():
            print("Create index successfully")
        else:
            print("Create index failed.", status.message)

    # Create ivflat index in demo_collection
    # You can search vectors without creating index. however, Creating index help to
    # search faster
    print("Creating index: {}".format(index_param))
    index_future = milvus.create_index(collection_name, IndexType.IVF_FLAT, index_param, _async=True,
                                       _callback=_index_callback)
    # Or invoke result() to get results:
    #   index_future = milvus.create_index(collection_name, IndexType.IVF_FLAT, index_param, _async=True)
    #   status = index_future.result()
    index_future.done()

    # describe index, get information of index
    status, index = milvus.get_index_info(collection_name)
    print(index)

    # Use the top 10 vectors for similarity search
    query_vectors = vectors[0:10]

    # execute vector similarity search
    search_param = {
        "nprobe": 16
    }

    print("Searching ... ")

    def _search_callback(status, results):
        # if status.OK():
        #     print("Search successfully")
        # else:
        #     print("Search failed.", status.message)
        if status.OK():
            # indicate search result
            # also use by:
            #   `results.distance_array[0][0] == 0.0 or results.id_array[0][0] == ids[0]`
            if results[0][0].distance == 0.0:  # or results[0][0].id == ids[0]:
                print('Query result is correct')
            else:
                print('Query result isn\'t correct')

            # print results
            print(results)
        else:
            print("Search failed. ", status)

    param = {
        'collection_name': collection_name,
        'query_records': query_vectors,
        'top_k': 1,
        'params': search_param,
        "_async": True,
        "_callback": _search_callback
    }
    search_future = milvus.search(**param)
    # Or invoke result() to get results:
    #
    #   param = {
    #       'collection_name': collection_name,
    #       'query_records': query_vectors,
    #       'top_k': 1,
    #       'params': search_param,
    #       "_async": True,
    #   }
    #   search_future = milvus.search(param)
    #   status, results = index_future.result()

    search_future.done()

    # Delete demo_collection
    status = milvus.drop_collection(collection_name)
コード例 #6
0
def main():
    milvus = Milvus(_HOST, _PORT)

    num = random.randint(1, 100)
    # Create collection demo_collection if it dosen't exist.
    collection_name = 'example_hybrid_collection_{}'.format(num)

    collection_param = {
        "fields": [{
            "field": "A",
            "type": DataType.INT32
        }, {
            "field": "B",
            "type": DataType.INT32
        }, {
            "field": "C",
            "type": DataType.INT64
        }, {
            "field": "Vec",
            "type": DataType.FLOAT_VECTOR,
            "params": {
                "dimension": 128,
                "metric_type": "L2"
            }
        }],
        "segment_size":
        100
    }
    milvus.create_collection(collection_name, collection_param)

    # milvus.create_partition(collection_name, "p_01", timeout=1800)
    # pars = milvus.list_partitions(collection_name)
    # ok = milvus.has_partition(collection_name, "p_01", timeout=1800)
    # assert ok
    # ok = milvus.has_partition(collection_name, "p_02")
    # assert not ok
    # for p in pars:
    #     if p == "_default":
    #         continue
    #     milvus.drop_partition(collection_name, p)

    # milvus.drop_collection(collection_name)
    # sys.exit(0)

    A_list = [random.randint(0, 255) for _ in range(num)]
    vec = [[random.random() for _ in range(128)] for _ in range(num)]
    hybrid_entities = [{
        "field": "A",
        "values": A_list,
        "type": DataType.INT32
    }, {
        "field": "B",
        "values": A_list,
        "type": DataType.INT32
    }, {
        "field": "C",
        "values": A_list,
        "type": DataType.INT64
    }, {
        "field": "Vec",
        "values": vec,
        "type": DataType.FLOAT_VECTOR
    }]

    ids = milvus.insert(collection_name, hybrid_entities)
    milvus.flush([collection_name])
    print("Flush ... ")
    time.sleep(3)

    milvus.delete_entity_by_id(collection_name, ids[:10])

    # print("Get entity be id start ...... ")
    # entities = milvus.get_entity_by_id(collection_name, ids[:1])
    # et = entities.dict()

    print("Create index ......")
    milvus.create_index(collection_name, "Vec", "ivf_flat", {
        "index_type": "IVF_FLAT",
        "nlist": 100
    })
    print("Create index done.")

    info = milvus.get_collection_info(collection_name)
    query_hybrid = {
        "bool": {
            "must": [{
                "term": {
                    "A": [1, 2, 5]
                }
            }, {
                "range": {
                    "B": {
                        "GT": 1,
                        "LT": 100
                    }
                }
            }, {
                "vector": {
                    "Vec": {
                        "topk": 10,
                        "query": vec[:1],
                        "params": {
                            "nprobe": 10
                        }
                    }
                }
            }],
        },
    }

    # print("Start searach ..", flush=True)
    # results = milvus.search(collection_name, query_hybrid)
    # print(results)
    #
    # for r in list(results):
    #     print("ids", r.ids)
    #     print("distances", r.distances)

    results = milvus.search(collection_name, query_hybrid, fields=["B"])
    for r in list(results):
        print("ids", r.ids)
        print("distances", r.distances)
        for rr in r:
            print(rr.entity.get("B"))

    # for result in results:
    #     for r in result:
    #         print(f"{r}")

    # itertor entity id
    # for result in results:
    #     for r in result:
    #         # get distance
    #         dis = r.distance
    #         id_ = r.id
    #         # obtain all field name
    #         fields = r.entity.fields
    #         for f in fields:
    #             # get field value by field name
    #             # fv = r.entity.
    #             fv = r.entity.value_of_field(f)
    #             print(fv)

    milvus.drop_collection(collection_name)
コード例 #7
0
ファイル: example_partition.py プロジェクト: zengxy/pymilvus
def main():
    # Connect to Milvus server
    # You may need to change _HOST and _PORT accordingly
    param = {'host': _HOST, 'port': _PORT}

    # You can create a instance specified server addr and
    # invoke rpc method directly
    client = Milvus(**param)
    # Create collection demo_collection if it dosen't exist.
    collection_name = 'demo_partition_collection'
    partition_tag = "random"

    # create collection
    param = {
        'collection_name': collection_name,
        'dimension': _DIM,
        'index_file_size': _INDEX_FILE_SIZE,  # optional
        'metric_type': MetricType.L2  # optional
    }

    client.create_collection(param)

    # Show collections in Milvus server
    _, collections = client.list_collections()

    # Describe collection
    _, collection = client.get_collection_info(collection_name)
    print(collection)

    # create partition
    client.create_partition(collection_name, partition_tag=partition_tag)
    # display partitions
    _, partitions = client.list_partitions(collection_name)

    # 10000 vectors with 16 dimension
    # element per dimension is float32 type
    # vectors should be a 2-D array
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(10000)]
    # You can also use numpy to generate random vectors:
    #     `vectors = np.random.rand(10000, 16).astype(np.float32).tolist()`

    # Insert vectors into partition of collection, return status and vectors id list
    status, ids = client.insert(collection_name=collection_name, records=vectors, partition_tag=partition_tag)

    # Wait for 6 seconds, until Milvus server persist vector data.
    time.sleep(6)

    # Get demo_collection row count
    status, num = client.count_entities(collection_name)

    # create index of vectors, search more rapidly
    index_param = {
        'nlist': 2048
    }

    # Create ivflat index in demo_collection
    # You can search vectors without creating index. however, Creating index help to
    # search faster
    status = client.create_index(collection_name, IndexType.IVF_FLAT, index_param)

    # describe index, get information of index
    status, index = client.get_index_info(collection_name)
    print(index)

    # Use the top 10 vectors for similarity search
    query_vectors = vectors[0:10]

    # execute vector similarity search, search range in partition `partition1`
    search_param = {
        "nprobe": 10
    }

    param = {
        'collection_name': collection_name,
        'query_records': query_vectors,
        'top_k': 1,
        'partition_tags': ["random"],
        'params': search_param
    }
    status, results = client.search(**param)

    if status.OK():
        # indicate search result
        # also use by:
        #   `results.distance_array[0][0] == 0.0 or results.id_array[0][0] == ids[0]`
        if results[0][0].distance == 0.0 or results[0][0].id == ids[0]:
            print('Query result is correct')
        else:
            print('Query result isn\'t correct')

    # print results
    print(results)

    # Drop partition. You can also invoke `drop_collection()`, so that all of partitions belongs to
    # designated collections will be deleted.
    status = client.drop_partition(collection_name, partition_tag)

    # Delete collection. All of partitions of this collection will be dropped.
    status = client.drop_collection(collection_name)
コード例 #8
0
ファイル: ingestEmbeddings.py プロジェクト: JC-R/tiering-t5
class MyMilvus():

    def __init__(self, name, host, port, collection_param):
        self.host = host
        self.port = port
        self.client = Milvus(host, port)
        self.collection_name = name
        self.collection_param = collection_param

        # self.collection_param = {
        #     "fields": [
        #         {"name": "release_year", "type": DataType.INT32},
        #         {"name": "embedding", "type": DataType.FLOAT_VECTOR, "params": {"dim": 8}},
        #     ],
        #     "segment_row_limit": 4096,
        #     "auto_id": False
        # }

    def create_collection(self):
        if self.collection_name not in self.client.list_collections():
            # self.client.drop_collection(self.collection_name)
            self.client.create_collection(self.collection_name, self.collection_param)

        # ------
        # Basic create index:
        #     Now that we have a collection in Milvus with `segment_row_limit` 4096, we can create index or
        #     insert entities.
        #
        #     We can call `create_index` BEFORE we insert any entities or AFTER. However Milvus won't actually
        #     start build index task if the segment row count is smaller than `segment_row_limit`. So if we want
        #     to make Milvus build index, we need to insert number of entities larger than `segment_row_limit`.
        #
        #     We are going to use data in `films.csv` so you can checkout the structure. And we need to group
        #     data with same fields together, so here is a example of how we obtain the data in files and transfer
        #     them into what we need.
        # ------

        ids = []  # ids
        titles = []  # titles
        release_years = []  # release year
        embeddings = []  # embeddings
        films = []
        with open('films.csv', 'r') as csvfile:
            reader = csv.reader(csvfile)
            films = [film for film in reader]
        for film in films:
            ids.append(int(film[0]))
            titles.append(film[1])
            release_years.append(int(film[2]))
            embeddings.append(list(map(float, film[3][1:][:-1].split(','))))

        hybrid_entities = [
            {"name": "release_year", "values": release_years, "type": DataType.INT32},
            {"name": "embedding", "values": embeddings, "type": DataType.FLOAT_VECTOR},
        ]

        # ------
        # Basic insert:
        #     After preparing the data, we are going to insert them into our collection.
        #     The number of films inserted should be 8657.
        # ------
        ids = self.client.insert(self.collection_name, hybrid_entities, ids)
        self.client.flush([self.collection_name])
        after_flush_counts = self.client.count_entities(self.collection_name)
        print(" > There are {} films in collection `{}` after flush".format(after_flush_counts, self.collection_name))

        # ------
        # Basic create index:
        #     Now that we have inserted all the films into Milvus, we are going to build index with these data.
        #
        #     While building index, we have to indicate which `field` to build index for, the `index_type`,
        #     `metric_type` and params for the specific index type. In our case, we want to build a `IVF_FLAT`
        #     index, so the specific params are "nlist". See pymilvus documentation
        #     (https://milvus-io.github.io/milvus-sdk-python/pythondoc/v0.3.0/index.html) for `index_type` we
        #     support and the params accordingly.
        #
        #     If there are already index for a collection and you call `create_index` with different params, the
        #     older index will be replaced by new one.
        # ------
        self.client.create_index(self.collection_name, "embedding",
                                 {"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 100}})

        # ------
        # Basic create index:
        #     We can get the detail of the index  by `get_collection_info`.
        # ------
        info = self.client.get_collection_info(self.collection_name)
        pprint(info)

        # ------
        # Basic hybrid search entities:
        #     If we want to use index, the specific index params need to be provided, in our case, the "params"
        #     should be "nprobe", if no "params" given, Milvus will complain about it and raise a exception.
        # ------
        # query_embedding = [random.random() for _ in range(8)]
        # query_hybrid = {
        #     "bool": {
        #         "must": [
        #             {
        #                 "term": {"release_year": [2002, 1995]}
        #             },
        #             {
        #                 "vector": {
        #                     "embedding": {"topk": 3,
        #                                   "query": [query_embedding],
        #                                   "metric_type": "L2",
        #                                   "params": {"nprobe": 8}}
        #                 }
        #             }
        #         ]
        #     }
        # }

        # ------
        # Basic hybrid search entities
        # ------
        # results = client.search(collection_name, query_hybrid, fields=["release_year", "embedding"])
        # for entities in results:
        #     for topk_film in entities:
        #         current_entity = topk_film.entity
        #         print("==")
        #         print("- id: {}".format(topk_film.id))
        #         print("- title: {}".format(titles[topk_film.id]))
        #         print("- distance: {}".format(topk_film.distance))
        #
        #         print("- release_year: {}".format(current_entity.release_year))
        #         print("- embedding: {}".format(current_entity.embedding))

        # ------
        # Basic delete index:
        #     You can drop index for a field.
        # ------
        self.client.drop_index(self.collection_name, "embedding")

        if self.collection_name in self.client.list_collections():
            self.client.drop_collection(self.collection_name)