def create_collections_and_insert_data():
    """Create one collection per entry in ``all_index_types`` and insert test data.

    Each collection gets an INT64 primary key, a DOUBLE scalar and a 128-dim
    float vector field; 3000 rows are inserted and the entity count is
    printed with timing. Assumes a pymilvus connection is already open and
    that ``all_index_types`` and the pymilvus schema helpers are in scope.
    """
    import random
    import time
    dim = 128
    default_fields = [
        FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="random_value", dtype=DataType.DOUBLE),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]
    default_schema = CollectionSchema(fields=default_fields,
                                      description="test collection")
    print(f"\nList collections...")
    print(list_collections())
    # one collection per index type; the collection name IS the index type
    for col_name in all_index_types:
        print(f"\nCreate collection...")
        collection = Collection(name=col_name, schema=default_schema)
        # insert data
        nb = 3000
        vectors = [[i / nb for _ in range(dim)] for i in range(nb)]
        collection.insert(
            [[i for i in range(nb)],
             [float(random.randrange(-20, -10)) for _ in range(nb)], vectors])
        print(f"collection name: {col_name}")
        print("Get collection entities")
        start_time = time.time()
        # timing how long fetching the entity count takes
        print(f"collection entities: {collection.num_entities}")
        end_time = time.time()
        print("Get collection entities time = %.4fs" % (end_time - start_time))
    # NOTE(review): the collapsed source does not show indentation — the
    # final listing is assumed to sit outside the loop; confirm.
    print(f"\nList collections...")
    print(list_collections())
def test_auto_load_balance(self):
    """Install Milvus, run health checkers, scale the cluster up, and verify
    checker statistics and (logged) sealed-segment distribution.

    Steps: install a fresh release and connect; start background checkers
    for create/insert/flush/index/search/query; wait, log each collection's
    sealed-segment distribution per query node, and assert checker stats;
    scale up, reset counters, repeat the distribution logging and stats
    assert; finally assert all recorded expectations.
    """
    log.info(f"start to install milvus")
    release_name, host, port = install_milvus(
        "test-auto-load-balance")  # todo add release name
    self.release_name = release_name
    assert host is not None
    conn = connections.connect("default", host=host, port=port)
    assert conn is not None
    # one checker per operation type, driven by background monitor threads
    self.health_checkers = {
        Op.create: CreateChecker(),
        Op.insert: InsertFlushChecker(),
        Op.flush: InsertFlushChecker(flush=True),
        Op.index: IndexChecker(),
        Op.search: SearchChecker(),
        Op.query: QueryChecker()
    }
    cc.start_monitor_threads(self.health_checkers)
    # wait
    sleep(constants.WAIT_PER_OP * 10)
    all_collections = list_collections()
    for c in all_collections:
        seg_info = utility.get_query_segment_info(c)
        seg_distribution = cf.get_segment_distribution(seg_info)
        for k in seg_distribution.keys():
            log.info(
                f"collection {c}'s segment distribution in node {k} is {seg_distribution[k]['sealed']}"
            )
    # first assert
    log.info("first assert")
    assert_statistic(self.health_checkers)

    # scale up
    log.info("scale up milvus")
    scale_up_milvus(self.release_name)
    # reset counting
    cc.reset_counting(self.health_checkers)
    sleep(constants.WAIT_PER_OP * 10)
    all_collections = list_collections()
    for c in all_collections:
        seg_info = utility.get_query_segment_info(c)
        seg_distribution = cf.get_segment_distribution(seg_info)
        for k in seg_distribution.keys():
            log.info(
                f"collection {c}'s sealed segment distribution in node {k} is {seg_distribution[k]['sealed']}"
            )
    # second assert
    log.info("second assert")
    assert_statistic(self.health_checkers)
    # TODO assert segment distribution

    # assert all expectations
    assert_expectations()
def filter_collections_by_prefix(prefix):
    """Return the names of all collections whose name starts with *prefix*."""
    return [name for name in list_collections() if name.startswith(prefix)]
def test_partition():
    """End-to-end smoke test of the partition API: create, insert, load,
    search, release, drop.

    Assumes `gen_unique_str`, `gen_default_fields`, `gen_data`, `default_nb`
    and `has_partition` are provided by the surrounding test utilities.
    """
    connections.connect(alias="default")
    print("create collection")
    collection = Collection(name=gen_unique_str(), schema=gen_default_fields())
    print("create partition")
    partition = Partition(collection, name=gen_unique_str())
    print(list_collections())
    assert has_partition(collection.name, partition.name) is True

    data = gen_data(default_nb)
    print("insert data to partition")
    partition.insert(data)
    assert partition.is_empty is False
    assert partition.num_entities == default_nb

    print("load partition")
    partition.load()
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    print("search partition")
    # data[2] is assumed to be the vector column — TODO confirm against gen_data
    res = partition.search(data[2][-2:], "float_vector", search_params, topK,
                           "count > 100")
    for hits in res:
        for hit in hits:
            print(hit)

    print("release partition")
    partition.release()
    print("drop partition")
    partition.drop()
    print("drop collection")
    collection.drop()
def get_collections():
    """Print every collection name together with its entity count."""
    print(f"\nList collections...")
    names = list_collections()
    print(f"collections_nums: {len(names)}")
    # list entities if collections
    for name in names:
        entity_count = Collection(name=name).num_entities
        print(f"{name}: {entity_count}")
def create_index():
    """Build a vector index on every collection.

    Each collection's name is treated as the index type to build; the
    matching parameters come from ``index_params_map``. Binary index types
    get a HAMMING metric instead of L2.
    """
    # template copied for each collection before customisation
    base_index = {
        "index_type": "IVF_FLAT",
        "params": {
            "nlist": 128
        },
        "metric_type": "L2"
    }
    print(f"\nCreate index...")
    for name in list_collections():
        collection = Collection(name=name)
        print(name)
        print(collection)
        index = copy.deepcopy(base_index)
        index["index_type"] = name
        index["params"] = index_params_map[name]
        # binary-vector index types require a binary metric
        if name in ("BIN_FLAT", "BIN_IVF_FLAT"):
            index["metric_type"] = "HAMMING"
        collection.create_index(field_name="float_vector", index_params=index)
def load_and_search():
    """Load every collection and run one timed vector search against each.

    Search parameters are derived from the collection name via
    ``gen_search_param``; results and their ``random_value`` field are
    printed, then the collection is released.
    """
    print("search data starts")
    for name in list_collections():
        collection = Collection(name=name)
        print(f"collection name: {name}")
        collection.load()
        top_k = 5
        query_vectors = [[0.0 for _ in range(128)] for _ in range(3000)]
        # the collection name doubles as the index type here
        params = gen_search_param(name)[0]
        print(params)
        import time
        t_start = time.time()
        print(f"\nSearch...")
        # define output_fields of search result
        res = collection.search(query_vectors[:1],
                                "float_vector",
                                params,
                                top_k,
                                "count > 500",
                                output_fields=["count", "random_value"],
                                timeout=20)
        t_end = time.time()
        # show result
        for hits in res:
            for hit in hits:
                # Get value of the random value field for search result
                print(hit, hit.entity.get("random_value"))
            print(hits.ids)
            print("###########")
        print("search latency = %.4fs" % (t_end - t_start))
        collection.release()
    print("search data ends")
def save_all_checker_collections(host="127.0.0.1", prefix="Checker"):
    """Select a bounded sample of collections per name prefix and dump it to JSON.

    Connects to Milvus at *host*, keeps the collection names containing
    *prefix* (all names when *prefix* is None), groups them by their leading
    "<word>_" token, caps each group, and writes the selection to
    /tmp/ci_logs/all_collections.json as {"all": [...]}.
    """
    # create connection
    connections.connect(host=host, port="19530")
    all_collections = list_collections()
    # original made a pointless full copy when prefix was None; only the
    # non-None case needs filtering
    if prefix is not None:
        all_collections = [
            c_name for c_name in all_collections if prefix in c_name
        ]
    grouped = defaultdict(list)
    for c_name in all_collections:
        # local name instead of rebinding (shadowing) the `prefix` parameter
        group_key = c_name.split("_")[0]
        # NOTE(review): `<= 10` admits 11 names per group — if the intended
        # cap is exactly 10, this should be `< 10`; behavior kept as-is.
        if len(grouped[group_key]) <= 10:
            grouped[group_key].append(c_name)
    selected_collections = []
    for group in grouped.values():
        selected_collections.extend(group)
    data = {"all": selected_collections}
    print("selected_collections is")
    print(selected_collections)
    with open("/tmp/ci_logs/all_collections.json", "w") as f:
        f.write(json.dumps(data))
def hello_milvus(host="127.0.0.1"):
    """Full pymilvus walkthrough against *host*: create the ``hello_milvus``
    collection (int64 pk, float, varchar, 128-dim vector), insert 3000 rows,
    build an IVF_SQ8 index, load it honouring the current replica count,
    search, and run a scalar query.

    Relies on module-level `connections`, schema helpers, `np`, `random`
    and the `TIMEOUT` constant.
    """
    import time
    # create connection
    connections.connect(host=host, port="19530")
    print(f"\nList collections...")
    print(list_collections())
    # create collection
    dim = 128
    default_fields = [
        FieldSchema(name="int64", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="float", dtype=DataType.FLOAT),
        FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]
    default_schema = CollectionSchema(fields=default_fields,
                                      description="test collection")
    print(f"\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=default_schema)
    print(f"\nList collections...")
    print(list_collections())

    # insert data
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    t0 = time.time()
    collection.insert([[i for i in range(nb)],
                       [np.float32(i) for i in range(nb)],
                       [str(i) for i in range(nb)], vectors])
    t1 = time.time()
    print(f"\nInsert {nb} vectors cost {t1 - t0:.4f} seconds")

    t0 = time.time()
    print(f"\nGet collection entities...")
    print(collection.num_entities)
    t1 = time.time()
    print(f"\nGet collection entities cost {t1 - t0:.4f} seconds")

    # create index and load table
    default_index = {
        "index_type": "IVF_SQ8",
        "metric_type": "L2",
        "params": {
            "nlist": 64
        }
    }
    print(f"\nCreate index...")
    t0 = time.time()
    collection.create_index(field_name="float_vector",
                            index_params=default_index)
    t1 = time.time()
    print(f"\nCreate index cost {t1 - t0:.4f} seconds")

    print("\nGet replicas number")
    try:
        replicas_info = collection.get_replicas()
        replica_number = len(replicas_info.groups)
        print(f"\nReplicas number is {replica_number}")
    except Exception as e:
        # replica info unavailable: print the error and fall back to 1
        print(str(e))
        replica_number = 1

    print(f"\nload collection...")
    t0 = time.time()
    collection.load(replica_number=replica_number)
    t1 = time.time()
    print(f"\nload collection cost {t1 - t0:.4f} seconds")

    # load and search
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    t0 = time.time()
    print(f"\nSearch...")
    # define output_fields of search result
    res = collection.search(vectors[-2:], "float_vector", search_params, topK,
                            "int64 > 100", output_fields=["int64", "float"],
                            timeout=TIMEOUT)
    t1 = time.time()
    print(f"search cost {t1 - t0:.4f} seconds")
    # show result
    for hits in res:
        for hit in hits:
            # Get value of the random value field for search result
            print(hit, hit.entity.get("float"))

    # query
    expr = "int64 in [2,4,6,8]"
    output_fields = ["int64", "float"]
    res = collection.query(expr, output_fields, timeout=TIMEOUT)
    sorted_res = sorted(res, key=lambda k: k['int64'])
    for r in sorted_res:
        print(r)
# NOTE(review): the two leading statements reference `res`, which is not
# defined at this level — they look like the tail of a preceding function
# duplicated at a chunk boundary; confirm against the full file.
sorted_res = sorted(res, key=lambda k: k['int64'])
for r in sorted_res:
    print(r)

# script entry: pick a bounded sample of Checker collections to verify
parser = argparse.ArgumentParser(description='host ip')
parser.add_argument('--host', type=str, default='127.0.0.1', help='host ip')
args = parser.parse_args()
# add time stamp
print(
    f"\nStart time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}"
)
# create connection
connections.connect(host=args.host, port="19530")
print("\nList collections...")
all_collections = list_collections()
print(all_collections)
# keep only checker-generated collections
all_collections = [c_name for c_name in all_collections if "Checker" in c_name]
m = defaultdict(list)
for c_name in all_collections:
    prefix = c_name.split("_")[0]
    # cap each prefix group (`<= 5` admits up to 6 names per group)
    if len(m[prefix]) <= 5:
        m[prefix].append(c_name)
selected_collections = []
for v in m.values():
    selected_collections.extend(v)
print("selected_collections is")
print(selected_collections)
cnt = 0  # NOTE(review): unused within this chunk — likely used past the cut
for collection_name in selected_collections:
    print(f"check collection {collection_name}")
    # NOTE(review): loop body appears to continue beyond this chunk
def hello_milvus(host="127.0.0.1"):
    """pymilvus walkthrough against *host*: create the ``hello_milvus``
    collection (INT64 pk, DOUBLE scalar, 128-dim vector), insert 3000 rows,
    build an IVF_FLAT index, load, run a timed search, and finish with a
    scalar query on the primary key.
    """
    import time
    # create connection
    connections.connect(host=host, port="19530")
    print(f"\nList collections...")
    print(list_collections())
    # create collection
    dim = 128
    default_fields = [
        FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="random_value", dtype=DataType.DOUBLE),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]
    default_schema = CollectionSchema(fields=default_fields,
                                      description="test collection")
    print(f"\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=default_schema)
    print(f"\nList collections...")
    print(list_collections())

    # insert data
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    t0 = time.time()
    collection.insert([[i for i in range(nb)],
                       [float(random.randrange(-20, -10)) for _ in range(nb)],
                       vectors])
    t1 = time.time()
    print(f"\nInsert {nb} vectors cost {t1 - t0} seconds")

    t0 = time.time()
    print(f"\nGet collection entities...")
    print(collection.num_entities)
    t1 = time.time()
    print(f"\nGet collection entities cost {t1 - t0} seconds")

    # create index and load table
    default_index = {
        "index_type": "IVF_FLAT",
        "params": {
            "nlist": 128
        },
        "metric_type": "L2"
    }
    print(f"\nCreate index...")
    t0 = time.time()
    collection.create_index(field_name="float_vector",
                            index_params=default_index)
    t1 = time.time()
    print(f"\nCreate index cost {t1 - t0} seconds")
    print(f"\nload collection...")
    t0 = time.time()
    collection.load()
    t1 = time.time()
    print(f"\nload collection cost {t1 - t0} seconds")

    # load and search
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    start_time = time.time()
    print(f"\nSearch...")
    # define output_fields of search result
    res = collection.search(vectors[-2:], "float_vector", search_params, topK,
                            "count > 100",
                            output_fields=["count", "random_value"])
    end_time = time.time()
    # show result
    for hits in res:
        for hit in hits:
            # Get value of the random value field for search result
            print(hit, hit.entity.get("random_value"))
    print("search latency = %.4fs" % (end_time - start_time))

    # query
    expr = "count in [2,4,6,8]"
    output_fields = ["count", "random_value"]
    res = collection.query(expr, output_fields)
    sorted_res = sorted(res, key=lambda k: k['count'])
    for r in sorted_res:
        print(r)
# NOTE(review): this leading query block references `collection` and
# `TIMEOUT`, which are not defined at this level — it appears to be the
# tail of a function cut at a chunk boundary; confirm against the full file.
expr = "int64 in [2,4,6,8]"
output_fields = ["int64", "float"]
res = collection.query(expr, output_fields, timeout=TIMEOUT)
sorted_res = sorted(res, key=lambda k: k['int64'])
for r in sorted_res:
    print(r)
collection.release()

# script entry: run hello_milvus against a sample of CreateChecker collections
parser = argparse.ArgumentParser(description='host ip')
parser.add_argument('--host', type=str, default='10.96.77.209', help='host ip')
args = parser.parse_args()
# add time stamp
print(
    f"\nStart time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}"
)
# create connection
connections.connect(host=args.host, port="19530")
print(f"\nList collections...")
collection_list = list_collections()
print(collection_list)
# keep 10 collections with prefix "CreateChecker_", others will be skipped
cnt = 0
for collection_name in collection_list:
    if collection_name.startswith("CreateChecker_"):
        cnt += 1
    # NOTE(review): this skips CreateChecker collections beyond the first
    # ten but still processes every non-CreateChecker name — confirm intent.
    if collection_name.startswith("CreateChecker_") and cnt > 10:
        continue
    print(f"check collection {collection_name}")
    # NOTE(review): the collection name is passed as hello_milvus's first
    # argument — verify the hello_milvus in this script takes a name, not a host.
    hello_milvus(collection_name)
import random

from pymilvus import (connections, list_collections, FieldSchema,
                      CollectionSchema, DataType, Collection, Partition,
                      utility)

# script: create the demo_film_tutorial collection on the default connection
# configure milvus hostname and port
print(f"\nCreate connection...")
connections.connect()

# List all collection names
print(f"\nList collections...")
print(list_collections())

# Create a collection named 'demo_film_tutorial'
print(f"\nCreate collection...")
field1 = FieldSchema(name="release_year",
                     dtype=DataType.INT64,
                     description="int64",
                     is_primary=True)
field2 = FieldSchema(name="embedding",
                     dtype=DataType.FLOAT_VECTOR,
                     description="float vector",
                     dim=8,
                     is_primary=False)
schema = CollectionSchema(fields=[field1, field2],
                          description="collection description")
collection = Collection(name='demo_film_tutorial', data=None, schema=schema)

# List all collection names
print(f"\nList collections...")
def hello_milvus():
    """Self-contained demo on the default connection: create ``hello_milvus``
    (INT64 pk, DOUBLE scalar, 128-dim vector), insert 3000 rows, build an
    IVF_FLAT index, load, run a timed search, then drop the collection.
    """
    # create connection
    connections.connect()
    print(f"\nList collections...")
    print(list_collections())
    # create collection
    dim = 128
    default_fields = [
        FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="random_value", dtype=DataType.DOUBLE),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]
    default_schema = CollectionSchema(fields=default_fields,
                                      description="test collection")
    print(f"\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=default_schema)
    print(f"\nList collections...")
    print(list_collections())

    # insert data
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    collection.insert([[i for i in range(nb)],
                       [float(random.randrange(-20, -10)) for _ in range(nb)],
                       vectors])
    print(f"\nGet collection entities...")
    print(collection.num_entities)

    # create index and load table
    default_index = {
        "index_type": "IVF_FLAT",
        "params": {
            "nlist": 128
        },
        "metric_type": "L2"
    }
    print(f"\nCreate index...")
    collection.create_index(field_name="float_vector",
                            index_params=default_index)
    print(f"\nload collection...")
    collection.load()

    # load and search
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    import time
    start_time = time.time()
    print(f"\nSearch...")
    # define output_fields of search result
    res = collection.search(vectors[-2:], "float_vector", search_params, topK,
                            "count > 100",
                            output_fields=["count", "random_value"])
    end_time = time.time()
    # show result
    for hits in res:
        for hit in hits:
            # Get value of the random value field for search result
            print(hit, hit.entity.get("random_value"))
    print("search latency = %.4fs" % (end_time - start_time))

    # drop collection
    collection.drop()