class HNSW(ANN):
    """
    ANN backend implemented on top of the hnswlib library.
    """

    def load(self, path):
        """
        Creates an index object from the stored configuration and reads its
        contents from path.

        Args:
            path: location of a previously saved index
        """

        model = Index(dim=self.config["dimensions"], space=self.config["metric"])
        self.model = model
        model.load_index(path)

    def index(self, embeddings):
        """
        Builds a new index from embeddings, replacing any existing model.

        Args:
            embeddings: 2D array of vectors, one row per item
        """

        # On normalized vectors the inner product is the cosine similarity
        self.config["metric"] = "ip"

        # Resolve build settings, falling back to the defaults given here
        settings = {
            "ef_construction": self.setting("efconstruction", 200),
            "M": self.setting("m", 16),
            "random_seed": self.setting("randomseed", 100),
        }

        # Create the index sized to hold exactly the rows provided
        model = Index(dim=self.config["dimensions"], space=self.config["metric"])
        self.model = model

        total = embeddings.shape[0]
        model.init_index(max_elements=total, **settings)

        # Rows are labeled with their position: 0..total-1
        model.add_items(embeddings, np.arange(total))

        # Next free id and number of deleted entries
        self.config["offset"] = total
        self.config["deletes"] = 0

    def append(self, embeddings):
        """
        Adds embeddings to the existing index, continuing the id sequence.

        Args:
            embeddings: 2D array of vectors, one row per item
        """

        added = embeddings.shape[0]
        start = self.config["offset"]
        end = start + added

        # Grow the index so the new rows fit
        self.model.resize_index(end)

        # New rows receive the ids start..end-1
        self.model.add_items(embeddings, np.arange(start, end))

        # Advance the id offset past the rows just added
        self.config["offset"] = end

    def delete(self, ids):
        """
        Marks ids as deleted so they are omitted from search results.

        Args:
            ids: iterable of ids to delete
        """

        for label in ids:
            try:
                self.model.mark_deleted(label)
            except RuntimeError:
                # Label not found in the index, nothing to count
                continue

            self.config["deletes"] += 1

    def search(self, queries, limit):
        """
        Runs a nearest neighbor query for each row in queries.

        Args:
            queries: 2D array of query vectors
            limit: number of results to return per query

        Returns:
            list with one [(id, score)] list per query
        """

        # Only override the query-time ef parameter when it is configured
        efsearch = self.setting("efsearch")
        if efsearch:
            self.model.set_ef(efsearch)

        labels, distances = self.model.knn_query(queries, k=limit)

        # Pair each id with its similarity score (1 - distance)
        return [
            list(zip(row_labels, [1 - value for value in row_distances]))
            for row_labels, row_distances in zip(labels, distances)
        ]

    def count(self):
        """
        Returns the number of elements in the index minus those marked as deleted.
        """

        stored = self.model.get_current_count()
        return stored - self.config["deletes"]

    def save(self, path):
        """
        Writes the index to path.

        Args:
            path: output location
        """

        self.model.save_index(path)
class HNSW(KNNIndex):
    """Approximate k-nearest-neighbor index backed by hnswlib.

    hnswlib is imported lazily, so it is only required when this index is
    actually instantiated.
    """

    VALID_METRICS = [
        "cosine",
        "euclidean",
        "dot",
        "l2",
        "ip",
    ]

    def __init__(self, *args, **kwargs):
        """Check that hnswlib is importable, then defer to the base class.

        Raises
        ------
        ImportError
            If hnswlib is not installed. The message includes install
            instructions and the original error is attached as the cause.
        """
        try:
            from hnswlib import Index  # pylint: disable=unused-import,unused-variable
        except ImportError as err:
            raise ImportError(
                "Please install hnswlib: `conda install -c conda-forge "
                "hnswlib` or `pip install hnswlib`."
            ) from err
        super().__init__(*args, **kwargs)

    def build(self, data, k: int):
        """Build the index from ``data`` and return each point's neighbors.

        Parameters
        ----------
        data
            2D array-like; ``data.shape[0]`` items of dimension
            ``data.shape[1]``.
        k : int
            Number of neighbors to return per point.

        Returns
        -------
        indices, distances
            The result of querying ``k + 1`` neighbors for every row of
            ``data`` with the first column dropped.
        """
        timer = utils.Timer(
            f"Finding {k} nearest neighbors using HNSWlib approximate search using "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        from hnswlib import Index

        # Translate the metric name into the space name hnswlib expects
        hnsw_space = {
            "cosine": "cosine",
            "dot": "ip",
            "euclidean": "l2",
            "ip": "ip",
            "l2": "l2",
        }[self.metric]

        # Only fall back to the default seed when no seed was given at all.
        # A truthiness test (`self.random_state or 100`) would silently
        # replace an explicit seed of 0 with 100.
        random_seed = 100 if self.random_state is None else self.random_state

        self.index = Index(space=hnsw_space, dim=data.shape[1])

        # Initialize HNSW Index
        self.index.init_index(
            max_elements=data.shape[0],
            ef_construction=200,
            M=16,
            random_seed=random_seed,
        )

        # Build index tree from data
        self.index.add_items(data, num_threads=self.n_jobs)

        # Set ef parameter for (ideal) precision/recall
        self.index.set_ef(min(2 * k, self.index.get_current_count()))

        # Query for kNN; one extra neighbor is requested because the first
        # entry is dropped below
        indices, distances = self.index.knn_query(
            data, k=k + 1, num_threads=self.n_jobs
        )

        # Stop timer
        timer.__exit__()

        # Skip the first entry, which is expected to be the point itself
        return indices[:, 1:], distances[:, 1:]

    def query(self, query, k):
        """Find the ``k`` nearest indexed neighbors of each row in ``query``.

        Parameters
        ----------
        query
            2D array-like of query points.
        k
            Number of neighbors to return per query point.

        Returns
        -------
        indices, distances
            The arrays returned by the underlying ``knn_query`` call.
        """
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using HNSWlib "
            f"approximate search...",
            self.verbose,
        )
        timer.__enter__()

        # Set ef parameter for (ideal) precision/recall
        self.index.set_ef(min(2 * k, self.index.get_current_count()))

        # Query for kNN
        indices, distances = self.index.knn_query(
            query, k=k, num_threads=self.n_jobs
        )

        # Stop timer
        timer.__exit__()

        return indices, distances
class HNSW(KNNIndex):
    """Approximate k-nearest-neighbor index backed by hnswlib.

    Instances can be pickled: the hnswlib index is serialized to bytes and
    stored base64-encoded in the pickled state.
    """

    VALID_METRICS = ["cosine", "euclidean", "dot", "l2", "ip"]

    def __init__(self, *args, **kwargs):
        """Verify that hnswlib can be imported, then defer to the base class."""
        try:
            from hnswlib import Index  # pylint: disable=unused-import,unused-variable
        except ImportError:
            message = (
                "Please install hnswlib: `conda install -c conda-forge "
                "hnswlib` or `pip install hnswlib`."
            )
            raise ImportError(message)
        super().__init__(*args, **kwargs)

    def build(self):
        """Build the index from ``self.data`` and return each point's neighbors.

        Returns
        -------
        indices, distances
            The result of querying ``self.k + 1`` neighbors for every row of
            the data with the first column dropped.
        """
        data, k = self.data, self.k
        timer = utils.Timer(
            f"Finding {k} nearest neighbors using HNSWlib approximate search using "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        from hnswlib import Index

        # Metric name -> hnswlib space name
        space_by_metric = dict(
            cosine="cosine", dot="ip", euclidean="l2", ip="ip", l2="l2"
        )
        hnsw_space = space_by_metric[self.metric]

        # Draw an integer seed for hnswlib from the configured random state
        rng = check_random_state(self.random_state)
        seed = rng.randint(np.iinfo(np.int32).max)

        n_samples, n_features = data.shape[0], data.shape[1]

        # Create and initialize the HNSW index
        hnsw_index = Index(space=hnsw_space, dim=n_features)
        self.index = hnsw_index
        hnsw_index.init_index(
            max_elements=n_samples,
            ef_construction=200,
            M=16,
            random_seed=seed,
        )

        # Insert all points
        hnsw_index.add_items(data, num_threads=self.n_jobs)

        # ef controls the precision/recall trade-off of the search
        hnsw_index.set_ef(min(2 * k, hnsw_index.get_current_count()))

        # One extra neighbor is requested because the first entry is dropped
        neighbors, dists = hnsw_index.knn_query(
            data, k=k + 1, num_threads=self.n_jobs
        )

        timer.__exit__()

        # Drop the first entry, which is expected to be the point itself
        return neighbors[:, 1:], dists[:, 1:]

    def query(self, query, k):
        """Find the ``k`` nearest indexed neighbors of each row in ``query``.

        Returns
        -------
        indices, distances
            The arrays returned by the underlying ``knn_query`` call.
        """
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using HNSWlib "
            f"approximate search...",
            self.verbose,
        )
        timer.__enter__()

        hnsw_index = self.index

        # ef controls the precision/recall trade-off of the search
        hnsw_index.set_ef(min(2 * k, hnsw_index.get_current_count()))

        neighbors, dists = hnsw_index.knn_query(
            query, k=k, num_threads=self.n_jobs
        )

        timer.__exit__()

        return neighbors, dists

    def __getstate__(self):
        """Return a picklable copy of the instance state.

        When an index exists it is removed from the state and replaced by a
        ``"b64_index"`` entry holding the base64-encoded saved index.
        """
        import tempfile
        import base64
        from os import path

        state = dict(self.__dict__)

        # Nothing to encode when no index has been built
        if self.index is None:
            return state

        # hnswlib saves to a file, so round-trip through a temporary one
        with tempfile.TemporaryDirectory() as dirname:
            index_file = path.join(dirname, "tmp.bin")
            self.index.save_index(index_file)

            with open(index_file, "rb") as f:
                encoded = base64.b64encode(f.read())

        state["b64_index"] = encoded
        state.pop("index")
        return state

    def __setstate__(self, state):
        """Restore the instance, rebuilding the index from ``"b64_index"``.

        The ``"b64_index"`` entry, if present, is removed from ``state``
        before the remaining entries are copied onto the instance.
        """
        import tempfile
        import base64
        from os import path

        from hnswlib import Index

        # A base64 index means the hnswlib index has to be reloaded
        if "b64_index" in state:
            assert "index" not in state
            encoded = state.pop("b64_index")

            # Translate the metric name to the hnswlib space name when an
            # alias exists; otherwise use the metric as given
            hnsw_aliases = dict(
                cosine="cosine", dot="ip", euclidean="l2", ip="ip", l2="l2"
            )
            hnsw_metric = state["metric"]
            hnsw_metric = hnsw_aliases.get(hnsw_metric, hnsw_metric)

            hnsw_index = Index(space=hnsw_metric, dim=state["data"].data.shape[1])
            self.index = hnsw_index

            # hnswlib loads from a file, so write the bytes to a temporary one
            with tempfile.TemporaryDirectory() as dirname:
                index_file = path.join(dirname, "tmp.bin")
                with open(index_file, "wb") as f:
                    f.write(base64.b64decode(encoded))

                hnsw_index.load_index(index_file)

        self.__dict__.update(state)