class FasttextHnsw:
    """Text similarity index built from word vectors and an HNSW index.

    Every text is embedded as the concatenation of the element-wise mean,
    maximum and minimum of its unit-normalised word vectors, so indexed
    vectors have ``3 * vector_dim`` components.
    """

    def __init__(self, model):
        """Keep the word-vector model and create the (uninitialised) index.

        Args:
            model: Object exposing ``get_dimension()`` and
                ``get_word_vector(word)``.
        """
        self.model = model
        self.vector_dim = model.get_dimension()
        self.hnsw = HnswIndex(space='l2', dim=self.vector_dim * 3)

    def words_to_embeddings(self, words):
        """Return one unit-length vector per word.

        Args:
            words: Sized iterable of words to look up in the model.

        Returns:
            Array of shape ``(len(words), vector_dim)``. Rows whose word
            vector has zero norm are left as all zeros instead of being
            divided by zero (which would fill the row with NaN).
        """
        embeddings = np.zeros((len(words), self.vector_dim))
        for i, w in enumerate(words):
            embeddings[i] = self.model.get_word_vector(w)
            norm = np.linalg.norm(embeddings[i])
            # Guard against 0 / 0: a zero vector has no direction to normalise.
            if norm > 0:
                embeddings[i] /= norm
        return embeddings

    def embed_text(self, text):
        """Embed a text as concatenated mean/max/min of its word vectors.

        Args:
            text: Raw text; it is split into words with ``tokenize``.

        Returns:
            1-D array with ``3 * vector_dim`` components. A text that
            produces no tokens maps to the zero vector, because the
            reductions below are undefined on an empty set of rows.
        """
        words = tokenize(text)
        if len(words) == 0:
            return np.zeros(self.vector_dim * 3)
        norm_vectors = self.words_to_embeddings(words)
        avg_wv = np.mean(norm_vectors, axis=0)
        max_wv = np.max(norm_vectors, axis=0)
        min_wv = np.min(norm_vectors, axis=0)
        return np.concatenate((avg_wv, max_wv, min_wv))

    def build_hnsw(self, texts):
        """Initialise the index for ``len(texts)`` elements and add all texts.

        Args:
            texts: Sized iterable of raw texts; each one is embedded with
                ``embed_text`` and stored in input order.
        """
        n = len(texts)
        self.hnsw.init_index(max_elements=n, ef_construction=100, M=16)
        embeddings = np.zeros((n, self.vector_dim * 3))
        for i, text in enumerate(texts):
            embeddings[i] = self.embed_text(text)
        self.hnsw.add_items(embeddings)
class HNSW(ANN):
    """
    Builds an ANN model using the hnswlib library.
    """

    def load(self, path):
        """
        Recreates the index from the configured dimensions and metric, then
        reads its contents from path.

        Args:
            path: location of a previously saved index
        """

        # Load index
        self.model = Index(dim=self.config["dimensions"], space=self.config["metric"])
        self.model.load_index(path)

    def index(self, embeddings):
        """
        Builds a new index from embeddings. Row i is stored under label i.

        Args:
            embeddings: 2-D array with one row per element to index
        """

        # Inner product is equal to cosine similarity on normalized vectors
        self.config["metric"] = "ip"

        # Create index
        self.model = Index(dim=self.config["dimensions"], space=self.config["metric"])
        self.model.init_index(max_elements=embeddings.shape[0])

        # Add items, labels are generated directly by numpy instead of
        # converting a Python range element by element
        self.model.add_items(embeddings, np.arange(embeddings.shape[0]))

    def search(self, query, limit):
        """
        Searches the index for the nearest neighbors of a single query.

        Args:
            query: query vector, reshaped to a single row before searching
            limit: number of results to request

        Returns:
            list of (id, score) where score is 1 - distance
        """

        # Run the query
        ids, distances = self.model.knn_query(query.reshape(1, -1), k=limit)

        # Convert distances to similarity scores
        scores = [1 - d for d in distances[0]]

        # Map results to [(id, score)]
        return list(zip(ids[0], scores))

    def save(self, path):
        """
        Writes the index to path.

        Args:
            path: output location
        """

        # Write index
        self.model.save_index(path)
class HNSW(ANN):
    """
    Builds an ANN model using the hnswlib library.
    """

    def load(self, path):
        """
        Recreates the index from the configured dimensions and metric, then
        reads its contents from path.

        Args:
            path: location of a previously saved index
        """

        # Load index
        self.model = Index(dim=self.config["dimensions"], space=self.config["metric"])
        self.model.load_index(path)

    def index(self, embeddings):
        """
        Builds a new index from embeddings. Row i is stored under label i.

        The "efconstruction", "m" and "randomseed" settings are read through
        self.setting with defaults of 200, 16 and 100.

        Args:
            embeddings: 2-D array with one row per element to index
        """

        # Inner product is equal to cosine similarity on normalized vectors
        self.config["metric"] = "ip"

        # Lookup index settings
        efconstruction = self.setting("efconstruction", 200)
        m = self.setting("m", 16)
        seed = self.setting("randomseed", 100)

        # Create index
        self.model = Index(dim=self.config["dimensions"], space=self.config["metric"])
        self.model.init_index(max_elements=embeddings.shape[0], ef_construction=efconstruction, M=m, random_seed=seed)

        # Add items, labels are generated directly by numpy instead of
        # converting a Python range element by element
        self.model.add_items(embeddings, np.arange(embeddings.shape[0]))

    def search(self, queries, limit):
        """
        Searches the index for the nearest neighbors of each query.

        Args:
            queries: query vectors, passed to knn_query unchanged
            limit: number of results to request per query

        Returns:
            list with one entry per query, each a list of (id, score)
            where score is 1 - distance
        """

        # Set ef query param, only when the "efsearch" setting is truthy
        ef = self.setting("efsearch")
        if ef:
            self.model.set_ef(ef)

        # Run the query
        ids, distances = self.model.knn_query(queries, k=limit)

        # Map results to [(id, score)]
        results = []
        for x, distance in enumerate(distances):
            # Convert distances to similarity scores
            scores = [1 - d for d in distance]

            results.append(list(zip(ids[x], scores)))

        return results

    def save(self, path):
        """
        Writes the index to path.

        Args:
            path: output location
        """

        # Write index
        self.model.save_index(path)
class HNSW(KNNIndex):
    """k-nearest-neighbor index that delegates the search to hnswlib."""

    VALID_METRICS = [
        "cosine",
        "euclidean",
        "dot",
        "l2",
        "ip",
    ]

    def __init__(self, *args, **kwargs):
        """Fail early with an install hint when hnswlib is unavailable.

        All arguments are forwarded unchanged to the parent constructor.

        Raises:
            ImportError: If hnswlib cannot be imported.
        """
        try:
            from hnswlib import Index  # pylint: disable=unused-import,unused-variable
        except ImportError:
            raise ImportError(
                "Please install hnswlib: `conda install -c conda-forge "
                "hnswlib` or `pip install hnswlib`."
            )
        super().__init__(*args, **kwargs)

    def build(self, data, k: int):
        """Index ``data`` and return the ``k`` neighbors of every row.

        Args:
            data: 2-D array with one row per point.
            k: Number of neighbors to return per point.

        Returns:
            Tuple ``(indices, distances)``. ``k + 1`` neighbors are
            queried and the first column is dropped, on the assumption
            that it is the query point itself.

        Raises:
            KeyError: If ``self.metric`` has no hnswlib equivalent.
        """
        # The context manager guarantees the timer is closed even when
        # index construction or the query raises.
        with utils.Timer(
            f"Finding {k} nearest neighbors using HNSWlib approximate search using "
            f"{self.metric} distance...",
            verbose=self.verbose,
        ):
            from hnswlib import Index

            hnsw_space = {
                "cosine": "cosine",
                "dot": "ip",
                "euclidean": "l2",
                "ip": "ip",
                "l2": "l2",
            }[self.metric]

            # Only fall back to the default when no seed was given; a plain
            # truthiness test would also discard an explicit seed of 0.
            random_seed = 100 if self.random_state is None else self.random_state

            self.index = Index(space=hnsw_space, dim=data.shape[1])

            # Initialize HNSW Index
            self.index.init_index(
                max_elements=data.shape[0],
                ef_construction=200,
                M=16,
                random_seed=random_seed,
            )

            # Build index tree from data
            self.index.add_items(data, num_threads=self.n_jobs)

            # Set ef parameter for (ideal) precision/recall
            self.index.set_ef(min(2 * k, self.index.get_current_count()))

            # Query for kNN
            indices, distances = self.index.knn_query(data, k=k + 1, num_threads=self.n_jobs)

        # Skip the first entry, which is expected to be the point itself
        return indices[:, 1:], distances[:, 1:]

    def query(self, query, k):
        """Return the ``k`` nearest indexed points for each row of ``query``.

        Args:
            query: 2-D array of query points.
            k: Number of neighbors to return per query point.

        Returns:
            Tuple ``(indices, distances)`` exactly as produced by
            ``knn_query``.
        """
        with utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using HNSWlib "
            f"approximate search...",
            self.verbose,
        ):
            # Set ef parameter for (ideal) precision/recall
            self.index.set_ef(min(2 * k, self.index.get_current_count()))

            # Query for kNN
            indices, distances = self.index.knn_query(query, k=k, num_threads=self.n_jobs)

        return indices, distances
class HNSW(ANN):
    """
    ANN model backed by hnswlib, with support for appending and deleting ids.
    """

    def load(self, path):
        """
        Creates an index from the configured dimensions and metric and reads
        its contents from path.
        """

        self.model = Index(dim=self.config["dimensions"], space=self.config["metric"])
        self.model.load_index(path)

    def index(self, embeddings):
        """
        Builds a fresh index from embeddings, labelling rows 0..n-1, and
        resets the id offset and delete counter stored in the config.
        """

        # Inner product matches cosine similarity for normalized vectors
        self.config["metric"] = "ip"

        # Build parameters come from settings, with defaults
        build_params = {
            "ef_construction": self.setting("efconstruction", 200),
            "M": self.setting("m", 16),
            "random_seed": self.setting("randomseed", 100),
        }

        self.model = Index(dim=self.config["dimensions"], space=self.config["metric"])
        self.model.init_index(max_elements=embeddings.shape[0], **build_params)

        # Labels are the row positions
        self.model.add_items(embeddings, np.arange(embeddings.shape[0]))

        # Next free id and number of deleted elements
        self.config["offset"] = embeddings.shape[0]
        self.config["deletes"] = 0

    def append(self, embeddings):
        """
        Adds embeddings to an existing index, continuing ids from the
        stored offset.
        """

        added = embeddings.shape[0]

        # Grow the index so the new rows fit
        self.model.resize_index(self.config["offset"] + added)

        # New labels start at the current offset
        labels = np.arange(self.config["offset"], self.config["offset"] + added)
        self.model.add_items(embeddings, labels)

        self.config["offset"] += added

    def delete(self, ids):
        """
        Marks each id as deleted so it is omitted from search results and
        counts every successful deletion.
        """

        for uid in ids:
            try:
                self.model.mark_deleted(uid)
            except RuntimeError:
                # Label not found, nothing to count
                continue

            self.config["deletes"] += 1

    def search(self, queries, limit):
        """
        Runs a k-nearest query for every row in queries and returns, per
        query, a list of (id, score) with score = 1 - distance.
        """

        # Apply the ef query parameter when one is configured
        ef = self.setting("efsearch")
        if ef:
            self.model.set_ef(ef)

        ids, distances = self.model.knn_query(queries, k=limit)

        # Pair each id with its similarity score, one list per query
        return [
            list(zip(ids[row], [1 - d for d in distance]))
            for row, distance in enumerate(distances)
        ]

    def count(self):
        """
        Returns the number of stored elements minus those marked deleted.
        """

        return self.model.get_current_count() - self.config["deletes"]

    def save(self, path):
        """
        Writes the index to path.
        """

        self.model.save_index(path)
class SimpleRails:
    """Frame-level audio index: HNSW neighbor search plus Hough-style voting.

    Features are stored in an hnswlib index keyed by frame number. A query
    looks up the nearest stored frames for every query frame, accumulates
    votes per (slope, offset) line and merges peaks with nearby offsets.
    """

    def __init__(
        self,
        dim: int,
        total_frames: int,
        hnsw_space='l2',
        hnsw_ef_construction=200,
        hnsw_M=16,
    ):
        """Create and initialise the underlying index.

        Args:
            dim: Dimensionality of the feature vectors.
            total_frames: Capacity of the index (``max_elements``).
            hnsw_space: Distance space passed to ``Index``.
            hnsw_ef_construction: ``ef_construction`` build parameter.
            hnsw_M: ``M`` build parameter.
        """
        # The build arguments are kept so that save() can persist them.
        self.dim = dim
        self.total_frames = total_frames
        self.hnsw_space = hnsw_space
        self.hnsw_ef_construction = hnsw_ef_construction
        self.hnsw_M = hnsw_M

        self.index = Index(space=hnsw_space, dim=dim)
        self.index.init_index(
            max_elements=total_frames,
            ef_construction=hnsw_ef_construction,
            M=hnsw_M,
        )

    def add(self, feature: AudioFeatureType, idxs: FrameIdxType):
        """Add feature rows under the given frame indices.

        Args:
            feature: Feature rows, one per frame.
            idxs: 1-D array of frame indices, same length as ``feature``
                and all below ``total_frames``.
        """
        assert len(idxs.shape) == 1  # 1-D
        assert feature.shape[0] == idxs.shape[0]  # same size
        assert idxs.max() < self.total_frames

        self.index.add_items(feature, idxs)

    def set_query_params(
        self,
        ef=200,
        n_nearest_frames=100,
        n_hough_peaks=100,
        offset_merge_threshold=10,
    ):
        """Configure the search; must be called before ``query``.

        Args:
            ef: Query-time ``ef`` value set on the index.
            n_nearest_frames: Neighbors retrieved per query frame.
            n_hough_peaks: Number of accumulator peaks requested.
            offset_merge_threshold: Peaks whose offsets differ by less
                than this value are merged.
        """
        self.index.set_ef(ef)
        self.n_nearest_frames = n_nearest_frames
        self.n_hough_peaks = n_hough_peaks
        self.offset_merge_threshold = offset_merge_threshold

    def query(self, feature: AudioFeatureType) -> IndexQueryResult:
        """Find stored segments that align with the query frames.

        Args:
            feature: Query feature rows, one per frame.

        Returns:
            List of ``(score, start_frame, end_frame)`` tuples, one per
            merged peak, in the order the peaks were returned.
        """
        knn_points, _distances = self.index.knn_query(feature, k=self.n_nearest_frames)

        accumulations = HoughAccumulations()
        # slope constraint
        slope_candidates = [1]
        for m_idx, n_idxs in enumerate(knn_points):
            for slope in slope_candidates:
                for n_idx in n_idxs:
                    # Work with plain ints: mixing a negative Python int with
                    # a numpy label scalar can promote to float or overflow
                    # (NOTE(review): hnswlib labels are expected to be
                    # unsigned numpy integers - confirm).
                    n_idx = int(n_idx)
                    offset = n_idx - slope * m_idx
                    accumulations.add(slope, offset, n_idx)

        candidates = accumulations.peaks(self.n_hough_peaks)
        # Fewer peaks than requested may come back; never index past the end.
        merge_limit = min(self.n_hough_peaks, len(candidates))

        merged = set()
        result = []
        for idx, ((_, offset), count, points) in enumerate(candidates):
            if idx in merged:
                continue

            cur_left = min(points)
            cur_right = max(points)
            cur_count = count
            for idx2 in range(idx + 1, merge_limit):
                if idx2 in merged:
                    continue

                (_, offset_2), count_2, points_2 = candidates[idx2]
                # Offsets are compared against the current peak only
                if abs((offset - offset_2)) < self.offset_merge_threshold:
                    cur_count += count_2
                    cur_left = min(cur_left, min(points_2))
                    cur_right = max(cur_right, max(points_2))
                    merged.add(idx2)

            result.append((cur_count, cur_left, cur_right))  # score, start_frame, end_frame

        return result

    def save(self, path):
        """Write the index and its build arguments into directory ``path``.

        Args:
            path: Target directory; created (with parents) when missing.

        Returns:
            True on success. On any exception the error is printed and
            False is returned instead of raising.
        """
        try:
            path = Path(path)
            path.mkdir(mode=0o775, parents=True, exist_ok=True)

            self.index.save_index(str(path / SAVED_INDEX_NAME))

            build_args = {
                'dim': self.dim,
                'total_frames': self.total_frames,
                'hnsw_space': self.hnsw_space,
                'hnsw_ef_construction': self.hnsw_ef_construction,
                'hnsw_M': self.hnsw_M,
            }
            with open(str(path / SAVED_BUILD_ARGS_NAME), 'w') as fw:
                json.dump(build_args, fw)

            return True
        except Exception as e:
            # Deliberate best effort: report the failure through the return value
            print(e)
            return False

    @classmethod
    def load(cls, path):
        """Rebuild an instance from a directory written by ``save``.

        Args:
            path: Directory holding the saved build arguments and index.

        Returns:
            A new instance constructed from the stored build arguments with
            the saved index loaded. Query parameters are not restored.
        """
        path = Path(path)
        with open(str(path / SAVED_BUILD_ARGS_NAME), 'r') as f:
            build_args = json.load(f)

        index = cls(**build_args)
        index.index.load_index(str(path / SAVED_INDEX_NAME))
        return index
class HNSW(KNNIndex):
    """Picklable k-nearest-neighbor index that delegates to hnswlib."""

    VALID_METRICS = [
        "cosine",
        "euclidean",
        "dot",
        "l2",
        "ip",
    ]

    # Metric names accepted by this class mapped to hnswlib space names.
    _HNSW_SPACES = {
        "cosine": "cosine",
        "dot": "ip",
        "euclidean": "l2",
        "ip": "ip",
        "l2": "l2",
    }

    def __init__(self, *args, **kwargs):
        """Fail early with an install hint when hnswlib is unavailable.

        All arguments are forwarded unchanged to the parent constructor.

        Raises:
            ImportError: If hnswlib cannot be imported.
        """
        try:
            from hnswlib import Index  # pylint: disable=unused-import,unused-variable
        except ImportError:
            raise ImportError(
                "Please install hnswlib: `conda install -c conda-forge "
                "hnswlib` or `pip install hnswlib`."
            )
        super().__init__(*args, **kwargs)

    def build(self):
        """Index ``self.data`` and return ``self.k`` neighbors of every row.

        Returns:
            Tuple ``(indices, distances)``. ``k + 1`` neighbors are
            queried and the first column is dropped, on the assumption
            that it is the query point itself.

        Raises:
            KeyError: If ``self.metric`` has no hnswlib equivalent.
        """
        data, k = self.data, self.k

        # The context manager guarantees the timer is closed even when
        # index construction or the query raises.
        with utils.Timer(
            f"Finding {k} nearest neighbors using HNSWlib approximate search using "
            f"{self.metric} distance...",
            verbose=self.verbose,
        ):
            from hnswlib import Index

            hnsw_space = self._HNSW_SPACES[self.metric]

            # Derive an integer seed from whatever form random_state takes
            random_state = check_random_state(self.random_state)
            random_seed = random_state.randint(np.iinfo(np.int32).max)

            self.index = Index(space=hnsw_space, dim=data.shape[1])

            # Initialize HNSW Index
            self.index.init_index(
                max_elements=data.shape[0],
                ef_construction=200,
                M=16,
                random_seed=random_seed,
            )

            # Build index tree from data
            self.index.add_items(data, num_threads=self.n_jobs)

            # Set ef parameter for (ideal) precision/recall
            self.index.set_ef(min(2 * k, self.index.get_current_count()))

            # Query for kNN
            indices, distances = self.index.knn_query(data, k=k + 1, num_threads=self.n_jobs)

        # Skip the first entry, which is expected to be the point itself
        return indices[:, 1:], distances[:, 1:]

    def query(self, query, k):
        """Return the ``k`` nearest indexed points for each row of ``query``.

        Args:
            query: 2-D array of query points.
            k: Number of neighbors to return per query point.

        Returns:
            Tuple ``(indices, distances)`` exactly as produced by
            ``knn_query``.
        """
        with utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using HNSWlib "
            f"approximate search...",
            self.verbose,
        ):
            # Set ef parameter for (ideal) precision/recall
            self.index.set_ef(min(2 * k, self.index.get_current_count()))

            # Query for kNN
            indices, distances = self.index.knn_query(query, k=k, num_threads=self.n_jobs)

        return indices, distances

    def __getstate__(self):
        """Return the instance state with the index replaced by its bytes.

        The index object is written to a temporary file and stored
        base64-encoded under ``"b64_index"``; the ``"index"`` entry is
        removed. When the index is None the state is returned as is.
        """
        import tempfile
        import base64
        from os import path

        d = dict(self.__dict__)
        # If the index is not None, we want to save the encoded index
        if self.index is not None:
            with tempfile.TemporaryDirectory() as dirname:
                fname = path.join(dirname, "tmp.bin")
                self.index.save_index(fname)
                with open(fname, "rb") as f:
                    b64_index = base64.b64encode(f.read())
            d["b64_index"] = b64_index
            del d["index"]
        return d

    def __setstate__(self, state):
        """Restore the instance, rebuilding the index from ``"b64_index"``.

        Args:
            state: Mapping produced by ``__getstate__``. It is copied, so
                the caller's object is left untouched.
        """
        import tempfile
        import base64
        from os import path
        from hnswlib import Index

        # Work on a copy so that removing the encoded index below does not
        # modify the mapping owned by the caller.
        state = dict(state)

        # If a base64 index is given, we have to load the index
        if "b64_index" in state:
            assert "index" not in state
            b64_index = state.pop("b64_index")

            # Unknown metric names are passed through to hnswlib unchanged
            hnsw_metric = state["metric"]
            hnsw_metric = self._HNSW_SPACES.get(hnsw_metric, hnsw_metric)

            self.index = Index(space=hnsw_metric, dim=state["data"].data.shape[1])
            with tempfile.TemporaryDirectory() as dirname:
                fname = path.join(dirname, "tmp.bin")
                with open(fname, "wb") as f:
                    f.write(base64.b64decode(b64_index))
                self.index.load_index(fname)

        self.__dict__.update(state)