Beispiel #1
0
class FasttextHnsw:
    """Index texts in an HNSW structure using pooled word vectors.

    Every text is represented by the concatenation of the element-wise
    mean, max and min of its L2-normalised word vectors, so indexed vectors
    have ``3 * vector_dim`` components.
    """

    def __init__(self, model):
        """Store the embedding model and create the (not yet initialised) index.

        Args:
            model: object exposing ``get_dimension()`` and
                ``get_word_vector(word)``.
        """
        self.model = model
        self.vector_dim = model.get_dimension()
        self.hnsw = HnswIndex(space='l2', dim=self.vector_dim * 3)

    def words_to_embeddings(self, words):
        """Return a ``(len(words), vector_dim)`` array of unit-length word vectors.

        A word whose vector has zero norm keeps an all-zero row instead of
        being divided by zero, which would fill the row with NaNs and poison
        the pooled text embedding.
        """
        embeddings = np.zeros((len(words), self.vector_dim))
        for i, w in enumerate(words):
            embeddings[i] = self.model.get_word_vector(w)
            norm = np.linalg.norm(embeddings[i])
            if norm > 0:
                embeddings[i] /= norm
        return embeddings

    def embed_text(self, text):
        """Return the ``3 * vector_dim`` pooled embedding of ``text``.

        The text is tokenized with ``tokenize``; the result is the
        concatenation (mean, max, min) over the normalised word vectors.
        A text that yields no tokens maps to the zero vector, because the
        max/min reductions are undefined on an empty set of words.
        """
        words = tokenize(text)
        if len(words) == 0:
            return np.zeros(self.vector_dim * 3)
        norm_vectors = self.words_to_embeddings(words)
        avg_wv = np.mean(norm_vectors, axis=0)
        max_wv = np.max(norm_vectors, axis=0)
        min_wv = np.min(norm_vectors, axis=0)
        return np.concatenate((avg_wv, max_wv, min_wv))

    def build_hnsw(self, texts):
        """Initialise the index for ``len(texts)`` items and add every text embedding.

        Embeddings are added without explicit labels, in the order of ``texts``.
        """
        n = len(texts)
        self.hnsw.init_index(max_elements=n, ef_construction=100, M=16)
        embeddings = np.zeros((n, self.vector_dim * 3))
        for i, text in enumerate(texts):
            embeddings[i] = self.embed_text(text)
        self.hnsw.add_items(embeddings)
Beispiel #2
0
class HNSW(ANN):
    """
    ANN backend implemented on top of the hnswlib library.
    """
    def load(self, path):
        # Rebuild an index object from the stored configuration, then read its contents from path
        self.model = Index(space=self.config["metric"],
                           dim=self.config["dimensions"])
        self.model.load_index(path)

    def index(self, embeddings):
        # Always use inner product: on normalized vectors it equals cosine similarity
        self.config["metric"] = "ip"

        total = embeddings.shape[0]

        # Allocate an index large enough for every embedding row
        self.model = Index(space=self.config["metric"],
                           dim=self.config["dimensions"])
        self.model.init_index(max_elements=total)

        # Store rows under their positional ids 0..total-1
        self.model.add_items(embeddings, np.array(range(total)))

    def search(self, query, limit):
        # knn_query expects a batch, so the single query is reshaped to one row
        labels, dists = self.model.knn_query(query.reshape(1, -1), k=limit)

        # Turn each distance into a similarity score and pair it with its id: [(id, score)]
        return [(uid, 1 - dist) for uid, dist in zip(labels[0], dists[0])]

    def save(self, path):
        # Persist the index to path
        self.model.save_index(path)
Beispiel #3
0
class HNSW(ANN):
    """
    ANN backend implemented on top of the hnswlib library.
    """
    def load(self, path):
        # Rebuild an index object from the stored configuration, then read its contents from path
        self.model = Index(space=self.config["metric"],
                           dim=self.config["dimensions"])
        self.model.load_index(path)

    def index(self, embeddings):
        # Always use inner product: on normalized vectors it equals cosine similarity
        self.config["metric"] = "ip"

        # Build parameters, each falling back to the default given here
        build_args = {
            "ef_construction": self.setting("efconstruction", 200),
            "M": self.setting("m", 16),
            "random_seed": self.setting("randomseed", 100),
        }

        total = embeddings.shape[0]

        # Allocate an index large enough for every embedding row
        self.model = Index(space=self.config["metric"],
                           dim=self.config["dimensions"])
        self.model.init_index(max_elements=total, **build_args)

        # Store rows under their positional ids 0..total-1
        self.model.add_items(embeddings, np.array(range(total)))

    def search(self, queries, limit):
        # Apply the ef query parameter only when one is configured
        ef = self.setting("efsearch")
        if ef:
            self.model.set_ef(ef)

        # Run the batch query
        labels, dists = self.model.knn_query(queries, k=limit)

        # One [(id, score)] list per query row; score = 1 - distance
        return [
            [(uid, 1 - d) for uid, d in zip(labels[row], row_dists)]
            for row, row_dists in enumerate(dists)
        ]

    def save(self, path):
        # Persist the index to path
        self.model.save_index(path)
Beispiel #4
0
class HNSW(KNNIndex):
    """k-nearest-neighbour index backed by hnswlib's approximate HNSW search."""

    # Metric names accepted by this index; ``build`` maps them to hnswlib spaces.
    VALID_METRICS = [
        "cosine",
        "euclidean",
        "dot",
        "l2",
        "ip",
    ]

    def __init__(self, *args, **kwargs):
        # Fail early, with installation instructions, when hnswlib is missing.
        try:
            from hnswlib import Index  # pylint: disable=unused-import,unused-variable
        except ImportError:
            raise ImportError(
                "Please install hnswlib: `conda install -c conda-forge "
                "hnswlib` or `pip install hnswlib`."
            )
        super().__init__(*args, **kwargs)

    def build(self, data, k: int):
        """Build the HNSW index over ``data`` and return each row's neighbours.

        Args:
            data: 2-D array; rows are indexed, ``data.shape[1]`` is the
                vector dimension.
            k: number of neighbours to return per row.

        Returns:
            ``(indices, distances)`` with the first result column removed,
            since it is assumed to be the query row itself.
        """
        timer = utils.Timer(
            f"Finding {k} nearest neighbors using HNSWlib approximate search using "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        from hnswlib import Index

        hnsw_space = {
            "cosine": "cosine",
            "dot": "ip",
            "euclidean": "l2",
            "ip": "ip",
            "l2": "l2",
        }[self.metric]

        # Only fall back to the default seed when none was given. The previous
        # `self.random_state or 100` also discarded an explicit seed of 0.
        random_seed = 100 if self.random_state is None else self.random_state

        self.index = Index(space=hnsw_space, dim=data.shape[1])

        # Initialize HNSW Index
        self.index.init_index(
            max_elements=data.shape[0],
            ef_construction=200,
            M=16,
            random_seed=random_seed,
        )

        # Build index tree from data
        self.index.add_items(data, num_threads=self.n_jobs)

        # Set ef parameter for (ideal) precision/recall
        self.index.set_ef(min(2 * k, self.index.get_current_count()))

        # Query for kNN; one extra neighbour is requested because each row is
        # expected to come back as its own nearest neighbour
        indices, distances = self.index.knn_query(data, k=k + 1, num_threads=self.n_jobs)

        # Stop timer
        timer.__exit__()

        # return indices and distances, skip first entry (assumed to be the point itself)
        return indices[:, 1:], distances[:, 1:]

    def query(self, query, k):
        """Return ``(indices, distances)`` of the ``k`` nearest indexed rows per query row.

        Requires ``build`` to have been called, since it creates ``self.index``.
        """
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using HNSWlib "
            f"approximate search...",
            self.verbose,
        )
        timer.__enter__()

        # Set ef parameter for (ideal) precision/recall
        self.index.set_ef(min(2 * k, self.index.get_current_count()))

        # Query for kNN
        indices, distances = self.index.knn_query(query, k=k, num_threads=self.n_jobs)

        # Stop timer
        timer.__exit__()

        # return indices and distances
        return indices, distances
Beispiel #5
0
class HNSW(ANN):
    """
    ANN backend implemented on top of the hnswlib library.
    """
    def load(self, path):
        # Rebuild an index object from the stored configuration, then read its contents from path
        self.model = Index(space=self.config["metric"],
                           dim=self.config["dimensions"])
        self.model.load_index(path)

    def index(self, embeddings):
        # Always use inner product: on normalized vectors it equals cosine similarity
        self.config["metric"] = "ip"

        # Build parameters, each falling back to the default given here
        build_args = {
            "ef_construction": self.setting("efconstruction", 200),
            "M": self.setting("m", 16),
            "random_seed": self.setting("randomseed", 100),
        }

        total = embeddings.shape[0]

        # Allocate an index large enough for every embedding row
        self.model = Index(space=self.config["metric"],
                           dim=self.config["dimensions"])
        self.model.init_index(max_elements=total, **build_args)

        # Store rows under their positional ids 0..total-1
        self.model.add_items(embeddings, np.arange(total))

        # The next free id follows the last row; nothing has been deleted yet
        self.config["offset"] = total
        self.config["deletes"] = 0

    def append(self, embeddings):
        start = self.config["offset"]
        stop = start + embeddings.shape[0]

        # Grow the index so the new rows fit
        self.model.resize_index(stop)

        # New rows continue the id sequence where the previous batch ended
        self.model.add_items(embeddings, np.arange(start, stop))

        # Remember the next free id
        self.config["offset"] += embeddings.shape[0]

    def delete(self, ids):
        # Mark each id as deleted so it is omitted from search results
        for uid in ids:
            try:
                self.model.mark_deleted(uid)
            except RuntimeError:
                # Ignore label not found error
                continue
            else:
                self.config["deletes"] += 1

    def search(self, queries, limit):
        # Apply the ef query parameter only when one is configured
        ef = self.setting("efsearch")
        if ef:
            self.model.set_ef(ef)

        # Run the batch query
        labels, dists = self.model.knn_query(queries, k=limit)

        # One [(id, score)] list per query row; score = 1 - distance
        return [
            [(uid, 1 - d) for uid, d in zip(labels[row], row_dists)]
            for row, row_dists in enumerate(dists)
        ]

    def count(self):
        # Items held by the index minus those marked as deleted
        stored = self.model.get_current_count()
        return stored - self.config["deletes"]

    def save(self, path):
        # Persist the index to path
        self.model.save_index(path)
Beispiel #6
0
class SimpleRails:
    """HNSW-backed frame index queried through Hough-style offset voting.

    Frame features are stored in an hnswlib index under caller-supplied frame
    indices. ``query`` looks up the nearest stored frames for every query
    frame, votes on ``stored_idx - query_idx`` offsets and merges peaks whose
    offsets are close. ``set_query_params`` must be called before ``query``
    because it defines the attributes ``query`` reads.
    """

    def __init__(
            self,
            dim: int,
            total_frames: int,
            hnsw_space='l2',
            hnsw_ef_construction=200,
            hnsw_M=16,
        ):
        """Create and initialise an index for up to ``total_frames`` vectors of size ``dim``.

        The build arguments are kept on the instance so ``save`` can persist
        them and ``load`` can reconstruct an equivalent object.
        """
        self.dim = dim
        self.total_frames = total_frames

        self.hnsw_space = hnsw_space
        self.hnsw_ef_construction = hnsw_ef_construction
        self.hnsw_M = hnsw_M

        self.index = Index(space=hnsw_space, dim=dim)
        self.index.init_index(
            max_elements=total_frames,
            ef_construction=hnsw_ef_construction,
            M=hnsw_M,
        )

    def add(self, feature: AudioFeatureType, idxs: FrameIdxType):
        """Add ``feature`` rows to the index under the frame indices ``idxs``.

        ``idxs`` must be 1-D, have one entry per feature row and stay below
        ``total_frames``.
        """
        assert len(idxs.shape) == 1  # 1-D
        assert feature.shape[0] == idxs.shape[0]  # same size
        assert idxs.max() < self.total_frames
        self.index.add_items(feature, idxs)

    def set_query_params(
            self,
            ef=200,
            n_nearest_frames=100,
            n_hough_peaks=100,
            offset_merge_threshold=10,
        ):
        """Configure the search: index ``ef``, neighbours per frame, peaks kept, merge distance."""
        self.index.set_ef(ef)
        self.n_nearest_frames = n_nearest_frames
        self.n_hough_peaks = n_hough_peaks
        self.offset_merge_threshold = offset_merge_threshold

    def query(self, feature: AudioFeatureType) -> IndexQueryResult:
        """Return merged match candidates for the query frames in ``feature``.

        Returns:
            A list of ``(score, start_frame, end_frame)`` tuples, one per
            peak that was not merged into an earlier one. ``score`` is the
            summed vote count of the peak and everything merged into it.
        """
        knn_points, _distances = self.index.knn_query(feature, k=self.n_nearest_frames)

        accumulations = HoughAccumulations()
        # slope constraint
        slope_candidates = [1]
        for m_idx, n_idxs in enumerate(knn_points):
            for slope in slope_candidates:
                for n_idx in n_idxs:
                    # The labels are presumably NumPy unsigned integers (hnswlib
                    # labels). Mixing them with a negative Python int produces
                    # a float on NumPy 1.x and an OverflowError on NumPy 2,
                    # so the offset is computed in plain Python integers.
                    offset = slope * -m_idx + int(n_idx)
                    accumulations.add(slope, offset, n_idx)

        candidates = accumulations.peaks(self.n_hough_peaks)

        # Fewer peaks than requested may come back; never index past the end.
        n_candidates = min(self.n_hough_peaks, len(candidates))

        merged = set()
        result = []
        for idx, ((_, offset), count, points) in enumerate(candidates):
            if idx in merged:
                continue
            cur_left = min(points)
            cur_right = max(points)
            cur_count = count
            # Absorb every later, not yet merged peak whose offset is close to this one.
            for idx2 in range(idx + 1, n_candidates):
                if idx2 in merged:
                    continue
                (_, offset_2), count_2, points_2 = candidates[idx2]
                if abs(offset - offset_2) < self.offset_merge_threshold:
                    cur_count += count_2
                    cur_left = min(cur_left, min(points_2))
                    cur_right = max(cur_right, max(points_2))
                    merged.add(idx2)
            result.append((cur_count, cur_left, cur_right))  # score, start_frame, end_frame
        return result

    def save(self, path):
        """Write the index and its build arguments into directory ``path``.

        Returns ``True`` on success. Any exception is printed and reported
        as ``False`` instead of being raised (best-effort save).
        """
        try:
            path = Path(path)
            path.mkdir(mode=0o775, parents=True, exist_ok=True)
            self.index.save_index(str(path / SAVED_INDEX_NAME))
            build_args = {
                'dim': self.dim,
                'total_frames': self.total_frames,
                'hnsw_space': self.hnsw_space,
                'hnsw_ef_construction': self.hnsw_ef_construction,
                'hnsw_M': self.hnsw_M,
            }
            with open(str(path / SAVED_BUILD_ARGS_NAME), 'w') as fw:
                json.dump(build_args, fw)
            return True
        except Exception as e:
            print(e)
            return False

    @classmethod
    def load(cls, path):
        """Recreate an instance from a directory written by ``save``.

        The saved build arguments are passed to the constructor, then the
        stored index is loaded into the new object. Query parameters are not
        persisted, so ``set_query_params`` has to be called again.
        """
        path = Path(path)
        with open(str(path / SAVED_BUILD_ARGS_NAME), 'r') as f:
            build_args = json.load(f)

        index = cls(**build_args)
        index.index.load_index(str(path / SAVED_INDEX_NAME))
        return index
class HNSW(KNNIndex):
    """k-nearest-neighbour index backed by hnswlib, with pickling support.

    The hnswlib index object is serialised through a temporary file and
    stored base64-encoded in the pickled state.
    """

    # Metric names accepted by this index; they are mapped to hnswlib spaces.
    VALID_METRICS = [
        "cosine",
        "euclidean",
        "dot",
        "l2",
        "ip",
    ]

    def __init__(self, *args, **kwargs):
        # Check up front that hnswlib can be imported and explain how to get it.
        try:
            from hnswlib import Index  # pylint: disable=unused-import,unused-variable
        except ImportError:
            raise ImportError(
                "Please install hnswlib: `conda install -c conda-forge "
                "hnswlib` or `pip install hnswlib`."
            )
        super().__init__(*args, **kwargs)

    def build(self):
        """Build the index over ``self.data`` and return each row's ``self.k`` neighbours.

        Returns ``(indices, distances)`` without the first result column,
        which is taken to be the row itself.
        """
        k = self.k
        data = self.data

        timer = utils.Timer(
            f"Finding {k} nearest neighbors using HNSWlib approximate search using "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        from hnswlib import Index

        # Translate the metric name into the hnswlib space name
        space_by_metric = {
            "cosine": "cosine",
            "dot": "ip",
            "euclidean": "l2",
            "ip": "ip",
            "l2": "l2",
        }
        hnsw_space = space_by_metric[self.metric]

        # Draw an integer seed for hnswlib from the configured random state
        rng = check_random_state(self.random_state)
        seed = rng.randint(np.iinfo(np.int32).max)

        n_samples, n_features = data.shape[0], data.shape[1]
        self.index = Index(space=hnsw_space, dim=n_features)

        # Allocate the index for all rows
        self.index.init_index(
            max_elements=n_samples,
            ef_construction=200,
            M=16,
            random_seed=seed,
        )

        # Insert every row
        self.index.add_items(data, num_threads=self.n_jobs)

        # ef controls the precision/recall trade-off of the search
        ef = min(2 * k, self.index.get_current_count())
        self.index.set_ef(ef)

        # Ask for one neighbour more than needed; the extra one is dropped below
        neighbors, dists = self.index.knn_query(data, k=k + 1, num_threads=self.n_jobs)

        timer.__exit__()

        # Drop the first column, which is taken to be the point itself
        return neighbors[:, 1:], dists[:, 1:]

    def query(self, query, k):
        """Return ``(indices, distances)`` of the ``k`` nearest indexed rows per query row."""
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using HNSWlib "
            f"approximate search...",
            self.verbose,
        )
        timer.__enter__()

        # ef controls the precision/recall trade-off of the search
        ef = min(2 * k, self.index.get_current_count())
        self.index.set_ef(ef)

        neighbors, dists = self.index.knn_query(query, k=k, num_threads=self.n_jobs)

        timer.__exit__()

        return neighbors, dists

    def __getstate__(self):
        """Return the instance dict, with the index replaced by its base64-encoded file."""
        import tempfile
        import base64
        from os import path

        state = dict(self.__dict__)

        # Without an index there is nothing to encode
        if self.index is None:
            return state

        # hnswlib saves to a file, so go through a temporary one
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_file = path.join(tmp_dir, "tmp.bin")
            self.index.save_index(tmp_file)

            with open(tmp_file, "rb") as handle:
                encoded = base64.b64encode(handle.read())

        state["b64_index"] = encoded
        del state["index"]

        return state

    def __setstate__(self, state):
        """Restore the instance dict, rebuilding the index from ``b64_index`` if present."""
        import tempfile
        import base64
        from os import path

        from hnswlib import Index

        if "b64_index" in state:
            assert "index" not in state
            encoded = state["b64_index"]
            del state["b64_index"]

            # Map the metric name to an hnswlib space; unknown names pass through unchanged
            space_by_metric = {
                "cosine": "cosine",
                "dot": "ip",
                "euclidean": "l2",
                "ip": "ip",
                "l2": "l2",
            }
            metric = state["metric"]
            hnsw_space = space_by_metric.get(metric, metric)

            self.index = Index(space=hnsw_space, dim=state["data"].data.shape[1])

            # hnswlib loads from a file, so write the decoded bytes to a temporary one
            with tempfile.TemporaryDirectory() as tmp_dir:
                tmp_file = path.join(tmp_dir, "tmp.bin")
                with open(tmp_file, "wb") as handle:
                    handle.write(base64.b64decode(encoded))
                self.index.load_index(tmp_file)

        self.__dict__.update(state)