Esempio n. 1
0
class HNSW(ANN):
    """
    Builds an ANN model using the hnswlib library.
    """
    def load(self, path):
        # Load index
        self.model = Index(dim=self.config["dimensions"],
                           space=self.config["metric"])
        self.model.load_index(path)

    def index(self, embeddings):
        # Inner product is equal to cosine similarity on normalized vectors
        self.config["metric"] = "ip"

        # Lookup index settings
        efconstruction = self.setting("efconstruction", 200)
        m = self.setting("m", 16)
        seed = self.setting("randomseed", 100)

        # Create index
        self.model = Index(dim=self.config["dimensions"],
                           space=self.config["metric"])
        self.model.init_index(max_elements=embeddings.shape[0],
                              ef_construction=efconstruction,
                              M=m,
                              random_seed=seed)

        # Add items
        self.model.add_items(embeddings, np.arange(embeddings.shape[0]))

        # Update id offset and set delete counter
        self.config["offset"] = embeddings.shape[0]
        self.config["deletes"] = 0

    def append(self, embeddings):
        new = embeddings.shape[0]

        # Resize index
        self.model.resize_index(self.config["offset"] + new)

        # Append new ids
        self.model.add_items(
            embeddings,
            np.arange(self.config["offset"], self.config["offset"] + new))

        # Update id offset
        self.config["offset"] += new

    def delete(self, ids):
        # Mark elements as deleted to omit from search results
        for uid in ids:
            try:
                self.model.mark_deleted(uid)
                self.config["deletes"] += 1
            except RuntimeError:
                # Ignore label not found error
                continue

    def search(self, queries, limit):
        # Set ef query param
        ef = self.setting("efsearch")
        if ef:
            self.model.set_ef(ef)

        # Run the query
        ids, distances = self.model.knn_query(queries, k=limit)

        # Map results to [(id, score)]
        results = []
        for x, distance in enumerate(distances):
            # Convert distances to similarity scores
            scores = [1 - d for d in distance]

            results.append(list(zip(ids[x], scores)))

        return results

    def count(self):
        return self.model.get_current_count() - self.config["deletes"]

    def save(self, path):
        # Write index
        self.model.save_index(path)
Esempio n. 2
0
class HNSW(KNNIndex):
    VALID_METRICS = [
        "cosine",
        "euclidean",
        "dot",
        "l2",
        "ip",
    ]

    def __init__(self, *args, **kwargs):
        try:
            from hnswlib import Index  # pylint: disable=unused-import,unused-variable
        except ImportError:
            raise ImportError(
                "Please install hnswlib: `conda install -c conda-forge "
                "hnswlib` or `pip install hnswlib`."
            )
        super().__init__(*args, **kwargs)

    def build(self, data, k: int):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors using HNSWlib approximate search using "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        from hnswlib import Index

        hnsw_space = {
            "cosine": "cosine",
            "dot": "ip",
            "euclidean": "l2",
            "ip": "ip",
            "l2": "l2",
        }[self.metric]

        self.index = Index(space=hnsw_space, dim=data.shape[1])

        # Initialize HNSW Index
        self.index.init_index(
            max_elements=data.shape[0],
            ef_construction=200,
            M=16,
            random_seed=self.random_state or 100,
        )

        # Build index tree from data
        self.index.add_items(data, num_threads=self.n_jobs)

        # Set ef parameter for (ideal) precision/recall
        self.index.set_ef(min(2 * k, self.index.get_current_count()))

        # Query for kNN
        indices, distances = self.index.knn_query(data, k=k + 1, num_threads=self.n_jobs)

        # Stop timer
        timer.__exit__()

        # return indices and distances, skip first entry, which is always the point itself
        return indices[:, 1:], distances[:, 1:]

    def query(self, query, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using HNSWlib "
            f"approximate search...",
            self.verbose,
        )
        timer.__enter__()

        # Set ef parameter for (ideal) precision/recall
        self.index.set_ef(min(2 * k, self.index.get_current_count()))

        # Query for kNN
        indices, distances = self.index.knn_query(query, k=k, num_threads=self.n_jobs)

        # Stop timer
        timer.__exit__()

        # return indices and distances
        return indices, distances
class HNSW(KNNIndex):
    VALID_METRICS = [
        "cosine",
        "euclidean",
        "dot",
        "l2",
        "ip",
    ]

    def __init__(self, *args, **kwargs):
        try:
            from hnswlib import Index  # pylint: disable=unused-import,unused-variable
        except ImportError:
            raise ImportError(
                "Please install hnswlib: `conda install -c conda-forge "
                "hnswlib` or `pip install hnswlib`."
            )
        super().__init__(*args, **kwargs)

    def build(self):
        data, k = self.data, self.k

        timer = utils.Timer(
            f"Finding {k} nearest neighbors using HNSWlib approximate search using "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        from hnswlib import Index

        hnsw_space = {
            "cosine": "cosine",
            "dot": "ip",
            "euclidean": "l2",
            "ip": "ip",
            "l2": "l2",
        }[self.metric]

        random_state = check_random_state(self.random_state)
        random_seed = random_state.randint(np.iinfo(np.int32).max)

        self.index = Index(space=hnsw_space, dim=data.shape[1])

        # Initialize HNSW Index
        self.index.init_index(
            max_elements=data.shape[0],
            ef_construction=200,
            M=16,
            random_seed=random_seed,
        )

        # Build index tree from data
        self.index.add_items(data, num_threads=self.n_jobs)

        # Set ef parameter for (ideal) precision/recall
        self.index.set_ef(min(2 * k, self.index.get_current_count()))

        # Query for kNN
        indices, distances = self.index.knn_query(data, k=k + 1, num_threads=self.n_jobs)

        # Stop timer
        timer.__exit__()

        # return indices and distances, skip first entry, which is always the point itself
        return indices[:, 1:], distances[:, 1:]

    def query(self, query, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using HNSWlib "
            f"approximate search...",
            self.verbose,
        )
        timer.__enter__()

        # Set ef parameter for (ideal) precision/recall
        self.index.set_ef(min(2 * k, self.index.get_current_count()))

        # Query for kNN
        indices, distances = self.index.knn_query(query, k=k, num_threads=self.n_jobs)

        # Stop timer
        timer.__exit__()

        # return indices and distances
        return indices, distances

    def __getstate__(self):
        import tempfile
        import base64
        from os import path

        d = dict(self.__dict__)
        # If the index is not None, we want to save the encoded index
        if self.index is not None:
            with tempfile.TemporaryDirectory() as dirname:
                self.index.save_index(path.join(dirname, "tmp.bin"))

                with open(path.join(dirname, "tmp.bin"), "rb") as f:
                    b64_index = base64.b64encode(f.read())

            d["b64_index"] = b64_index
            del d["index"]

        return d

    def __setstate__(self, state):
        import tempfile
        import base64
        from os import path

        from hnswlib import Index

        # If a base64 index is given, we have to load the index
        if "b64_index" in state:
            assert "index" not in state
            b64_index = state["b64_index"]
            del state["b64_index"]

            hnsw_metric = state["metric"]
            hnsw_aliases = {
                "cosine": "cosine",
                "dot": "ip",
                "euclidean": "l2",
                "ip": "ip",
                "l2": "l2",
            }
            if hnsw_metric in hnsw_aliases:
                hnsw_metric = hnsw_aliases[hnsw_metric]

            self.index = Index(space=hnsw_metric, dim=state["data"].data.shape[1])
            with tempfile.TemporaryDirectory() as dirname:
                with open(path.join(dirname, "tmp.bin"), "wb") as f:
                    f.write(base64.b64decode(b64_index))
                self.index.load_index(path.join(dirname, "tmp.bin"))

        self.__dict__.update(state)