コード例 #1
0
    def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
        """
        Updates the embeddings in the document store using the encoding model specified in the retriever.
        This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        All documents of the index are fetched and embedded at once; the embeddings are then written back
        with a single bulk update request.

        :param retriever: Retriever whose `embed_passages()` creates the embeddings
        :param index: Index name to update. If None, `self.index` is used.
        :raises RuntimeError: If `self.embedding_field` is not set, or if the embedding dimension returned
                              by the retriever differs from `self.embedding_dim`.
        :return: None
        """
        if index is None:
            index = self.index

        if not self.embedding_field:
            raise RuntimeError("Specify the arg `embedding_field` when initializing ElasticsearchDocumentStore()")

        docs = self.get_all_documents(index)
        if len(docs) == 0:
            # Nothing to embed. Returning here also avoids indexing into an empty embeddings list below.
            logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
            return
        passages = [d.text for d in docs]

        # TODO Index embeddings every X batches to avoid OOM for huge document collections
        logger.info(f"Updating embeddings for {len(passages)} docs ...")

        # TODO send whole Document to retriever and let retriever decide what fields to embed
        from haystack.retriever.dense import DensePassageRetriever
        if isinstance(retriever, DensePassageRetriever):
            # Titles come from meta['name'] (None if the key is missing); documents without meta add no entry.
            titles = [d.meta['name'] if 'name' in d.meta else None for d in docs if d.meta is not None]
            if len(titles) == len(passages):
                embeddings = retriever.embed_passages(passages, titles)  # type: ignore
            else:
                # At least one document has no meta, so titles cannot be aligned with passages.
                embeddings = retriever.embed_passages(passages)  # type: ignore
        else:  # EmbeddingRetriever
            embeddings = retriever.embed_passages(passages)  # type: ignore

        assert len(docs) == len(embeddings)

        if embeddings[0].shape[0] != self.embedding_dim:
            raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                               f" doesn't match embedding dim. in documentstore ({self.embedding_dim}). "
                               "Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()")

        # One partial-update action per document that only sets the embedding field.
        doc_updates = [
            {
                "_op_type": "update",
                "_index": index,
                "_id": doc.id,
                "doc": {self.embedding_field: emb.tolist()},
            }
            for doc, emb in zip(docs, embeddings)
        ]

        bulk(self.client, doc_updates, request_timeout=300)
コード例 #2
0
    def update_embeddings(self,
                          retriever: BaseRetriever,
                          index: Optional[str] = None):
        """
        Updates the embeddings in the document store using the encoding model specified in the retriever.
        This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        The new embeddings are stored on the document objects held in `self.indexes[index]`.

        :param retriever: Retriever whose `embed_passages()` creates the embeddings
        :param index: Index name to update. If None, `self.index` is used.
        :raises RuntimeError: If `self.embedding_field` is not set, or if the embedding dimension returned
                              by the retriever differs from `self.embedding_dim`.
        :return: None
        """
        if index is None:
            index = self.index

        if not self.embedding_field:
            raise RuntimeError(
                "Specify the arg embedding_field when initializing InMemoryDocumentStore()"
            )

        # TODO Index embeddings every X batches to avoid OOM for huge document collections
        docs = self.get_all_documents(index)
        if len(docs) == 0:
            # Nothing to embed. Returning here also avoids indexing into an empty embeddings list below.
            logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
            return

        logger.info(f"Updating embeddings for {len(docs)} docs ...")
        embeddings = retriever.embed_passages(docs)  # type: ignore
        assert len(docs) == len(embeddings)

        if embeddings[0].shape[0] != self.embedding_dim:
            raise RuntimeError(
                f"Embedding dim. of model ({embeddings[0].shape[0]})"
                f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim}). "
                "Specify the arg `embedding_dim` when initializing InMemoryDocumentStore()"
            )

        # Write through the index mapping rather than the `docs` list, so the stored entries are updated.
        for doc, emb in zip(docs, embeddings):
            self.indexes[index][doc.id].embedding = emb
コード例 #3
0
def update_elastic_embeddings(document_store: ElasticsearchDocumentStore,
                              retriever: BaseRetriever,
                              update_existing=False):
    """
    Compute embeddings for the documents in `document_store.index` and write them back via `bulk`.

    Documents are pulled from `get_all_documents_generator()` and grouped with
    `get_batches_from_generator(..., 10_000)`; each group is embedded and sent as one bulk request.

    :param document_store: Store whose default index is read and updated.
    :param retriever: Object whose `embed_passages()` returns one embedding per document.
    :param update_existing: If False (default), only documents whose `embedding` is None are processed.
                            If True, every document is re-embedded.
    :return: None
    """
    target_index = document_store.index
    all_documents = document_store.get_all_documents_generator(target_index)

    for batch in get_batches_from_generator(all_documents, 10_000):
        if len(batch) == 0:
            break

        if update_existing:
            pending = batch
        else:
            # take only documents with no embeddings
            pending = [doc for doc in batch if doc.embedding is None]
        if len(pending) == 0:
            continue

        vectors = retriever.embed_passages(pending)  # type: ignore
        assert len(pending) == len(vectors)
        print('updating ', len(pending), ' embeddings')

        # One partial-update action per document that only sets the embedding field.
        actions = [
            {
                "_op_type": "update",
                "_index": target_index,
                "_id": doc.id,
                "doc": {document_store.embedding_field: vector.tolist()},
            }
            for doc, vector in zip(pending, vectors)
        ]

        bulk(
            document_store.client,
            actions,
            request_timeout=300,
            refresh=document_store.refresh_type,
        )
コード例 #4
0
    def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
        """
        Updates the embeddings in the document store using the encoding model specified in the retriever.
        This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        All documents of the index are fetched and embedded at once; the embeddings are then written back
        with a single bulk update request.

        :param retriever: Retriever whose `embed_passages()` creates the embeddings
        :param index: Index name to update. If None, `self.index` is used.
        :raises RuntimeError: If `self.embedding_field` is not set, or if the embedding dimension returned
                              by the retriever differs from `self.embedding_dim`.
        :return: None
        """
        if index is None:
            index = self.index

        if not self.embedding_field:
            raise RuntimeError("Specify the arg `embedding_field` when initializing ElasticsearchDocumentStore()")

        # TODO Index embeddings every X batches to avoid OOM for huge document collections
        docs = self.get_all_documents(index)
        if len(docs) == 0:
            # Nothing to embed. Returning here also avoids indexing into an empty embeddings list below.
            logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
            return

        logger.info(f"Updating embeddings for {len(docs)} docs ...")
        embeddings = retriever.embed_passages(docs)  # type: ignore
        assert len(docs) == len(embeddings)

        if embeddings[0].shape[0] != self.embedding_dim:
            raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                               f" doesn't match embedding dim. in documentstore ({self.embedding_dim}). "
                               "Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()")

        # One partial-update action per document that only sets the embedding field.
        doc_updates = [
            {
                "_op_type": "update",
                "_index": index,
                "_id": doc.id,
                "doc": {self.embedding_field: emb.tolist()},
            }
            for doc, emb in zip(docs, embeddings)
        ]

        bulk(self.client, doc_updates, request_timeout=300)
コード例 #5
0
    def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
        """
        Updates the embeddings in the document store using the encoding model specified in the retriever.
        This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        A fresh FAISS index is built from all documents of `index`, each document's meta gets a
        `vector_id` pointing at its position in that new index, and finally `self.faiss_index`
        is replaced by the new index.

        :param retriever: Retriever to use to get embeddings for text
        :param index: Index name to update. If None, `self.index` is used.
        :return: None
        """
        # Some FAISS indexes(like the default HNSWx) do not support removing vectors, so a new index is created.
        faiss_index = self._create_new_index(vector_size=self.vector_size)
        index = index or self.index

        documents = self.get_all_documents(index=index)
        for doc in documents:
            embedding = retriever.embed_passages([doc.text])[0]  # type: ignore
            doc.embedding = embedding

        phi = self._get_phi(documents)

        doc_meta_to_update = []
        for start in range(0, len(documents), self.index_buffer_size):
            batch = documents[start: start + self.index_buffer_size]
            # Vectors added below are appended to the index, so the first one lands at the current size.
            first_vector_id = faiss_index.ntotal
            embeddings = [doc.embedding for doc in batch]
            hnsw_vectors = self._get_hnsw_vectors(embeddings=embeddings, phi=phi)
            faiss_index.add(hnsw_vectors)

            # Record the vector id for every document of every batch (not only the last batch).
            for vector_id, doc in enumerate(batch, start=first_vector_id):
                meta = doc.meta or {}
                meta["vector_id"] = vector_id
                doc_meta_to_update.append((doc.id, meta))

        for doc_id, meta in doc_meta_to_update:
            super(FAISSDocumentStore, self).update_document_meta(id=doc_id, meta=meta)

        # Swap in the new index only after all vectors and meta updates are done.
        self.faiss_index = faiss_index
コード例 #6
0
    def update_embeddings(
        self,
        retriever: BaseRetriever,
        index: Optional[str] = None,
        update_existing_embeddings: bool = True,
        filters: Optional[Dict[str, List[str]]] = None,
        batch_size: int = 10_000
    ):
        """
        Updates the embeddings in the document store using the encoding model specified in the retriever.
        This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        :param retriever: Retriever to use to get embeddings for text
        :param index: Index name for which embeddings are to be updated. If set to None, the default self.index is used.
        :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to False,
                                           only documents without embeddings are processed. This mode can be used for
                                           incremental updating of embeddings, wherein, only newly indexed documents
                                           get processed.
        :param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :raises ValueError: If no FAISS index is registered for `index` in `self.faiss_indexes`.
        :return: None
        """

        index = index or self.index
        if not self.faiss_indexes.get(index):
            raise ValueError("Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ...")

        document_count = self.get_document_count(index=index)
        if document_count == 0:
            logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
            return

        logger.info(f"Updating embeddings for {document_count} docs...")
        # New vectors are appended, so ids continue from the current size of the FAISS index.
        vector_id = self.faiss_indexes[index].ntotal

        result = self._query(
            index=index,
            vector_ids=None,
            batch_size=batch_size,
            filters=filters,
            only_documents_without_embedding=not update_existing_embeddings
        )
        batched_documents = get_batches_from_generator(result, batch_size)
        # The context manager closes the progress bar on exit; no explicit close() is needed.
        with tqdm(total=document_count, disable=not self.progress_bar) as progress_bar:
            for document_batch in batched_documents:
                embeddings = retriever.embed_passages(document_batch)  # type: ignore
                assert len(document_batch) == len(embeddings)

                embeddings_to_index = np.array(embeddings, dtype="float32")
                self.faiss_indexes[index].add(embeddings_to_index)

                vector_id_map = {}
                for doc in document_batch:
                    vector_id_map[doc.id] = vector_id
                    vector_id += 1
                self.update_vector_ids(vector_id_map, index=index)
                # Advance by the real batch length so a short final batch does not overshoot the total.
                progress_bar.update(len(document_batch))
コード例 #7
0
    def update_embeddings(self,
                          retriever: BaseRetriever,
                          index: Optional[str] = None,
                          batch_size: int = 10_000):
        """
        Updates the embeddings in the document store using the encoding model specified in the retriever.
        This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        For every batch, the previously stored vectors of the batch's documents are deleted, new
        embeddings are inserted through `self.milvus_server`, and the returned vector ids are
        stored for the documents. Afterwards the collection is flushed and compacted.

        :param retriever: Retriever to use to get embeddings for text
        :param index: (SQL) index name for storing the docs and metadata. If None, `self.index` is used.
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :raises RuntimeError: If inserting the vectors does not return a success status.
        :return: None
        """
        index = index or self.index
        self._create_collection_and_index_if_not_exist(index)

        document_count = self.get_document_count(index=index)
        if document_count == 0:
            logger.warning(
                "Calling DocumentStore.update_embeddings() on an empty index")
            return

        logger.info(f"Updating embeddings for {document_count} docs...")

        result = self.get_all_documents_generator(index=index,
                                                  batch_size=batch_size,
                                                  return_embedding=False)
        batched_documents = get_batches_from_generator(result, batch_size)
        # The context manager closes the progress bar on exit; no explicit close() is needed.
        with tqdm(total=document_count) as progress_bar:
            for document_batch in batched_documents:
                # Remove the old vectors first so the documents do not end up with stale entries.
                self._delete_vector_ids_from_milvus(documents=document_batch,
                                                    index=index)

                embeddings = retriever.embed_passages(document_batch)  # type: ignore
                embeddings_list = [
                    embedding.tolist() for embedding in embeddings
                ]
                assert len(document_batch) == len(embeddings_list)

                status, vector_ids = self.milvus_server.insert(
                    collection_name=index, records=embeddings_list)
                if status.code != Status.SUCCESS:
                    raise RuntimeError(
                        f'Vector embedding insertion failed: {status}')

                # Pair each returned vector id with the document at the same position in the batch.
                vector_id_map = {
                    doc.id: vector_id
                    for vector_id, doc in zip(vector_ids, document_batch)
                }

                self.update_vector_ids(vector_id_map, index=index)
                # Advance by the real batch length so a short final batch does not overshoot the total.
                progress_bar.update(len(document_batch))

        self.milvus_server.flush([index])
        self.milvus_server.compact(collection_name=index)
コード例 #8
0
    def update_embeddings(self,
                          retriever: BaseRetriever,
                          index: Optional[str] = None,
                          batch_size: int = 10_000):
        """
        Re-compute the embeddings of all documents in an index with the given retriever and
        write them back batch by batch via `bulk` update actions.

        :param retriever: Retriever whose `embed_passages()` creates the embeddings.
        :param index: Index name to update. If None, `self.index` is used.
        :param batch_size: Number passed to the document generator and the batching helper;
                           batching can help reduce memory footprint for large collections.
        :raises RuntimeError: If `self.embedding_field` is not set, or if the embedding dimension
                              of a batch differs from `self.embedding_dim`.
        :return: None
        """
        target_index = self.index if index is None else index

        if not self.embedding_field:
            raise RuntimeError(
                "Specify the arg `embedding_field` when initializing ElasticsearchDocumentStore()"
            )

        logger.info(
            f"Updating embeddings for {self.get_document_count(index=target_index)} docs ..."
        )

        all_documents = self.get_all_documents_generator(target_index, batch_size=batch_size)
        for batch in get_batches_from_generator(all_documents, batch_size):
            if len(batch) == 0:
                break

            vectors = retriever.embed_passages(batch)  # type: ignore
            assert len(batch) == len(vectors)

            # The dimension is verified on the first embedding of every batch.
            model_dim = vectors[0].shape[0]
            if model_dim != self.embedding_dim:
                raise RuntimeError(
                    f"Embedding dim. of model ({vectors[0].shape[0]})"
                    f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                    "Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()"
                )

            # One partial-update action per document that only sets the embedding field.
            actions = [
                {
                    "_op_type": "update",
                    "_index": target_index,
                    "_id": doc.id,
                    "doc": {self.embedding_field: vector.tolist()},
                }
                for doc, vector in zip(batch, vectors)
            ]

            bulk(
                self.client,
                actions,
                request_timeout=300,
                refresh=self.refresh_type,
            )
コード例 #9
0
    def update_embeddings(self,
                          retriever: BaseRetriever,
                          index: Optional[str] = None,
                          batch_size: int = 10_000):
        """
        Updates the embeddings in the document store using the encoding model specified in the retriever.
        This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        The FAISS index and the stored vector ids are reset first; afterwards all documents are
        embedded batch by batch and added to the FAISS index again.

        :param retriever: Retriever to use to get embeddings for text
        :param index: (SQL) index name for storing the docs and metadata. If None, `self.index` is used.
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :raises ValueError: If `self.faiss_index` is not set.
        :return: None
        """
        if not self.faiss_index:
            raise ValueError(
                "Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ..."
            )

        # Resolve the default index first so every call below receives a concrete index name.
        index = index or self.index

        # Faiss does not support update in existing index data so clear all existing data in it
        self.faiss_index.reset()
        self.reset_vector_ids(index=index)

        document_count = self.get_document_count(index=index)
        if document_count == 0:
            logger.warning(
                "Calling DocumentStore.update_embeddings() on an empty index")
            return

        logger.info(f"Updating embeddings for {document_count} docs...")
        vector_id = self.faiss_index.ntotal

        result = self.get_all_documents_generator(index=index,
                                                  batch_size=batch_size,
                                                  return_embedding=False)
        batched_documents = get_batches_from_generator(result, batch_size)
        # `disable` must be the negation of `self.progress_bar`: the bar is shown when the flag is truthy.
        # The context manager closes the progress bar on exit; no explicit close() is needed.
        with tqdm(total=document_count,
                  disable=not self.progress_bar) as progress_bar:
            for document_batch in batched_documents:
                embeddings = retriever.embed_passages(document_batch)  # type: ignore
                assert len(document_batch) == len(embeddings)

                embeddings_to_index = np.array(embeddings, dtype="float32")
                self.faiss_index.add(embeddings_to_index)

                vector_id_map = {}
                for doc in document_batch:
                    vector_id_map[doc.id] = vector_id
                    vector_id += 1
                self.update_vector_ids(vector_id_map, index=index)
                # Advance by the real batch length so a short final batch does not overshoot the total.
                progress_bar.update(len(document_batch))
コード例 #10
0
    def update_embeddings(self,
                          retriever: BaseRetriever,
                          index: Optional[str] = None):
        """
        Updates the embeddings in the document store using the encoding model specified in the retriever.
        This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        The FAISS index is reset, all documents are embedded in one call, and the embeddings are
        then added to the FAISS index in chunks of `self.index_buffer_size` documents.

        :param retriever: Retriever to use to get embeddings for text
        :param index: (SQL) index name for storing the docs and metadata. If None, `self.index` is used.
        :raises ValueError: If `self.faiss_index` is not set.
        :return: None
        """
        if not self.faiss_index:
            raise ValueError(
                "Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ..."
            )

        # Faiss does not support update in existing index data so clear all existing data in it.
        # A single reset is enough: nothing is added to the index before the chunk loop below.
        self.faiss_index.reset()

        index = index or self.index
        documents = self.get_all_documents(index=index)

        if len(documents) == 0:
            logger.warning(
                "Calling DocumentStore.update_embeddings() on an empty index")
            return

        logger.info(f"Updating embeddings for {len(documents)} docs...")
        embeddings = retriever.embed_passages(documents)  # type: ignore
        assert len(documents) == len(embeddings)
        for i, doc in enumerate(documents):
            doc.embedding = embeddings[i]

        logger.info("Indexing embeddings and updating vectors_ids...")
        for i in tqdm(range(0, len(documents), self.index_buffer_size)):
            chunk = documents[i:i + self.index_buffer_size]
            # Vectors are appended, so the first id of this chunk is the current size of the index.
            first_vector_id = self.faiss_index.ntotal
            chunk_embeddings = np.array([doc.embedding for doc in chunk], dtype="float32")
            self.faiss_index.add(chunk_embeddings)

            vector_id_map = {
                doc.id: vector_id
                for vector_id, doc in enumerate(chunk, start=first_vector_id)
            }
            self.update_vector_ids(vector_id_map, index=index)
コード例 #11
0
    def update_embeddings(self,
                          retriever: BaseRetriever,
                          index: Optional[str] = None):
        """
        Re-embed all documents of an index with the given retriever and rebuild the FAISS index.

        A new FAISS index is created and filled in chunks of `self.index_buffer_size` documents;
        the resulting document-id -> vector-id mapping is stored via `update_vector_ids()` and
        `self.faiss_index` is replaced by the new index at the end.

        :param retriever: Retriever to use to get embeddings for text
        :param index: Index name to update. If None, `self.index` is used.
        :return: None
        """
        # Some FAISS indexes (like the default HNSWx) do not support removing vectors,
        # so the vectors go into a newly created index.
        rebuilt_index = self._create_new_index(vector_size=self.vector_size)
        index = index or self.index

        documents = self.get_all_documents(index=index)
        logger.info(f"Updating embeddings for {len(documents)} docs ...")
        doc_embeddings = retriever.embed_passages(documents)  # type: ignore
        assert len(documents) == len(doc_embeddings)
        for position, document in enumerate(documents):
            document.embedding = doc_embeddings[position]

        phi = self._get_phi(documents)

        vector_id_map = {}
        for start in range(0, len(documents), self.index_buffer_size):
            chunk = documents[start:start + self.index_buffer_size]
            # Vectors are appended, so the first id of this chunk is the current size of the index.
            first_vector_id = rebuilt_index.ntotal
            hnsw_vectors = self._get_hnsw_vectors(
                embeddings=[document.embedding for document in chunk],
                phi=phi,
            )
            rebuilt_index.add(hnsw_vectors.astype(np.float32))

            for vector_id, document in enumerate(chunk, start=first_vector_id):
                vector_id_map[document.id] = vector_id

        self.update_vector_ids(vector_id_map, index=index)
        self.faiss_index = rebuilt_index
コード例 #12
0
    def update_embeddings(self,
                          retriever: BaseRetriever,
                          index: Optional[str] = None):
        """
        Re-embed all documents of an index with the given retriever and refill the FAISS index.

        `self.faiss_index` is reset, then filled in chunks of `self.index_buffer_size` documents;
        the resulting document-id -> vector-id mapping is stored via `update_vector_ids()`.

        :param retriever: Retriever to use to get embeddings for text
        :param index: (SQL) index name for storing the docs and metadata. If None, `self.index` is used.
        :return: None
        """
        # Clear the FAISS index contents before the new vectors are added.
        self.faiss_index.reset()

        index = index or self.index

        documents = self.get_all_documents(index=index)
        logger.info(f"Updating embeddings for {len(documents)} docs ...")
        doc_embeddings = retriever.embed_passages(documents)  # type: ignore
        assert len(documents) == len(doc_embeddings)
        for position, document in enumerate(documents):
            document.embedding = doc_embeddings[position]

        vector_id_map = {}
        for start in range(0, len(documents), self.index_buffer_size):
            chunk = documents[start:start + self.index_buffer_size]
            # Vectors are appended, so the first id of this chunk is the current size of the index.
            first_vector_id = self.faiss_index.ntotal
            chunk_vectors = np.array(
                [document.embedding for document in chunk],
                dtype="float32",
            )
            self.faiss_index.add(chunk_vectors)

            for vector_id, document in enumerate(chunk, start=first_vector_id):
                vector_id_map[document.id] = vector_id

        self.update_vector_ids(vector_id_map, index=index)