コード例 #1
0
ファイル: faiss.py プロジェクト: StatStud/haystack
    def update_embeddings(
        self,
        retriever: BaseRetriever,
        index: Optional[str] = None,
        update_existing_embeddings: bool = True,
        filters: Optional[Dict[str, List[str]]] = None,
        embeddings_to_index = None,
        batch_size: int = 10_000
    ):
        """
        Updates the embeddings in the the document store using the encoding model specified in the retriever.
        This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config).
        :param retriever: Retriever to use to get embeddings for text
        :param index: Index name for which embeddings are to be updated. If set to None, the default self.index is used.
        :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to False,
                                           only documents without embeddings are processed. This mode can be used for
                                           incremental updating of embeddings, wherein, only newly indexed documents
                                           get processed.
        :param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return: None
        """

        index = index or self.index
        if not self.faiss_indexes.get(index):
            raise ValueError("Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ...")

        document_count = self.get_document_count(index=index)
        if document_count == 0:
            logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
            return

        logger.info(f"Updating embeddings for {document_count} docs...")
        vector_id = self.faiss_indexes[index].ntotal

        result = self._query(
            index=index,
            vector_ids=None,
            batch_size=batch_size,
            filters=filters,
            only_documents_without_embedding=not update_existing_embeddings
        )
        
        
        batched_documents = get_batches_from_generator(result, batch_size)
        
        self.faiss_indexes[index].add(embeddings_to_index)
        
        
        with tqdm(total=document_count, disable=not self.progress_bar) as progress_bar:
            for document_batch in batched_documents:

                vector_id_map = {}
                for doc in document_batch:
                    vector_id_map[doc.id] = vector_id
                    vector_id += 1
                self.update_vector_ids(vector_id_map, index=index)
                progress_bar.update(batch_size)
        progress_bar.close()
コード例 #2
0
    def update_embeddings(
        self,
        retriever,
        index: Optional[str] = None,
        filters: Optional[Dict[str, List[str]]] = None,
        update_existing_embeddings: bool = True,
        batch_size: int = 10_000
    ):
        """
        Updates the embeddings in the the document store using the encoding model specified in the retriever.
        This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        :param retriever: Retriever to use to update the embeddings.
        :param index: Index name to update
        :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to False,
                                           only documents without embeddings are processed. This mode can be used for
                                           incremental updating of embeddings, wherein, only newly indexed documents
                                           get processed.
        :param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return: None
        """
        if index is None:
            index = self.index

        if not self.embedding_field:
            raise RuntimeError("Specify the arg `embedding_field` when initializing ElasticsearchDocumentStore()")

        if update_existing_embeddings:
            logger.info(f"Updating embeddings for all {self.get_document_count(index=index)} docs ...")
        else:
            logger.info(f"Updating embeddings for new docs without embeddings ...")

        result = self._get_all_documents_in_index(
            index=index,
            filters=filters,
            batch_size=batch_size,
            only_documents_without_embedding=not update_existing_embeddings
        )

        for result_batch in get_batches_from_generator(result, batch_size):
            document_batch = [self._convert_es_hit_to_document(hit, return_embedding=False) for hit in result_batch]
            embeddings = retriever.embed_passages(document_batch)  # type: ignore
            assert len(document_batch) == len(embeddings)

            if embeddings[0].shape[0] != self.embedding_dim:
                raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                                   f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                                   "Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()")
            doc_updates = []
            for doc, emb in zip(document_batch, embeddings):
                update = {"_op_type": "update",
                          "_index": index,
                          "_id": doc.id,
                          "doc": {self.embedding_field: emb.tolist()},
                          }
                doc_updates.append(update)

            bulk(self.client, doc_updates, request_timeout=300, refresh=self.refresh_type)
コード例 #3
0
    def update_embeddings(self,
                          retriever: BaseRetriever,
                          index: Optional[str] = None,
                          batch_size: int = 10_000):
        """
        Updates the embeddings in the the document store using the encoding model specified in the retriever.
        This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        :param retriever: Retriever to use to get embeddings for text
        :param index: (SQL) index name for storing the docs and metadata
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return: None
        """
        index = index or self.index
        self._create_collection_and_index_if_not_exist(index)

        document_count = self.get_document_count(index=index)
        if document_count == 0:
            logger.warning(
                "Calling DocumentStore.update_embeddings() on an empty index")
            return

        logger.info(f"Updating embeddings for {document_count} docs...")

        result = self.get_all_documents_generator(index=index,
                                                  batch_size=batch_size,
                                                  return_embedding=False)
        batched_documents = get_batches_from_generator(result, batch_size)
        with tqdm(total=document_count) as progress_bar:
            for document_batch in batched_documents:
                self._delete_vector_ids_from_milvus(documents=document_batch,
                                                    index=index)

                embeddings = retriever.embed_passages(
                    document_batch)  # type: ignore
                embeddings_list = [
                    embedding.tolist() for embedding in embeddings
                ]
                assert len(document_batch) == len(embeddings_list)

                status, vector_ids = self.milvus_server.insert(
                    collection_name=index, records=embeddings_list)
                if status.code != Status.SUCCESS:
                    raise RuntimeError(
                        f'Vector embedding insertion failed: {status}')

                vector_id_map = {}
                for vector_id, doc in zip(vector_ids, document_batch):
                    vector_id_map[doc.id] = vector_id

                self.update_vector_ids(vector_id_map, index=index)
                progress_bar.update(batch_size)
        progress_bar.close()

        self.milvus_server.flush([index])
        self.milvus_server.compact(collection_name=index)
コード例 #4
0
    def update_embeddings(self,
                          retriever: BaseRetriever,
                          index: Optional[str] = None,
                          batch_size: int = 10_000):
        """
        Updates the embeddings in the the document store using the encoding model specified in the retriever.
        This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        :param retriever: Retriever to use to update the embeddings.
        :param index: Index name to update
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return: None
        """
        if index is None:
            index = self.index

        if not self.embedding_field:
            raise RuntimeError(
                "Specify the arg `embedding_field` when initializing ElasticsearchDocumentStore()"
            )

        logger.info(
            f"Updating embeddings for {self.get_document_count(index=index)} docs ..."
        )

        result = self.get_all_documents_generator(index, batch_size=batch_size)
        for document_batch in get_batches_from_generator(result, batch_size):
            if len(document_batch) == 0:
                break
            embeddings = retriever.embed_passages(
                document_batch)  # type: ignore
            assert len(document_batch) == len(embeddings)

            if embeddings[0].shape[0] != self.embedding_dim:
                raise RuntimeError(
                    f"Embedding dim. of model ({embeddings[0].shape[0]})"
                    f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                    "Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()"
                )
            doc_updates = []
            for doc, emb in zip(document_batch, embeddings):
                update = {
                    "_op_type": "update",
                    "_index": index,
                    "_id": doc.id,
                    "doc": {
                        self.embedding_field: emb.tolist()
                    },
                }
                doc_updates.append(update)

            bulk(self.client,
                 doc_updates,
                 request_timeout=300,
                 refresh=self.refresh_type)
コード例 #5
0
    def update_embeddings(self,
                          retriever: BaseRetriever,
                          index: Optional[str] = None,
                          batch_size: int = 10_000):
        """
        Updates the embeddings in the the document store using the encoding model specified in the retriever.
        This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        :param retriever: Retriever to use to get embeddings for text
        :param index: (SQL) index name for storing the docs and metadata
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return: None
        """
        if not self.faiss_index:
            raise ValueError(
                "Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ..."
            )

        # Faiss does not support update in existing index data so clear all existing data in it
        self.faiss_index.reset()
        self.reset_vector_ids(index=index)

        index = index or self.index

        document_count = self.get_document_count(index=index)
        if document_count == 0:
            logger.warning(
                "Calling DocumentStore.update_embeddings() on an empty index")
            return

        logger.info(f"Updating embeddings for {document_count} docs...")
        vector_id = self.faiss_index.ntotal

        result = self.get_all_documents_generator(index=index,
                                                  batch_size=batch_size,
                                                  return_embedding=False)
        batched_documents = get_batches_from_generator(result, batch_size)
        with tqdm(total=document_count,
                  disable=self.progress_bar) as progress_bar:
            for document_batch in batched_documents:
                embeddings = retriever.embed_passages(
                    document_batch)  # type: ignore
                assert len(document_batch) == len(embeddings)

                embeddings_to_index = np.array(embeddings, dtype="float32")
                self.faiss_index.add(embeddings_to_index)

                vector_id_map = {}
                for doc in document_batch:
                    vector_id_map[doc.id] = vector_id
                    vector_id += 1
                self.update_vector_ids(vector_id_map, index=index)
                progress_bar.update(batch_size)
        progress_bar.close()
コード例 #6
0
    def update_embeddings(
        self,
        retriever,
        index: Optional[str] = None,
        filters: Optional[Dict[str, List[str]]] = None,
        update_existing_embeddings: bool = True,
        batch_size: int = 10_000
    ):
        """
        Updates the embeddings in the the document store using the encoding model specified in the retriever.
        This can be useful if want to change the embeddings for your documents (e.g. after changing the retriever config).

        :param retriever: Retriever to use to update the embeddings.
        :param index: Index name to update
        :param update_existing_embeddings: Weaviate mandates an embedding while creating the document itself.
        This option must be always true for weaviate and it will update the embeddings for all the documents.
        :param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return: None
        """
        if index is None:
            index = self.index

        if not self.embedding_field:
            raise RuntimeError("Specify the arg `embedding_field` when initializing WeaviateDocumentStore()")

        if update_existing_embeddings:
            logger.info(f"Updating embeddings for all {self.get_document_count(index=index)} docs ...")
        else:
            raise RuntimeError("All the documents in Weaviate store have an embedding by default. Only update is allowed!")

        result = self._get_all_documents_in_index(
            index=index,
            filters=filters,
            batch_size=batch_size,
        )

        for result_batch in get_batches_from_generator(result, batch_size):
            document_batch = [self._convert_weaviate_result_to_document(hit, return_embedding=False) for hit in result_batch]
            embeddings = retriever.embed_passages(document_batch)  # type: ignore
            assert len(document_batch) == len(embeddings)

            if embeddings[0].shape[0] != self.embedding_dim:
                raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                                   f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                                   "Specify the arg `embedding_dim` when initializing WeaviateDocumentStore()")
            for doc, emb in zip(document_batch, embeddings):
                # Using update method to only update the embeddings, other properties will be in tact
                self.weaviate_client.data_object.update({}, class_name=index, uuid=doc.id, vector=emb)
コード例 #7
0
ファイル: memory.py プロジェクト: walexi/haystack
class InMemoryDocumentStore(BaseDocumentStore):
    """
        In-memory document store
    """

    def __init__(
        self,
        index: str = "document",
        label_index: str = "label",
        embedding_field: Optional[str] = "embedding",
        embedding_dim: int = 768,
        return_embedding: bool = False,
        similarity: str = "dot_product",
        progress_bar: bool = True,
    ):
        """
        :param index: The documents are scoped to an index attribute that can be used when writing, querying,
                      or deleting documents. This parameter sets the default value for document index.
        :param label_index: The default value of index attribute for the labels.
        :param embedding_field: Name of field containing an embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
        :param embedding_dim: The size of the embedding vector.
        :param return_embedding: To return document embedding
        :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default sine it is
                   more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
        :param progress_bar: Whether to show a tqdm progress bar or not.
                             Can be helpful to disable in production deployments to keep the logs clean.
        """

        # save init parameters to enable export of component config as YAML
        self.set_config(
            index=index, label_index=label_index, embedding_field=embedding_field, embedding_dim=embedding_dim,
            return_embedding=return_embedding, similarity=similarity, progress_bar=progress_bar,
        )

        self.indexes: Dict[str, Dict] = defaultdict(dict)
        self.index: str = index
        self.label_index: str = label_index
        self.embedding_field = embedding_field
        self.embedding_dim = embedding_dim
        self.return_embedding = return_embedding
        self.similarity = similarity
        self.progress_bar = progress_bar

    def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
        """
        Indexes documents for later queries.


       :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>, "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
        :param index: write documents to a custom namespace. For instance, documents for evaluation can be indexed in a
                      separate index than the documents for search.
        :return: None
        """
        index = index or self.index

        field_map = self._create_document_field_map()
        documents = deepcopy(documents)
        documents_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]

        for document in documents_objects:
            self.indexes[index][document.id] = document

    def _create_document_field_map(self):
        return {
            self.embedding_field: "embedding",
        }

    def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
        """Write annotation labels into document store."""
        index = index or self.label_index
        label_objects = [Label.from_dict(l) if isinstance(l, dict) else l for l in labels]

        for label in label_objects:
            label_id = str(uuid4())
            # create timestamps if not available yet
            if not label.created_at:
                label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
            if not label.updated_at:
                label.updated_at = label.created_at
            self.indexes[index][label_id] = label

    def get_document_by_id(self, id: str, index: Optional[str] = None) -> Optional[Document]:
        """Fetch a document by specifying its text id string"""
        index = index or self.index
        documents = self.get_documents_by_id([id], index=index)
        if documents:
            return documents[0]
        else:
            return None

    def get_documents_by_id(self, ids: List[str], index: Optional[str] = None) -> List[Document]:
        """Fetch documents by specifying a list of text id strings"""
        index = index or self.index
        documents = [self.indexes[index][id] for id in ids]
        return documents

    def query_by_embedding(self,
                           query_emb: np.ndarray,
                           filters: Optional[Dict[str, List[str]]] = None,
                           top_k: int = 10,
                           index: Optional[str] = None,
                           return_embedding: Optional[bool] = None) -> List[Document]:

        """
        Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.

        :param query_emb: Embedding of the query (e.g. gathered from DPR)
        :param filters: Optional filters to narrow down the search space.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param top_k: How many documents to return
        :param index: Index name for storing the docs and metadata
        :param return_embedding: To return document embedding
        :return:
        """

        from numpy import dot
        from numpy.linalg import norm

        index = index or self.index
        if return_embedding is None:
            return_embedding = self.return_embedding

        if query_emb is None:
            return []

        document_to_search = self.get_all_documents(index=index, filters=filters, return_embedding=True)
        candidate_docs = []
        for doc in document_to_search:
            curr_meta = deepcopy(doc.meta)
            new_document = Document(
                id=doc.id,
                text=doc.text,
                meta=curr_meta,
                embedding=doc.embedding
            )
            new_document.embedding = doc.embedding if return_embedding is True else None

            if self.similarity == "dot_product":
                score = dot(query_emb, doc.embedding) / (
                    norm(query_emb) * norm(doc.embedding)
                )
            elif self.similarity == "cosine":
                # cosine similarity score = 1 - cosine distance
                score = 1 - cosine(query_emb, doc.embedding)
            new_document.score = score
            new_document.probability = (score + 1) / 2
            candidate_docs.append(new_document)

        return sorted(candidate_docs, key=lambda x: x.score if x.score is not None else 0.0, reverse=True)[0:top_k]

    def update_embeddings(
        self,
        retriever: BaseRetriever,
        index: Optional[str] = None,
        filters: Optional[Dict[str, List[str]]] = None,
        update_existing_embeddings: bool = True,
        batch_size: int = 10_000,
    ):
        """
        Updates the embeddings in the the document store using the encoding model specified in the retriever.
        This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        :param retriever: Retriever to use to get embeddings for text
        :param index: Index name for which embeddings are to be updated. If set to None, the default self.index is used.
        :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to False,
                                           only documents without embeddings are processed. This mode can be used for
                                           incremental updating of embeddings, wherein, only newly indexed documents
                                           get processed.
        :param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return: None
        """
        if index is None:
            index = self.index

        if not self.embedding_field:
            raise RuntimeError("Specify the arg embedding_field when initializing InMemoryDocumentStore()")

        # TODO Index embeddings every X batches to avoid OOM for huge document collections
        result = self._query(
            index=index, filters=filters, only_documents_without_embedding=not update_existing_embeddings
        )
        document_count = len(result)
        logger.info(f"Updating embeddings for {document_count} docs ...")
        batched_documents = get_batches_from_generator(result, batch_size)
        with tqdm(total=document_count, disable=not self.progress_bar) as progress_bar:
            for document_batch in batched_documents:
                embeddings = retriever.embed_passages(document_batch)  # type: ignore
                assert len(document_batch) == len(embeddings)

                if embeddings[0].shape[0] != self.embedding_dim:
                    raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                                       f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                                       "Specify the arg `embedding_dim` when initializing InMemoryDocumentStore()")

                for doc, emb in zip(document_batch, embeddings):
                    self.indexes[index][doc.id].embedding = emb
コード例 #8
0
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None,
                        batch_size: int = 10_000):
        """
        Add new documents to the DocumentStore.

        :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
                                  them right away in Milvus. If not, you can later call update_embeddings() to create & index them.
        :param index: (SQL) index name for storing the docs and metadata
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return:
        """
        index = index or self.index
        self._create_collection_and_index_if_not_exist(index)
        field_map = self._create_document_field_map()

        if len(documents) == 0:
            logger.warning(
                "Calling DocumentStore.write_documents() with empty list")
            return

        document_objects = [
            Document.from_dict(d, field_map=field_map)
            if isinstance(d, dict) else d for d in documents
        ]

        add_vectors = False if document_objects[0].embedding is None else True

        batched_documents = get_batches_from_generator(document_objects,
                                                       batch_size)
        with tqdm(total=len(document_objects),
                  disable=not self.progress_bar) as progress_bar:
            for document_batch in batched_documents:
                vector_ids = []
                if add_vectors:
                    doc_ids = []
                    embeddings = []
                    for doc in document_batch:
                        doc_ids.append(doc.id)
                        if isinstance(doc.embedding, np.ndarray):
                            embeddings.append(doc.embedding.tolist())
                        elif isinstance(doc.embedding, list):
                            embeddings.append(doc.embedding)
                        else:
                            raise AttributeError(
                                f'Format of supplied document embedding {type(doc.embedding)} is not '
                                f'supported. Please use list or numpy.ndarray')

                    if self.update_existing_documents:
                        existing_docs = super().get_documents_by_id(
                            ids=doc_ids, index=index)
                        self._delete_vector_ids_from_milvus(
                            documents=existing_docs, index=index)

                    status, vector_ids = self.milvus_server.insert(
                        collection_name=index, records=embeddings)
                    if status.code != Status.SUCCESS:
                        raise RuntimeError(
                            f'Vector embedding insertion failed: {status}')

                docs_to_write_in_sql = []
                for idx, doc in enumerate(document_batch):
                    meta = doc.meta
                    if add_vectors:
                        meta["vector_id"] = vector_ids[idx]
                    docs_to_write_in_sql.append(doc)

                super().write_documents(docs_to_write_in_sql, index=index)
                progress_bar.update(batch_size)
        progress_bar.close()

        self.milvus_server.flush([index])
        if self.update_existing_documents:
            self.milvus_server.compact(collection_name=index)
コード例 #9
0
        document_count = self.get_document_count(index=index)
        if document_count == 0:
            logger.warning(
                "Calling DocumentStore.update_embeddings() on an empty index")
            return

        logger.info(f"Updating embeddings for {document_count} docs...")

        result = self._query(
            index=index,
            vector_ids=None,
            batch_size=batch_size,
            filters=filters,
            only_documents_without_embedding=not update_existing_embeddings)
        batched_documents = get_batches_from_generator(result, batch_size)
        with tqdm(total=document_count,
                  disable=not self.progress_bar) as progress_bar:
            for document_batch in batched_documents:
                self._delete_vector_ids_from_milvus(documents=document_batch,
                                                    index=index)

                embeddings = retriever.embed_passages(
                    document_batch)  # type: ignore
                embeddings_list = [
                    embedding.tolist() for embedding in embeddings
                ]
                assert len(document_batch) == len(embeddings_list)

                status, vector_ids = self.milvus_server.insert(
                    collection_name=index, records=embeddings_list)
コード例 #10
0
        field_map = self._create_document_field_map()

        if len(documents) == 0:
            logger.warning(
                "Calling DocumentStore.write_documents() with empty list")
            return

        document_objects = [
            Document.from_dict(d, field_map=field_map)
            if isinstance(d, dict) else d for d in documents
        ]
        document_objects = self._handle_duplicate_documents(
            document_objects, duplicate_documents)
        add_vectors = False if document_objects[0].embedding is None else True

        batched_documents = get_batches_from_generator(document_objects,
                                                       batch_size)
        with tqdm(total=len(document_objects),
                  disable=not self.progress_bar) as progress_bar:
            for document_batch in batched_documents:
                vector_ids = []
                if add_vectors:
                    doc_ids = []
                    embeddings = []
                    for doc in document_batch:
                        doc_ids.append(doc.id)
                        if isinstance(doc.embedding, np.ndarray):
                            embeddings.append(doc.embedding.tolist())
                        elif isinstance(doc.embedding, list):
                            embeddings.append(doc.embedding)
                        else:
                            raise AttributeError(