def update_embeddings(
    self,
    retriever: BaseRetriever,
    index: Optional[str] = None,
    update_existing_embeddings: bool = True,
    filters: Optional[Dict[str, List[str]]] = None,
    embeddings_to_index=None,
    batch_size: int = 10_000,
):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if want to add or change the embeddings for your documents (e.g. after changing the
    retriever config).

    :param retriever: Retriever to use to get embeddings for text
    :param index: Index name for which embeddings are to be updated. If set to None, the default self.index is used.
    :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to False,
                                       only documents without embeddings are processed. This mode can be used for
                                       incremental updating of embeddings, wherein, only newly indexed documents
                                       get processed.
    :param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
                    Example: {"name": ["some", "more"], "category": ["only_one"]}
    :param embeddings_to_index: Deprecated and ignored. Embeddings are always computed per batch via the
                                retriever; the parameter is kept only for backward compatibility.
    :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
    :return: None
    """
    # Local import keeps the fix self-contained; numpy may not be imported at module level here.
    import numpy as np

    index = index or self.index
    if not self.faiss_indexes.get(index):
        raise ValueError("Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ...")

    document_count = self.get_document_count(index=index)
    if document_count == 0:
        logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
        return

    logger.info(f"Updating embeddings for {document_count} docs...")
    # New vectors are appended to the FAISS index, so ids continue from the current total.
    vector_id = self.faiss_indexes[index].ntotal

    result = self._query(
        index=index,
        vector_ids=None,
        batch_size=batch_size,
        filters=filters,
        only_documents_without_embedding=not update_existing_embeddings
    )
    batched_documents = get_batches_from_generator(result, batch_size)
    with tqdm(total=document_count, disable=not self.progress_bar) as progress_bar:
        for document_batch in batched_documents:
            # BUG FIX: previously the retriever was never called and `embeddings_to_index`
            # (default None) was added to the FAISS index once, outside the loop.
            # Compute and index the embeddings per batch instead.
            embeddings = retriever.embed_passages(document_batch)  # type: ignore
            assert len(document_batch) == len(embeddings)
            self.faiss_indexes[index].add(np.array(embeddings, dtype="float32"))

            # Map each document id to the sequential FAISS vector id assigned above.
            vector_id_map = {}
            for doc in document_batch:
                vector_id_map[doc.id] = vector_id
                vector_id += 1
            self.update_vector_ids(vector_id_map, index=index)
            progress_bar.update(batch_size)
    progress_bar.close()
def update_embeddings(
    self, retriever, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None,
    update_existing_embeddings: bool = True, batch_size: int = 10_000
):
    """
    Recompute document embeddings with the retriever's encoder and write them back to Elasticsearch.
    Useful after a change of retriever configuration.

    :param retriever: Retriever to use to update the embeddings.
    :param index: Index name to update
    :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to False,
                                       only documents without embeddings are processed. This mode can be used for
                                       incremental updating of embeddings, wherein, only newly indexed documents
                                       get processed.
    :param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
                    Example: {"name": ["some", "more"], "category": ["only_one"]}
    :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
    :return: None
    """
    if index is None:
        index = self.index

    if not self.embedding_field:
        raise RuntimeError("Specify the arg `embedding_field` when initializing ElasticsearchDocumentStore()")

    if update_existing_embeddings:
        logger.info(f"Updating embeddings for all {self.get_document_count(index=index)} docs ...")
    else:
        logger.info(f"Updating embeddings for new docs without embeddings ...")

    result = self._get_all_documents_in_index(
        index=index,
        filters=filters,
        batch_size=batch_size,
        only_documents_without_embedding=not update_existing_embeddings
    )

    for hit_batch in get_batches_from_generator(result, batch_size):
        # convert raw ES hits into Document objects before embedding
        docs = [self._convert_es_hit_to_document(hit, return_embedding=False) for hit in hit_batch]
        embeddings = retriever.embed_passages(docs)  # type: ignore
        assert len(docs) == len(embeddings)
        if embeddings[0].shape[0] != self.embedding_dim:
            raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                               f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                               "Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()")

        # build one partial-update action per document and send them in a single bulk request
        actions = [
            {
                "_op_type": "update",
                "_index": index,
                "_id": doc.id,
                "doc": {self.embedding_field: emb.tolist()},
            }
            for doc, emb in zip(docs, embeddings)
        ]
        bulk(self.client, actions, request_timeout=300, refresh=self.refresh_type)
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None, batch_size: int = 10_000):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if want to add or change the embeddings for your documents (e.g. after changing the
    retriever config).

    :param retriever: Retriever to use to get embeddings for text
    :param index: (SQL) index name for storing the docs and metadata
    :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
    :return: None
    """
    index = index or self.index
    self._create_collection_and_index_if_not_exist(index)

    document_count = self.get_document_count(index=index)
    if document_count == 0:
        logger.warning(
            "Calling DocumentStore.update_embeddings() on an empty index")
        return

    logger.info(f"Updating embeddings for {document_count} docs...")

    result = self.get_all_documents_generator(index=index, batch_size=batch_size, return_embedding=False)
    batched_documents = get_batches_from_generator(result, batch_size)
    # CONSISTENCY FIX: honor self.progress_bar like write_documents() in this store;
    # previously the bar was always shown regardless of the setting.
    with tqdm(total=document_count, disable=not self.progress_bar) as progress_bar:
        for document_batch in batched_documents:
            # replace existing vectors: drop the old ids, then insert fresh embeddings
            self._delete_vector_ids_from_milvus(documents=document_batch, index=index)
            embeddings = retriever.embed_passages(document_batch)  # type: ignore
            embeddings_list = [embedding.tolist() for embedding in embeddings]
            assert len(document_batch) == len(embeddings_list)

            status, vector_ids = self.milvus_server.insert(collection_name=index, records=embeddings_list)
            if status.code != Status.SUCCESS:
                raise RuntimeError(f'Vector embedding insertion failed: {status}')

            # remember which Milvus vector id belongs to which (SQL-stored) document
            vector_id_map = {}
            for vector_id, doc in zip(vector_ids, document_batch):
                vector_id_map[doc.id] = vector_id
            self.update_vector_ids(vector_id_map, index=index)
            progress_bar.update(batch_size)
    progress_bar.close()

    self.milvus_server.flush([index])
    self.milvus_server.compact(collection_name=index)
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None, batch_size: int = 10_000):
    """
    Recompute and store embeddings for every document in the given index, using the encoding
    model provided by the retriever. Useful after changing the retriever configuration.

    :param retriever: Retriever to use to update the embeddings.
    :param index: Index name to update
    :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
    :return: None
    """
    if index is None:
        index = self.index

    if not self.embedding_field:
        raise RuntimeError(
            "Specify the arg `embedding_field` when initializing ElasticsearchDocumentStore()"
        )

    logger.info(
        f"Updating embeddings for {self.get_document_count(index=index)} docs ..."
    )

    doc_stream = self.get_all_documents_generator(index, batch_size=batch_size)
    for batch in get_batches_from_generator(doc_stream, batch_size):
        # the generator is exhausted once an empty batch shows up
        if not batch:
            break
        embeddings = retriever.embed_passages(batch)  # type: ignore
        assert len(batch) == len(embeddings)
        if embeddings[0].shape[0] != self.embedding_dim:
            raise RuntimeError(
                f"Embedding dim. of model ({embeddings[0].shape[0]})"
                f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                "Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()"
            )

        # one partial-update action per document, flushed as a single bulk call
        actions = [
            {
                "_op_type": "update",
                "_index": index,
                "_id": doc.id,
                "doc": {self.embedding_field: emb.tolist()},
            }
            for doc, emb in zip(batch, embeddings)
        ]
        bulk(self.client, actions, request_timeout=300, refresh=self.refresh_type)
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None, batch_size: int = 10_000):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if want to add or change the embeddings for your documents (e.g. after changing the
    retriever config).

    :param retriever: Retriever to use to get embeddings for text
    :param index: (SQL) index name for storing the docs and metadata
    :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
    :return: None
    """
    if not self.faiss_index:
        raise ValueError(
            "Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ..."
        )

    # BUG FIX: resolve the default index *before* any use — previously reset_vector_ids()
    # received the raw `index` argument (possibly None) while the rest of the method
    # operated on the resolved default.
    index = index or self.index

    # Faiss does not support update in existing index data so clear all existing data in it
    self.faiss_index.reset()
    self.reset_vector_ids(index=index)

    document_count = self.get_document_count(index=index)
    if document_count == 0:
        logger.warning(
            "Calling DocumentStore.update_embeddings() on an empty index")
        return

    logger.info(f"Updating embeddings for {document_count} docs...")
    vector_id = self.faiss_index.ntotal  # 0 after reset; ids are assigned sequentially below

    result = self.get_all_documents_generator(index=index, batch_size=batch_size, return_embedding=False)
    batched_documents = get_batches_from_generator(result, batch_size)
    # BUG FIX: `disable=self.progress_bar` was inverted — the bar was hidden exactly when
    # progress_bar=True. Use the convention from the sibling stores (disable=not self.progress_bar).
    with tqdm(total=document_count, disable=not self.progress_bar) as progress_bar:
        for document_batch in batched_documents:
            embeddings = retriever.embed_passages(document_batch)  # type: ignore
            assert len(document_batch) == len(embeddings)
            embeddings_to_index = np.array(embeddings, dtype="float32")
            self.faiss_index.add(embeddings_to_index)

            # map each document id to the FAISS vector id it was stored under
            vector_id_map = {}
            for doc in document_batch:
                vector_id_map[doc.id] = vector_id
                vector_id += 1
            self.update_vector_ids(vector_id_map, index=index)
            progress_bar.update(batch_size)
    progress_bar.close()
def update_embeddings(
    self, retriever, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None,
    update_existing_embeddings: bool = True, batch_size: int = 10_000
):
    """
    Recompute the embeddings for documents stored in Weaviate using the retriever's encoder
    and write each new vector back via the Weaviate client. Useful after changing the
    retriever configuration.

    :param retriever: Retriever to use to update the embeddings.
    :param index: Index name to update
    :param update_existing_embeddings: Weaviate mandates an embedding while creating the document itself.
                                       This option must be always true for weaviate and it will update the
                                       embeddings for all the documents.
    :param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
                    Example: {"name": ["some", "more"], "category": ["only_one"]}
    :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
    :return: None
    """
    if index is None:
        index = self.index

    if not self.embedding_field:
        raise RuntimeError("Specify the arg `embedding_field` when initializing WeaviateDocumentStore()")

    # Weaviate documents always carry an embedding, so "only new docs" mode is impossible.
    if not update_existing_embeddings:
        raise RuntimeError("All the documents in Weaviate store have an embedding by default. Only update is allowed!")
    logger.info(f"Updating embeddings for all {self.get_document_count(index=index)} docs ...")

    result = self._get_all_documents_in_index(
        index=index,
        filters=filters,
        batch_size=batch_size,
    )

    for hit_batch in get_batches_from_generator(result, batch_size):
        docs = [self._convert_weaviate_result_to_document(hit, return_embedding=False) for hit in hit_batch]
        embeddings = retriever.embed_passages(docs)  # type: ignore
        assert len(docs) == len(embeddings)
        if embeddings[0].shape[0] != self.embedding_dim:
            raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                               f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                               "Specify the arg `embedding_dim` when initializing WeaviateDocumentStore()")

        for doc, emb in zip(docs, embeddings):
            # Using update method to only update the embeddings, other properties will be in tact
            self.weaviate_client.data_object.update({}, class_name=index, uuid=doc.id, vector=emb)
class InMemoryDocumentStore(BaseDocumentStore):
    """
    In-memory document store.

    Documents and labels live in plain Python dicts (``self.indexes``), keyed first by
    index name and then by document/label id. Nothing is persisted.
    """

    def __init__(
        self,
        index: str = "document",
        label_index: str = "label",
        embedding_field: Optional[str] = "embedding",
        embedding_dim: int = 768,
        return_embedding: bool = False,
        similarity: str = "dot_product",
        progress_bar: bool = True,
    ):
        """
        :param index: The documents are scoped to an index attribute that can be used when writing, querying,
                      or deleting documents. This parameter sets the default value for document index.
        :param label_index: The default value of index attribute for the labels.
        :param embedding_field: Name of field containing an embedding vector (Only needed when using a dense
                                retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
        :param embedding_dim: The size of the embedding vector.
        :param return_embedding: To return document embedding
        :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default
                           since it is more performant with DPR embeddings. 'cosine' is recommended if you are
                           using a Sentence BERT model.
        :param progress_bar: Whether to show a tqdm progress bar or not.
                             Can be helpful to disable in production deployments to keep the logs clean.
        """
        # save init parameters to enable export of component config as YAML
        self.set_config(
            index=index, label_index=label_index, embedding_field=embedding_field, embedding_dim=embedding_dim,
            return_embedding=return_embedding, similarity=similarity, progress_bar=progress_bar,
        )
        # All data lives here: {index_name: {id: Document-or-Label}}
        self.indexes: Dict[str, Dict] = defaultdict(dict)
        self.index: str = index
        self.label_index: str = label_index
        self.embedding_field = embedding_field
        self.embedding_dim = embedding_dim
        self.return_embedding = return_embedding
        self.similarity = similarity
        self.progress_bar = progress_bar

    def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
        """
        Indexes documents for later queries.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>, "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
        :param index: write documents to a custom namespace. For instance, documents for evaluation can be
                      indexed in a separate index than the documents for search.
        :return: None
        """
        index = index or self.index
        field_map = self._create_document_field_map()
        # deepcopy so later caller-side mutations don't leak into the store (and vice versa)
        documents = deepcopy(documents)
        documents_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d
                             for d in documents]
        for document in documents_objects:
            # NOTE: an existing document with the same id is silently overwritten
            self.indexes[index][document.id] = document

    def _create_document_field_map(self):
        # Maps the configured embedding field name onto Document.embedding for dict -> Document conversion.
        return {
            self.embedding_field: "embedding",
        }

    def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
        """Write annotation labels into document store."""
        index = index or self.label_index
        label_objects = [Label.from_dict(l) if isinstance(l, dict) else l for l in labels]
        for label in label_objects:
            # labels are stored under a fresh random id, not a content-derived one
            label_id = str(uuid4())
            # create timestamps if not available yet
            if not label.created_at:
                label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
            if not label.updated_at:
                label.updated_at = label.created_at
            self.indexes[index][label_id] = label

    def get_document_by_id(self, id: str, index: Optional[str] = None) -> Optional[Document]:
        """Fetch a document by specifying its text id string"""
        # NOTE(review): an unknown id raises KeyError inside get_documents_by_id rather than
        # reaching the None branch below — confirm whether that is the intended contract.
        index = index or self.index
        documents = self.get_documents_by_id([id], index=index)
        if documents:
            return documents[0]
        else:
            return None

    def get_documents_by_id(self, ids: List[str], index: Optional[str] = None) -> List[Document]:
        """Fetch documents by specifying a list of text id strings. Raises KeyError for an unknown id."""
        index = index or self.index
        documents = [self.indexes[index][id] for id in ids]
        return documents

    def query_by_embedding(self,
                           query_emb: np.ndarray,
                           filters: Optional[Dict[str, List[str]]] = None,
                           top_k: int = 10,
                           index: Optional[str] = None,
                           return_embedding: Optional[bool] = None) -> List[Document]:
        """
        Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.

        :param query_emb: Embedding of the query (e.g. gathered from DPR)
        :param filters: Optional filters to narrow down the search space.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param top_k: How many documents to return
        :param index: Index name for storing the docs and metadata
        :param return_embedding: To return document embedding
        :return: Candidate documents sorted by score (descending), truncated to top_k.
        """
        from numpy import dot
        from numpy.linalg import norm

        index = index or self.index
        if return_embedding is None:
            return_embedding = self.return_embedding

        if query_emb is None:
            return []

        document_to_search = self.get_all_documents(index=index, filters=filters, return_embedding=True)
        candidate_docs = []
        for doc in document_to_search:
            # work on a copy so the stored document is never mutated
            curr_meta = deepcopy(doc.meta)
            new_document = Document(
                id=doc.id,
                text=doc.text,
                meta=curr_meta,
                embedding=doc.embedding
            )
            # only expose the embedding on the returned copy if requested
            new_document.embedding = doc.embedding if return_embedding is True else None

            if self.similarity == "dot_product":
                # NOTE(review): despite the name, this divides by both vector norms, i.e. it
                # computes cosine similarity rather than a raw dot product — confirm intended.
                score = dot(query_emb, doc.embedding) / (
                    norm(query_emb) * norm(doc.embedding)
                )
            elif self.similarity == "cosine":
                # cosine similarity score = 1 - cosine distance
                score = 1 - cosine(query_emb, doc.embedding)
            new_document.score = score
            # map a [-1, 1] similarity into a [0, 1] pseudo-probability
            new_document.probability = (score + 1) / 2
            candidate_docs.append(new_document)

        return sorted(candidate_docs, key=lambda x: x.score if x.score is not None else 0.0, reverse=True)[0:top_k]

    def update_embeddings(
        self,
        retriever: BaseRetriever,
        index: Optional[str] = None,
        filters: Optional[Dict[str, List[str]]] = None,
        update_existing_embeddings: bool = True,
        batch_size: int = 10_000,
    ):
        """
        Updates the embeddings in the document store using the encoding model specified in the retriever.
        This can be useful if want to add or change the embeddings for your documents (e.g. after changing the
        retriever config).

        :param retriever: Retriever to use to get embeddings for text
        :param index: Index name for which embeddings are to be updated. If set to None, the default self.index
                      is used.
        :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to
                                           False, only documents without embeddings are processed. This mode can
                                           be used for incremental updating of embeddings, wherein, only newly
                                           indexed documents get processed.
        :param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return: None
        """
        if index is None:
            index = self.index

        if not self.embedding_field:
            raise RuntimeError("Specify the arg embedding_field when initializing InMemoryDocumentStore()")

        # TODO Index embeddings every X batches to avoid OOM for huge document collections
        result = self._query(
            index=index, filters=filters, only_documents_without_embedding=not update_existing_embeddings
        )
        document_count = len(result)
        logger.info(f"Updating embeddings for {document_count} docs ...")
        batched_documents = get_batches_from_generator(result, batch_size)
        with tqdm(total=document_count, disable=not self.progress_bar) as progress_bar:
            for document_batch in batched_documents:
                embeddings = retriever.embed_passages(document_batch)  # type: ignore
                assert len(document_batch) == len(embeddings)
                if embeddings[0].shape[0] != self.embedding_dim:
                    raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                                       f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                                       "Specify the arg `embedding_dim` when initializing InMemoryDocumentStore()")
                # write the fresh embedding back onto the stored Document in place
                for doc, emb in zip(document_batch, embeddings):
                    self.indexes[index][doc.id].embedding = emb
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None,
                    batch_size: int = 10_000):
    """
    Add new documents to the DocumentStore.

    :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
                      them right away in Milvus. If not, you can later call update_embeddings() to create & index
                      them.
    :param index: (SQL) index name for storing the docs and metadata
    :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
    :return:
    """
    index = index or self.index
    self._create_collection_and_index_if_not_exist(index)
    field_map = self._create_document_field_map()

    if len(documents) == 0:
        logger.warning(
            "Calling DocumentStore.write_documents() with empty list")
        return

    docs = [Document.from_dict(item, field_map=field_map) if isinstance(item, dict) else item
            for item in documents]
    # If the first document carries an embedding, vectors are indexed for the whole call.
    has_embeddings = docs[0].embedding is not None

    with tqdm(total=len(docs), disable=not self.progress_bar) as progress_bar:
        for batch in get_batches_from_generator(docs, batch_size):
            milvus_ids = []
            if has_embeddings:
                ids = []
                vectors = []
                for doc in batch:
                    ids.append(doc.id)
                    # Milvus expects plain lists; accept ndarray or list, reject anything else.
                    if isinstance(doc.embedding, np.ndarray):
                        vectors.append(doc.embedding.tolist())
                    elif isinstance(doc.embedding, list):
                        vectors.append(doc.embedding)
                    else:
                        raise AttributeError(f'Format of supplied document embedding {type(doc.embedding)} is not '
                                             f'supported. Please use list or numpy.ndarray')
                if self.update_existing_documents:
                    # purge vectors of any documents being overwritten
                    existing_docs = super().get_documents_by_id(ids=ids, index=index)
                    self._delete_vector_ids_from_milvus(documents=existing_docs, index=index)
                status, milvus_ids = self.milvus_server.insert(collection_name=index, records=vectors)
                if status.code != Status.SUCCESS:
                    raise RuntimeError(f'Vector embedding insertion failed: {status}')

            # write document metadata (plus the Milvus vector id, if any) to the SQL backend
            sql_docs = []
            for position, doc in enumerate(batch):
                meta = doc.meta
                if has_embeddings:
                    meta["vector_id"] = milvus_ids[position]
                sql_docs.append(doc)
            super().write_documents(sql_docs, index=index)
            progress_bar.update(batch_size)
    progress_bar.close()

    self.milvus_server.flush([index])
    if self.update_existing_documents:
        self.milvus_server.compact(collection_name=index)
# Fragment: interior of a Milvus-style update_embeddings() — `index`, `filters`,
# `batch_size`, `retriever` and `update_existing_embeddings` are bound by the
# enclosing function, which is not visible in this chunk.
document_count = self.get_document_count(index=index)
if document_count == 0:
    logger.warning(
        "Calling DocumentStore.update_embeddings() on an empty index")
    return

logger.info(f"Updating embeddings for {document_count} docs...")

result = self._query(
    index=index,
    vector_ids=None,
    batch_size=batch_size,
    filters=filters,
    only_documents_without_embedding=not update_existing_embeddings)
batched_documents = get_batches_from_generator(result, batch_size)
with tqdm(total=document_count, disable=not self.progress_bar) as progress_bar:
    for document_batch in batched_documents:
        # drop any previously stored vectors for this batch before re-inserting
        self._delete_vector_ids_from_milvus(documents=document_batch, index=index)
        embeddings = retriever.embed_passages(
            document_batch)  # type: ignore
        # convert to plain lists for the insert call below
        embeddings_list = [
            embedding.tolist() for embedding in embeddings
        ]
        assert len(document_batch) == len(embeddings_list)

        status, vector_ids = self.milvus_server.insert(
            collection_name=index, records=embeddings_list)
# Fragment: interior of a write_documents() variant with duplicate handling —
# `documents`, `duplicate_documents`, `batch_size` and `index` are bound by the
# enclosing function; the chunk is truncated mid-statement at the end.
field_map = self._create_document_field_map()

if len(documents) == 0:
    logger.warning(
        "Calling DocumentStore.write_documents() with empty list")
    return

document_objects = [
    Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d
    for d in documents
]
# de-duplicate / filter according to the configured duplicate_documents policy
document_objects = self._handle_duplicate_documents(
    document_objects, duplicate_documents)
# vectors are only written when the first document already carries an embedding
add_vectors = False if document_objects[0].embedding is None else True

batched_documents = get_batches_from_generator(document_objects, batch_size)
with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar:
    for document_batch in batched_documents:
        vector_ids = []
        if add_vectors:
            doc_ids = []
            embeddings = []
            for doc in document_batch:
                doc_ids.append(doc.id)
                # embeddings must be plain lists; accept ndarray or list
                if isinstance(doc.embedding, np.ndarray):
                    embeddings.append(doc.embedding.tolist())
                elif isinstance(doc.embedding, list):
                    embeddings.append(doc.embedding)
                else:
                    # (statement continues past the end of this chunk)
                    raise AttributeError(