def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if you want to add or change the embeddings for your documents
    (e.g. after changing the retriever config).

    :param retriever: Retriever
    :param index: Index name to update
    :return: None
    """
    if index is None:
        index = self.index

    if not self.embedding_field:
        raise RuntimeError("Specify the arg `embedding_field` when initializing ElasticsearchDocumentStore()")

    docs = self.get_all_documents(index)
    passages = [d.text for d in docs]

    # TODO Index embeddings every X batches to avoid OOM for huge document collections
    logger.info(f"Updating embeddings for {len(passages)} docs ...")

    # TODO send whole Document to retriever and let retriever decide what fields to embed
    from haystack.retriever.dense import DensePassageRetriever  # local import to avoid a circular dependency
    if isinstance(retriever, DensePassageRetriever):
        titles = []
        for d in docs:
            if d.meta is not None:
                titles.append(d.meta["name"] if "name" in d.meta.keys() else None)
        # titles only line up with passages if every document carried a meta dict
        if len(titles) == len(passages):
            embeddings = retriever.embed_passages(passages, titles)  # type: ignore
        else:
            embeddings = retriever.embed_passages(passages)  # type: ignore
    else:  # EmbeddingRetriever
        embeddings = retriever.embed_passages(passages)  # type: ignore

    assert len(docs) == len(embeddings)

    if embeddings[0].shape[0] != self.embedding_dim:
        raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                           f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                           " Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()")

    doc_updates = []
    for doc, emb in zip(docs, embeddings):
        update = {"_op_type": "update",
                  "_index": index,
                  "_id": doc.id,
                  "doc": {self.embedding_field: emb.tolist()},
                  }
        doc_updates.append(update)

    bulk(self.client, doc_updates, request_timeout=300)

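# A minimal usage sketch for the method above, assuming a reachable Elasticsearch instance.
# The store settings, model names, and import paths are illustrative (they vary by Haystack
# version) and are not taken from the snippet itself. Passing a DensePassageRetriever lets
# the store embed the document title from meta["name"] alongside the passage text.
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.dense import DensePassageRetriever

document_store = ElasticsearchDocumentStore(host="localhost", index="document",
                                            embedding_field="embedding", embedding_dim=768)
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
)
document_store.update_embeddings(retriever)
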
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if you want to add or change the embeddings for your documents
    (e.g. after changing the retriever config).

    :param retriever: Retriever
    :param index: Index name to update
    :return: None
    """
    if index is None:
        index = self.index

    if not self.embedding_field:
        raise RuntimeError("Specify the arg `embedding_field` when initializing InMemoryDocumentStore()")

    # TODO Index embeddings every X batches to avoid OOM for huge document collections
    docs = self.get_all_documents(index)
    logger.info(f"Updating embeddings for {len(docs)} docs ...")
    embeddings = retriever.embed_passages(docs)  # type: ignore
    assert len(docs) == len(embeddings)

    if embeddings[0].shape[0] != self.embedding_dim:
        raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                           f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                           " Specify the arg `embedding_dim` when initializing InMemoryDocumentStore()")

    for doc, emb in zip(docs, embeddings):
        self.indexes[index][doc.id].embedding = emb

def update_elastic_embeddings(document_store: ElasticsearchDocumentStore, retriever: BaseRetriever,
                              update_existing: bool = False):
    index = document_store.index
    result = document_store.get_all_documents_generator(index)
    for document_batch in get_batches_from_generator(result, 10_000):
        if len(document_batch) == 0:
            break
        if not update_existing:
            # take only documents with no embeddings
            document_batch = [d for d in document_batch if d.embedding is None]
            if len(document_batch) == 0:
                continue
        embeddings = retriever.embed_passages(document_batch)  # type: ignore
        assert len(document_batch) == len(embeddings)
        logger.info(f"Updating {len(document_batch)} embeddings")
        doc_updates = []
        for doc, emb in zip(document_batch, embeddings):
            update = {
                "_op_type": "update",
                "_index": index,
                "_id": doc.id,
                "doc": {
                    document_store.embedding_field: emb.tolist()
                },
            }
            doc_updates.append(update)
        bulk(document_store.client, doc_updates, request_timeout=300, refresh=document_store.refresh_type)

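# Two hypothetical invocations of the helper above (document_store and retriever are assumed
# to exist already). With update_existing=False, only documents whose embedding is still None
# are re-encoded, which allows cheap incremental runs after new documents are indexed;
# update_existing=True re-encodes everything, e.g. after switching retriever models.
update_elastic_embeddings(document_store, retriever, update_existing=False)
update_elastic_embeddings(document_store, retriever, update_existing=True)
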
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if you want to add or change the embeddings for your documents
    (e.g. after changing the retriever config).

    :param retriever: Retriever
    :param index: Index name to update
    :return: None
    """
    if index is None:
        index = self.index

    if not self.embedding_field:
        raise RuntimeError("Specify the arg `embedding_field` when initializing ElasticsearchDocumentStore()")

    # TODO Index embeddings every X batches to avoid OOM for huge document collections
    docs = self.get_all_documents(index)
    logger.info(f"Updating embeddings for {len(docs)} docs ...")
    embeddings = retriever.embed_passages(docs)  # type: ignore
    assert len(docs) == len(embeddings)

    if embeddings[0].shape[0] != self.embedding_dim:
        raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                           f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                           " Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()")

    doc_updates = []
    for doc, emb in zip(docs, embeddings):
        update = {"_op_type": "update",
                  "_index": index,
                  "_id": doc.id,
                  "doc": {self.embedding_field: emb.tolist()},
                  }
        doc_updates.append(update)

    bulk(self.client, doc_updates, request_timeout=300)

def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if you want to add or change the embeddings for your documents
    (e.g. after changing the retriever config).

    :param retriever: Retriever to use to get embeddings for text
    :param index: Index name to update
    :return: None
    """
    # Some FAISS indexes (like the default HNSWx) do not support removing vectors, so a new index is created.
    faiss_index = self._create_new_index(vector_size=self.vector_size)
    index = index or self.index
    documents = self.get_all_documents(index=index)

    for doc in documents:
        embedding = retriever.embed_passages([doc.text])[0]  # type: ignore
        doc.embedding = embedding

    phi = self._get_phi(documents)

    for i in range(0, len(documents), self.index_buffer_size):
        embeddings = [doc.embedding for doc in documents[i: i + self.index_buffer_size]]
        hnsw_vectors = self._get_hnsw_vectors(embeddings=embeddings, phi=phi)
        faiss_index.add(hnsw_vectors)

        doc_meta_to_update = []
        # vector ids are positions in the FAISS index, so offset the enumeration by the batch start
        for vector_id, doc in enumerate(documents[i: i + self.index_buffer_size], start=i):
            meta = doc.meta or {}
            meta["vector_id"] = vector_id
            doc_meta_to_update.append((doc.id, meta))

        for doc_id, meta in doc_meta_to_update:
            super(FAISSDocumentStore, self).update_document_meta(id=doc_id, meta=meta)

    self.faiss_index = faiss_index

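# For context on the phi / HNSW helpers used above: FAISS HNSW indexes only support L2
# distance, so maximum-inner-product search over the embeddings is emulated by appending one
# auxiliary dimension to every stored vector (queries get a 0 in that slot). A minimal,
# self-contained sketch of that transformation; the function name is illustrative, not the
# actual _get_hnsw_vectors() implementation:
import numpy as np

def augment_for_mips(vectors: np.ndarray, phi: float) -> np.ndarray:
    """Append sqrt(phi - ||x||^2) so that L2 distance ranks like inner product."""
    # phi must be >= the maximum squared L2 norm over all vectors to be indexed
    squared_norms = (vectors ** 2).sum(axis=1)
    aux_dim = np.sqrt(phi - squared_norms).reshape(-1, 1)
    return np.hstack([vectors, aux_dim]).astype("float32")
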
def update_embeddings(
    self,
    retriever: BaseRetriever,
    index: Optional[str] = None,
    update_existing_embeddings: bool = True,
    filters: Optional[Dict[str, List[str]]] = None,
    batch_size: int = 10_000
):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if you want to add or change the embeddings for your documents
    (e.g. after changing the retriever config).

    :param retriever: Retriever to use to get embeddings for text
    :param index: Index name for which embeddings are to be updated. If set to None, the default self.index is used.
    :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to False,
                                       only documents without embeddings are processed. This mode can be used for
                                       incremental updating of embeddings, wherein, only newly indexed documents
                                       get processed.
    :param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
                    Example: {"name": ["some", "more"], "category": ["only_one"]}
    :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
    :return: None
    """
    index = index or self.index

    if not self.faiss_indexes.get(index):
        raise ValueError("Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ...")

    document_count = self.get_document_count(index=index)
    if document_count == 0:
        logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
        return

    logger.info(f"Updating embeddings for {document_count} docs...")
    vector_id = self.faiss_indexes[index].ntotal

    result = self._query(
        index=index,
        vector_ids=None,
        batch_size=batch_size,
        filters=filters,
        only_documents_without_embedding=not update_existing_embeddings
    )
    batched_documents = get_batches_from_generator(result, batch_size)
    with tqdm(total=document_count, disable=not self.progress_bar) as progress_bar:
        for document_batch in batched_documents:
            embeddings = retriever.embed_passages(document_batch)  # type: ignore
            assert len(document_batch) == len(embeddings)

            embeddings_to_index = np.array(embeddings, dtype="float32")
            self.faiss_indexes[index].add(embeddings_to_index)

            vector_id_map = {}
            for doc in document_batch:
                vector_id_map[doc.id] = vector_id
                vector_id += 1
            self.update_vector_ids(vector_id_map, index=index)
            progress_bar.update(batch_size)
        progress_bar.close()

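# A sketch of the incremental mode exposed by the signature above, assuming a
# FAISSDocumentStore that already holds documents; the metadata field in the filter is
# hypothetical.
document_store.update_embeddings(
    retriever,
    update_existing_embeddings=False,   # only embed documents that have no vector yet
    filters={"category": ["news"]},     # hypothetical metadata field
    batch_size=5_000,                   # smaller batches lower the memory peak
)
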
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None, batch_size: int = 10_000):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if you want to add or change the embeddings for your documents
    (e.g. after changing the retriever config).

    :param retriever: Retriever to use to get embeddings for text
    :param index: (SQL) index name for storing the docs and metadata
    :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
    :return: None
    """
    index = index or self.index
    self._create_collection_and_index_if_not_exist(index)

    document_count = self.get_document_count(index=index)
    if document_count == 0:
        logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
        return

    logger.info(f"Updating embeddings for {document_count} docs...")

    result = self.get_all_documents_generator(index=index, batch_size=batch_size, return_embedding=False)
    batched_documents = get_batches_from_generator(result, batch_size)
    with tqdm(total=document_count) as progress_bar:
        for document_batch in batched_documents:
            self._delete_vector_ids_from_milvus(documents=document_batch, index=index)

            embeddings = retriever.embed_passages(document_batch)  # type: ignore
            embeddings_list = [embedding.tolist() for embedding in embeddings]
            assert len(document_batch) == len(embeddings_list)

            status, vector_ids = self.milvus_server.insert(collection_name=index, records=embeddings_list)
            if status.code != Status.SUCCESS:
                raise RuntimeError(f"Vector embedding insertion failed: {status}")

            vector_id_map = {}
            for vector_id, doc in zip(vector_ids, document_batch):
                vector_id_map[doc.id] = vector_id
            self.update_vector_ids(vector_id_map, index=index)
            progress_bar.update(batch_size)
        progress_bar.close()

    self.milvus_server.flush([index])
    self.milvus_server.compact(collection_name=index)

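# A usage sketch for the Milvus-backed variant above, assuming a reachable Milvus server;
# the connection URL and constructor arguments are illustrative and may differ between
# Haystack versions. The method itself flushes and compacts the collection afterwards,
# so no extra maintenance call is needed here.
from haystack.document_store.milvus import MilvusDocumentStore

document_store = MilvusDocumentStore(sql_url="sqlite:///", milvus_url="tcp://localhost:19530",
                                     index="document")
document_store.update_embeddings(retriever, batch_size=10_000)
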
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None, batch_size: int = 10_000):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if you want to add or change the embeddings for your documents
    (e.g. after changing the retriever config).

    :param retriever: Retriever to use to update the embeddings.
    :param index: Index name to update
    :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
    :return: None
    """
    if index is None:
        index = self.index

    if not self.embedding_field:
        raise RuntimeError("Specify the arg `embedding_field` when initializing ElasticsearchDocumentStore()")

    logger.info(f"Updating embeddings for {self.get_document_count(index=index)} docs ...")

    result = self.get_all_documents_generator(index, batch_size=batch_size)
    for document_batch in get_batches_from_generator(result, batch_size):
        if len(document_batch) == 0:
            break
        embeddings = retriever.embed_passages(document_batch)  # type: ignore
        assert len(document_batch) == len(embeddings)

        if embeddings[0].shape[0] != self.embedding_dim:
            raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
                               f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                               " Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()")

        doc_updates = []
        for doc, emb in zip(document_batch, embeddings):
            update = {"_op_type": "update",
                      "_index": index,
                      "_id": doc.id,
                      "doc": {self.embedding_field: emb.tolist()},
                      }
            doc_updates.append(update)

        bulk(self.client, doc_updates, request_timeout=300, refresh=self.refresh_type)

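# batch_size in the method above bounds peak memory: documents are streamed from
# Elasticsearch, embedded, and written back batch by batch. A rough sketch of tuning it
# down on a memory-constrained machine (the value is illustrative):
document_store.update_embeddings(retriever, batch_size=1_000)
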
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None, batch_size: int = 10_000):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if you want to add or change the embeddings for your documents
    (e.g. after changing the retriever config).

    :param retriever: Retriever to use to get embeddings for text
    :param index: (SQL) index name for storing the docs and metadata
    :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
    :return: None
    """
    if not self.faiss_index:
        raise ValueError("Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ...")

    index = index or self.index

    # FAISS does not support updating vectors in an existing index, so clear all existing data in it
    self.faiss_index.reset()
    self.reset_vector_ids(index=index)

    document_count = self.get_document_count(index=index)
    if document_count == 0:
        logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
        return

    logger.info(f"Updating embeddings for {document_count} docs...")
    vector_id = self.faiss_index.ntotal

    result = self.get_all_documents_generator(index=index, batch_size=batch_size, return_embedding=False)
    batched_documents = get_batches_from_generator(result, batch_size)
    with tqdm(total=document_count, disable=not self.progress_bar) as progress_bar:
        for document_batch in batched_documents:
            embeddings = retriever.embed_passages(document_batch)  # type: ignore
            assert len(document_batch) == len(embeddings)

            embeddings_to_index = np.array(embeddings, dtype="float32")
            self.faiss_index.add(embeddings_to_index)

            vector_id_map = {}
            for doc in document_batch:
                vector_id_map[doc.id] = vector_id
                vector_id += 1
            self.update_vector_ids(vector_id_map, index=index)
            progress_bar.update(batch_size)
        progress_bar.close()

def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if you want to add or change the embeddings for your documents
    (e.g. after changing the retriever config).

    :param retriever: Retriever to use to get embeddings for text
    :param index: (SQL) index name for storing the docs and metadata
    :return: None
    """
    if not self.faiss_index:
        raise ValueError("Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ...")

    index = index or self.index
    documents = self.get_all_documents(index=index)

    if len(documents) == 0:
        logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
        return

    # FAISS does not support updating vectors in an existing index, so clear out the index
    # contents (freeing all memory it uses immediately) and re-index from scratch
    self.faiss_index.reset()
    self.reset_vector_ids(index=index)

    logger.info(f"Updating embeddings for {len(documents)} docs...")
    embeddings = retriever.embed_passages(documents)  # type: ignore
    assert len(documents) == len(embeddings)
    for i, doc in enumerate(documents):
        doc.embedding = embeddings[i]

    logger.info("Indexing embeddings and updating vectors_ids...")
    for i in tqdm(range(0, len(documents), self.index_buffer_size)):
        vector_id_map = {}
        vector_id = self.faiss_index.ntotal
        embeddings = [doc.embedding for doc in documents[i: i + self.index_buffer_size]]
        embeddings = np.array(embeddings, dtype="float32")
        self.faiss_index.add(embeddings)

        for doc in documents[i: i + self.index_buffer_size]:
            vector_id_map[doc.id] = vector_id
            vector_id += 1
        self.update_vector_ids(vector_id_map, index=index)

def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if you want to add or change the embeddings for your documents
    (e.g. after changing the retriever config).

    :param retriever: Retriever to use to get embeddings for text
    :param index: Index name to update
    :return: None
    """
    # Some FAISS indexes (like the default HNSWx) do not support removing vectors, so a new index is created.
    faiss_index = self._create_new_index(vector_size=self.vector_size)
    index = index or self.index
    documents = self.get_all_documents(index=index)

    logger.info(f"Updating embeddings for {len(documents)} docs ...")
    embeddings = retriever.embed_passages(documents)  # type: ignore
    assert len(documents) == len(embeddings)
    for i, doc in enumerate(documents):
        doc.embedding = embeddings[i]

    phi = self._get_phi(documents)

    vector_id_map = {}
    for i in range(0, len(documents), self.index_buffer_size):
        vector_id = faiss_index.ntotal
        embeddings = [doc.embedding for doc in documents[i: i + self.index_buffer_size]]
        hnsw_vectors = self._get_hnsw_vectors(embeddings=embeddings, phi=phi)
        hnsw_vectors = hnsw_vectors.astype(np.float32)
        faiss_index.add(hnsw_vectors)

        for doc in documents[i: i + self.index_buffer_size]:
            vector_id_map[doc.id] = vector_id
            vector_id += 1

    self.update_vector_ids(vector_id_map, index=index)
    self.faiss_index = faiss_index

def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
    """
    Updates the embeddings in the document store using the encoding model specified in the retriever.
    This can be useful if you want to add or change the embeddings for your documents
    (e.g. after changing the retriever config).

    :param retriever: Retriever to use to get embeddings for text
    :param index: (SQL) index name for storing the docs and metadata
    :return: None
    """
    # Clear out the FAISS index contents and immediately free all memory it uses
    self.faiss_index.reset()

    index = index or self.index
    documents = self.get_all_documents(index=index)

    logger.info(f"Updating embeddings for {len(documents)} docs ...")
    embeddings = retriever.embed_passages(documents)  # type: ignore
    assert len(documents) == len(embeddings)
    for i, doc in enumerate(documents):
        doc.embedding = embeddings[i]

    vector_id_map = {}
    for i in range(0, len(documents), self.index_buffer_size):
        vector_id = self.faiss_index.ntotal
        embeddings = [doc.embedding for doc in documents[i: i + self.index_buffer_size]]
        embeddings = np.array(embeddings, dtype="float32")
        self.faiss_index.add(embeddings)

        for doc in documents[i: i + self.index_buffer_size]:
            vector_id_map[doc.id] = vector_id
            vector_id += 1

    self.update_vector_ids(vector_id_map, index=index)