def train_index(
    self,
    documents: "Optional[Union[List[dict], List[Document]]]",
    embeddings: "Optional[np.array]" = None,
):
    """
    Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors
    before you can add your final vectors. The train vectors should come from the
    same distribution as your final ones.

    You can pass either documents (incl. embeddings) or just the plain embeddings
    that the index shall be trained on.

    :param documents: Documents (incl. the embeddings)
    :param embeddings: Plain embeddings
    :raises ValueError: if both or neither of `documents` and `embeddings` are passed
    :return: None
    """
    # NOTE: `embeddings` is typically a numpy array, so a plain truthiness test
    # (`if embeddings and documents`) would raise "truth value of an array is
    # ambiguous" for multi-element arrays and misbehave for single-element ones.
    # Compare against None explicitly instead.
    if embeddings is not None and documents is not None:
        raise ValueError(
            "Either pass `documents` or `embeddings`. You passed both.")

    if documents:
        document_objects = [
            Document.from_dict(d) if isinstance(d, dict) else d
            for d in documents
        ]
        embeddings = [doc.embedding for doc in document_objects]

    if embeddings is None:
        # Previously this fell through to faiss_index.train(None), which failed
        # with an opaque FAISS error; fail early with a clear message instead.
        raise ValueError(
            "Either pass `documents` or `embeddings`. You passed neither.")

    # FAISS requires float32 vectors; convert regardless of which input was given
    # (the original only converted on the `documents` path).
    embeddings = np.array(embeddings, dtype="float32")
    self.faiss_index.train(embeddings)
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Indexes documents for later queries.

    :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                      For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                      Optionally: Include meta data via {"text": "<the-actual-text>",
                      "meta":{"name": "<some-document-name>, "author": "somebody", ...}}
                      It can be used for filtering and is accessible in the responses of the Finder.
    :param index: add an optional index attribute to documents. It can be later used for filtering.
                  For instance, documents for evaluation can be indexed in a separate index
                  than the documents for search.
    :return: None
    """
    index = index or self.index
    if not documents:
        return

    # Normalize dict inputs to Document objects; a list of Documents passes through.
    if isinstance(documents[0], dict):
        docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
    else:
        docs = documents

    # Write in batches, committing once per batch.
    for start in range(0, len(docs), self.batch_size):
        for doc in docs[start:start + self.batch_size]:
            meta = doc.meta or {}
            orm_doc = DocumentORM(
                id=doc.id,
                text=doc.text,
                vector_id=meta.get("vector_id"),
                meta=[MetaORM(name=k, value=v) for k, v in meta.items()],
                index=index,
            )
            if self.update_existing_documents:
                # Purge the stale meta rows first, then merge the updated document.
                self.session.query(MetaORM).filter_by(document_id=doc.id).delete()
                self.session.merge(orm_doc)
            else:
                self.session.add(orm_doc)

        try:
            self.session.commit()
        except Exception as ex:
            logger.error(f"Transaction rollback: {ex.__cause__}")
            # Roll back so the session stays in a consistent state for later calls.
            self.session.rollback()
            raise ex
def write_documents(
    self, documents: Union[List[dict], List[Document]], index: Optional[str] = None
):
    """
    Add new documents to the DocumentStore.

    :param documents: List of `Dicts` or List of `Documents`. If they already contain
                      the embeddings, we'll index them right away in FAISS. If not, you
                      can later call update_embeddings() to create & index them.
    :param index: (SQL) index name for storing the docs and metadata
    :raises ValueError: if no FAISS index has been initialized
    :return: None
    """
    # vector index
    if not self.faiss_index:
        raise ValueError(
            "Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ..."
        )

    # Guard against empty input: document_objects[0] below would raise IndexError
    # (the SQL write_documents has the same guard — keep them consistent).
    if len(documents) == 0:
        return

    # doc + metadata index
    index = index or self.index
    field_map = self._create_document_field_map()
    document_objects = [
        Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d
        for d in documents
    ]

    add_vectors = document_objects[0].embedding is not None

    if self.update_existing_documents and add_vectors:
        logger.warning(
            "You have enabled `update_existing_documents` feature and "
            "`FAISSDocumentStore` does not support update in existing `faiss_index`.\n"
            "Please call `update_embeddings` method to repopulate `faiss_index`"
        )

    for i in range(0, len(document_objects), self.index_buffer_size):
        batch = document_objects[i : i + self.index_buffer_size]  # slice once per batch
        # vector_ids are assigned sequentially from the current index size
        vector_id = self.faiss_index.ntotal
        if add_vectors:
            embeddings = np.array([doc.embedding for doc in batch], dtype="float32")
            self.faiss_index.add(embeddings)

        docs_to_write_in_sql = []
        for doc in batch:
            if add_vectors:
                doc.vector_id = vector_id
                vector_id += 1
            docs_to_write_in_sql.append(doc)

        # Persist docs + metadata (incl. vector_id) via the SQL document store
        super(FAISSDocumentStore, self).write_documents(
            docs_to_write_in_sql, index=index
        )