def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Indexes documents for later queries.

    :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                      For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                      Optionally: Include meta data via {"text": "<the-actual-text>",
                      "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                      It can be used for filtering and is accessible in the responses of the Finder.
    :param index: add an optional index attribute to documents. It can be later used for filtering. For instance,
                  documents for evaluation can be indexed in a separate index than the documents for search.
                  If not supplied (or empty), self.index is used.
    :return: None
    :raises Exception: whatever the commit raised is re-raised after the session has been rolled back.
    """
    # Make sure we comply to Document class format
    document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
    index = index or self.index

    for doc in document_objects:
        meta_fields = doc.meta or {}
        # "vector_id" is copied onto the document row; it also stays among the meta entries below
        vector_id = meta_fields.get("vector_id")
        meta_orms = [MetaORM(name=key, value=value) for key, value in meta_fields.items()]
        doc_orm = DocumentORM(id=doc.id, text=doc.text, vector_id=vector_id, meta=meta_orms, index=index)
        self.session.add(doc_orm)

    try:
        self.session.commit()
    except Exception:
        # Assuming self.session is a SQLAlchemy session: after a failed commit it stays in a
        # broken state until it is rolled back, which would make every later call fail too.
        self.session.rollback()
        raise
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Indexes documents for later queries in Elasticsearch.

    When using explicit document IDs, any existing document with the same ID gets updated.

    :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                      For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                      Optionally: Include meta data via {"text": "<the-actual-text>",
                      "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                      It can be used for filtering and is accessible in the responses of the Finder.
                      Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                      should be changed to what you have set for self.text_field and self.name_field.
    :param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
    :return: None
    """
    # Create the target index on the fly when an explicit, not yet existing one was requested
    if index and not self.client.indices.exists(index=index):
        self._create_document_index(index)

    if index is None:
        index = self.index

    # Neither the field map nor the op type depends on the individual document, so compute them once
    field_map = self._create_document_field_map()
    op_type = "index" if self.update_existing_documents else "create"

    # Make sure we comply to Document class format
    documents_objects = [
        Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents
    ]

    documents_to_index = []
    for doc in documents_objects:
        _doc = {
            "_op_type": op_type,
            "_index": index,
            **doc.to_dict(field_map=field_map)
        }  # type: Dict[str, Any]

        # cast embedding type as ES cannot deal with np.array
        # (isinstance also covers ndarray subclasses and makes the None check implicit)
        embedding = _doc[self.embedding_field]
        if isinstance(embedding, np.ndarray):
            _doc[self.embedding_field] = embedding.tolist()

        # rename id for elastic
        _doc["_id"] = str(_doc.pop("id"))

        # don't index query score and empty fields
        _doc.pop("score", None)
        _doc.pop("probability", None)
        _doc = {k: v for k, v in _doc.items() if v is not None}

        # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
        # we "unnest" all values within "meta"
        if "meta" in _doc:
            _doc.update(_doc["meta"])
            # pop after the update so a nested "meta" key cannot survive, as before
            _doc.pop("meta")
        documents_to_index.append(_doc)

    bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)
def train_index(self, documents: Optional[Union[List[dict], List[Document]]], embeddings: Optional[np.ndarray] = None):
    """
    Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors before you can add your final vectors.
    The train vectors should come from the same distribution as your final ones.
    You can pass either documents (incl. embeddings) or just the plain embeddings that the index shall be trained on.

    :param documents: Documents (incl. the embeddings)
    :param embeddings: Plain embeddings
    :return: None
    :raises ValueError: if both `documents` and `embeddings` are passed, or if neither is.
    """
    # `embeddings` is typically a numpy array whose truth value is ambiguous (bool() raises for
    # more than one element), so test it against None instead of relying on truthiness.
    if embeddings is not None and documents:
        raise ValueError("Either pass `documents` or `embeddings`. You passed both.")

    if documents:
        document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
        embeddings = np.array([doc.embedding for doc in document_objects], dtype="float32")

    if embeddings is None:
        # Fail with a clear message instead of handing None to the index
        raise ValueError("Either pass `documents` or `embeddings`. You passed neither.")

    self.faiss_index.train(embeddings)