def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Store documents together with their meta data through the SQL session.

    :param documents: Plain dictionaries or Haystack Document objects.
                      A dictionary needs at least {"text": "<the-actual-text>"} and may carry meta data:
                      {"text": "<the-actual-text>", "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                      Every meta key/value pair becomes its own MetaORM entry attached to the document.
    :param index: Value written to the `index` attribute of each stored document, e.g. to keep
                  evaluation documents apart from search documents. Falls back to self.index
                  when omitted (or falsy).
    :return: None
    """
    target_index = index or self.index

    # Convert dictionaries up front so the code below only deals with Document objects
    converted = []
    for entry in documents:
        if isinstance(entry, dict):
            entry = Document.from_dict(entry)
        converted.append(entry)

    for document in converted:
        meta_rows = []
        # A document without meta data simply gets no MetaORM rows
        for name, value in (document.meta or {}).items():
            meta_rows.append(MetaORM(name=name, value=value))
        self.session.add(
            DocumentORM(id=document.id, text=document.text, meta=meta_rows, index=target_index)
        )

    # One commit for the whole batch
    self.session.commit()
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Keep documents in memory, keyed by their id, for later queries.

    :param documents: Plain dictionaries or Haystack Document objects.
                      A dictionary needs at least {"text": "<the-actual-text>"} and may carry meta data:
                      {"text": "<the-actual-text>", "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
    :param index: Namespace (key of self.indexes) the documents are written to, e.g. to keep
                  evaluation documents apart from search documents. Falls back to self.index
                  when omitted (or falsy).
    :return: None
    """
    namespace = index or self.index

    # Convert everything first, so a failing conversion leaves the store untouched
    converted = []
    for entry in documents:
        converted.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

    for doc in converted:
        # A document with an already known id replaces the stored one
        self.indexes[namespace][doc.id] = doc
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Build a new FAISS index from the documents' embeddings (if present) and store the
    documents themselves via the parent class' write_documents().

    Documents are processed in buffers of self.index_buffer_size. Whether vectors are
    added at all is decided from the first document only: if its embedding is None,
    no vectors are added and no "vector_id" is written.

    :param documents: Plain dictionaries or Haystack Document objects. Dictionaries are
                      converted with Document.from_dict(). An empty list is a no-op.
    :param index: Index passed on to the parent class' write_documents().
                  Falls back to self.index when omitted (or falsy).
    :raises Exception: if self.faiss_index is already set, because adding more data to an
                       existing index is not supported.
    :return: None
    """
    if self.faiss_index is not None:
        raise Exception("Addition of more data in an existing index is not supported.")

    document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
    if not document_objects:
        # Nothing to index. Returning here avoids an IndexError on document_objects[0] and
        # avoids registering an empty index that would block every later write.
        return

    faiss_index = self._create_new_index(vector_size=self.vector_size)
    index = index or self.index

    add_vectors = document_objects[0].embedding is not None
    if add_vectors:
        phi = self._get_phi(document_objects)

    for start in range(0, len(document_objects), self.index_buffer_size):
        batch = document_objects[start: start + self.index_buffer_size]
        if add_vectors:
            embeddings = [doc.embedding for doc in batch]
            hnsw_vectors = self._get_hnsw_vectors(embeddings=embeddings, phi=phi)
            faiss_index.add(hnsw_vectors)

            # The vector id is the document's position in the overall input, not within the
            # current buffer: counting from 0 in every buffer made ids collide across buffers.
            # NOTE(review): this assumes the freshly created index numbers vectors
            # sequentially in insertion order - confirm against the FAISS index type in use.
            for vector_id, doc in enumerate(batch, start=start):
                if doc.meta is None:
                    doc.meta = {}
                doc.meta["vector_id"] = vector_id

        super(FAISSDocumentStore, self).write_documents(batch, index=index)

    # Only expose the index once every buffer has been written
    self.faiss_index = faiss_index
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Store documents through the SQL session, one DocumentORM row per document.

    :param documents: Plain dictionaries or Haystack Document objects.
                      A dictionary needs at least {"text": "<the-actual-text>"}.
                      Optionally, you can also supply "tags": ["one-tag", "another-one"]
                      or additional meta data via
                      "meta": {"name": "<some-document-name>", "author": "someone", "url": "some-url", ...}
    :param index: Value written to the `index` attribute of each stored document, e.g. to keep
                  evaluation documents apart from search documents. Falls back to self.index
                  when omitted (or falsy).
    :return: None
    """
    target_index = index or self.index

    # Convert dictionaries up front so the code below only deals with Document objects
    converted = []
    for entry in documents:
        if isinstance(entry, dict):
            entry = Document.from_dict(entry)
        converted.append(entry)

    for document in converted:
        # The document's meta dict is handed to the row as-is via `meta_data`
        self.session.add(
            DocumentORM(id=document.id, text=document.text, meta_data=document.meta, index=target_index)  # type: ignore
        )

    # One commit for the whole batch
    self.session.commit()
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Send documents to Elasticsearch in a single bulk request.

    Each document is written with op type "index" when self.update_existing_documents is
    truthy and with op type "create" otherwise.

    :param documents: Plain dictionaries or Haystack Document objects.
                      A dictionary needs at least {"text": "<the-actual-text>"} and may carry meta data:
                      {"text": "<the-actual-text>", "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                      Meta entries are flattened into top-level fields of the indexed document.
                      Advanced: If you are using your own Elasticsearch mapping, the key names in the
                      dictionary should be changed to what you have set for self.text_field and
                      self.name_field.
    :param index: Elasticsearch index the documents are written to. A given index that does not
                  exist yet is created first. If not supplied, self.index will be used.
    :return: None
    """
    if index and not self.client.indices.exists(index=index):
        self._create_document_index(index)

    if index is None:
        index = self.index

    # Convert dictionaries up front so the code below only deals with Document objects
    converted = []
    for entry in documents:
        converted.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

    actions = []
    for document in converted:
        action = {
            "_op_type": "index" if self.update_existing_documents else "create",
            "_index": index,
            **document.to_dict()
        }  # type: Dict[str, Any]

        # Elasticsearch expects the identifier under "_id"
        action["_id"] = str(action.pop("id"))

        # The query score and empty fields are not indexed
        action.pop("query_score", None)
        action = {key: value for key, value in action.items() if value is not None}

        # Flatten "meta" into top-level fields to get a flat structure in Elasticsearch
        # and similar behaviour to the other DocumentStores
        if "meta" in action:
            action.update(action["meta"].items())
            action.pop("meta")

        actions.append(action)

    bulk(self.client, actions, request_timeout=300, refresh="wait_for")
def no_answer_prediction(no_answer_reader, test_docs_xs):
    """
    Return what `no_answer_reader` predicts (top_k=5) for a fixed question on the test documents.

    Dictionaries in `test_docs_xs` are converted to Document objects first; other
    entries are passed through unchanged.
    """
    documents = []
    for entry in test_docs_xs:
        documents.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

    return no_answer_reader.predict(
        question="What is the meaning of life?",
        documents=documents,
        top_k=5,
    )
def prediction(reader, test_docs_xs):
    """
    Return what `reader` predicts (top_k=5) for a fixed question on the test documents.

    Dictionaries in `test_docs_xs` are converted to Document objects first; other
    entries are passed through unchanged.
    """
    documents = []
    for entry in test_docs_xs:
        documents.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

    return reader.predict(question="Who lives in Berlin?", documents=documents, top_k=5)
def test_top_k(test_docs_xs):
    """Check that FARMReader.predict() returns exactly `top_k` answers for several values of top_k."""
    # TODO parametrize top_k and farm/transformers reader using pytest
    # TODO transformers reader was crashing when tested on this
    documents = []
    for entry in test_docs_xs:
        documents.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

    # One reader instance is shared by all top_k values
    farm_reader = FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        num_processes=0,
        use_gpu=False,
        top_k_per_sample=4,
        no_ans_boost=None,
        top_k_per_candidate=4,
    )

    for top_k in (2, 5, 10):
        result = farm_reader.predict(question="Who lives in Berlin?", documents=documents, top_k=top_k)
        assert len(result["answers"]) == top_k
def test_context_window_size(test_docs_xs):
    """Check the length of the answer contexts FARMReader returns for several context_window_size values."""
    # TODO parametrize window_size and farm/transformers reader using pytest
    documents = []
    for entry in test_docs_xs:
        documents.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

    for window_size in (10, 15, 20):
        # A fresh reader per window size, since the size is fixed at construction time
        farm_reader = FARMReader(
            model_name_or_path="distilbert-base-uncased-distilled-squad",
            num_processes=0,
            use_gpu=False,
            top_k_per_sample=5,
            no_ans_boost=None,
            context_window_size=window_size,
        )
        result = farm_reader.predict(question="Who lives in Berlin?", documents=documents, top_k=5)

        for answer in result["answers"]:
            answer_length = len(answer["answer"])
            context_length = len(answer["context"])

            # If the extracted answer is larger than the context window, the context window is expanded.
            # If the extracted answer is odd in length, the resulting context window is one less than
            # context_window_size due to rounding (See FARM's QACandidate)
            # TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different
            if answer_length > window_size:
                assert answer_length == context_length
            else:
                assert context_length in [window_size, window_size - 1]
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Keep documents in memory, keyed by their id, and register their tags.

    :param documents: Plain dictionaries or Haystack Document objects.
                      A dictionary needs at least {"text": "<the-actual-text>"}.
                      Optionally, you can also supply "tags": ["one-tag", "another-one"]
                      or additional meta data via
                      "meta": {"name": "<some-document-name>", "author": "someone", "url": "some-url", ...}
    :param index: Namespace (key of self.indexes) the documents are written to, e.g. to keep
                  evaluation documents apart from search documents. Falls back to self.index
                  when omitted (or falsy).
    :return: None
    """
    namespace = index or self.index

    # Convert everything first, so a failing conversion leaves the store untouched
    converted = []
    for entry in documents:
        converted.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

    for doc in converted:
        # A document with an already known id replaces the stored one
        self.indexes[namespace][doc.id] = doc

        # TODO fix tags after id refactoring
        self._map_tags_to_ids(doc.id, doc.tags)