def test_write_with_duplicate_doc_ids(document_store):
    documents = [
        Document(text="Doc1", id_hash_keys=["key1"]),
        Document(text="Doc2", id_hash_keys=["key1"])
    ]
    with pytest.raises(Exception):
        document_store.write_documents(documents)

def test_get_meta_values_by_key(document_store):
    documents = [
        Document(text="Doc1", meta={"meta_key_1": "1", "meta_key_2": "11"}),
        Document(text="Doc2", meta={"meta_key_1": "2", "meta_key_2": "22"}),
        Document(text="Doc3", meta={"meta_key_1": "3", "meta_key_2": "33"})
    ]
    document_store.write_documents(documents)

    # test without filters or query
    result = document_store.get_metadata_values_by_key(key="meta_key_1")
    for bucket in result:
        assert bucket["value"] in ["1", "2", "3"]
        assert bucket["count"] == 1

    # test with filters but no query
    result = document_store.get_metadata_values_by_key(key="meta_key_1", filters={"meta_key_2": ["11", "22"]})
    for bucket in result:
        assert bucket["value"] in ["1", "2"]
        assert bucket["count"] == 1

    # test with query but no filters
    result = document_store.get_metadata_values_by_key(key="meta_key_1", query="Doc1")
    for bucket in result:
        assert bucket["value"] in ["1"]
        assert bucket["count"] == 1

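# Illustration of the aggregation shape the assertions above rely on:
# get_metadata_values_by_key() yields one bucket per distinct metadata value,
# each carrying its document count. These are the exact buckets the unfiltered
# three-document fixture would produce.
_expected_buckets = [
    {"value": "1", "count": 1},
    {"value": "2", "count": 1},
    {"value": "3", "count": 1},
]
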
def query_by_embedding(
    self,
    query_emb: List[float],
    filters: Optional[Dict[str, List[str]]] = None,
    top_k: int = 10,
    index: Optional[str] = None,
    return_embedding: Optional[bool] = None,
) -> List[Document]:
    """
    Find the documents that are most similar to the provided `query_emb` by using a vector similarity metric.

    :param query_emb: Embedding of the query (e.g. gathered from DPR)
    :param filters: Optional filters to narrow down the search space.
                    Example: {"name": ["some", "more"], "category": ["only_one"]}
    :param top_k: How many documents to return
    :param index: Index name for storing the docs and metadata
    :param return_embedding: Whether to return the document embeddings
    :return: The `top_k` documents most similar to `query_emb`, sorted by score in descending order
    """
    from numpy import dot
    from numpy.linalg import norm
    from scipy.spatial.distance import cosine  # needed for the "cosine" branch below

    if filters:
        raise NotImplementedError(
            "Setting `filters` is currently not supported in "
            "InMemoryDocumentStore.query_by_embedding(). Please remove filters or "
            "use a different DocumentStore (e.g. ElasticsearchDocumentStore)."
        )

    index = index or self.index
    if return_embedding is None:
        return_embedding = self.return_embedding

    if query_emb is None:
        return []

    candidate_docs = []
    for idx, doc in self.indexes[index].items():
        curr_meta = deepcopy(doc.meta)
        new_document = Document(id=doc.id, text=doc.text, meta=curr_meta, embedding=doc.embedding)
        new_document.embedding = doc.embedding if return_embedding is True else None

        if self.similarity == "dot_product":
            # note: the dot product is normalized by both vector norms here,
            # so the resulting score equals cosine similarity
            score = dot(query_emb, doc.embedding) / (norm(query_emb) * norm(doc.embedding))
        elif self.similarity == "cosine":
            # cosine similarity score = 1 - cosine distance
            score = 1 - cosine(query_emb, doc.embedding)
        new_document.score = score
        new_document.probability = (score + 1) / 2
        candidate_docs.append(new_document)

    return sorted(candidate_docs, key=lambda x: x.score if x.score is not None else 0.0, reverse=True)[0:top_k]

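# Minimal usage sketch for query_by_embedding(). Import paths and constructor
# arguments assume the Haystack 0.x API; the 3-dimensional toy embeddings are
# invented for illustration (real retrievers typically produce 768-dim vectors).
from haystack import Document
from haystack.document_store.memory import InMemoryDocumentStore

store = InMemoryDocumentStore(embedding_dim=3, similarity="dot_product")
store.write_documents([
    Document(text="Berlin is the capital of Germany", embedding=[0.9, 0.1, 0.0]),
    Document(text="Paris is the capital of France", embedding=[0.1, 0.9, 0.0]),
])
results = store.query_by_embedding(query_emb=[1.0, 0.0, 0.0], top_k=1)
print(results[0].text, results[0].score)  # highest-scoring document first
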
def test_generate_doc_id_using_text():
    text1 = "text1"
    text2 = "text2"
    doc1_text1 = Document(text=text1, meta={"name": "doc1"})
    doc2_text1 = Document(text=text1, meta={"name": "doc2"})
    doc3_text2 = Document(text=text2, meta={"name": "doc3"})

    assert doc1_text1.id == doc2_text1.id
    assert doc1_text1.id != doc3_text2.id

def test_dpr_retrieval(document_store, retriever, return_embedding):
    documents = [
        Document(
            text="""Aaron Aaron ( or ; ""Ahärôn"") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother's spokesman (""prophet"") to the Pharaoh. Part of the Law (Torah) that Moses received from""",
            meta={"name": "0"}
        ),
        Document(
            text="""Democratic Republic of the Congo to the south. Angola's capital, Luanda, lies on the Atlantic coast in the northwest of the country. Angola, although located in a tropical zone, has a climate that is not characterized for this region, due to the confluence of three factors: As a result, Angola's climate is characterized by two seasons: rainfall from October to April and drought, known as ""Cacimbo"", from May to August, drier, as the name implies, and with lower temperatures. On the other hand, while the coastline has high rainfall rates, decreasing from North to South and from to , with""",
        ),
        Document(
            text="""Schopenhauer, describing him as an ultimately shallow thinker: ""Schopenhauer has quite a crude mind ... where real depth starts, his comes to an end."" His friend Bertrand Russell had a low opinion on the philosopher, and attacked him in his famous ""History of Western Philosophy"" for hypocritically praising asceticism yet not acting upon it. On the opposite isle of Russell on the foundations of mathematics, the Dutch mathematician L. E. J. Brouwer incorporated the ideas of Kant and Schopenhauer in intuitionism, where mathematics is considered a purely mental activity, instead of an analytic activity wherein objective properties of reality are""",
            meta={"name": "1"}
        ),
        Document(
            text="""The Dothraki vocabulary was created by David J. Peterson well in advance of the adaptation. HBO hired the Language Creatio""",
            meta={"name": "2"}
        ),
        Document(
            text="""The title of the episode refers to the Great Sept of Baelor, the main religious building in King's Landing, where the episode's pivotal scene takes place. In the world created by George R. R. Martin""",
            meta={}
        )
    ]

    document_store.return_embedding = return_embedding
    document_store.write_documents(documents)
    document_store.update_embeddings(retriever=retriever)
    time.sleep(1)

    docs_with_emb = document_store.get_all_documents()

    if return_embedding is True:
        assert len(docs_with_emb[0].embedding) == 768
        assert abs(docs_with_emb[0].embedding[0] - (-0.3063)) < 0.001
        assert abs(docs_with_emb[1].embedding[0] - (-0.3914)) < 0.001
        assert abs(docs_with_emb[2].embedding[0] - (-0.2470)) < 0.001
        assert abs(docs_with_emb[3].embedding[0] - (-0.0802)) < 0.001
        assert abs(docs_with_emb[4].embedding[0] - (-0.0551)) < 0.001

    res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?")
    assert res[0].meta["name"] == "1"

    # test embedding
    if return_embedding is True:
        assert res[0].embedding is not None
    else:
        assert res[0].embedding is None

    # test filtering
    if not isinstance(document_store, FAISSDocumentStore):
        res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?", filters={"name": ["0", "2"]})
        assert len(res) == 2
        for r in res:
            assert r.meta["name"] in ["0", "2"]

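# Sketch of the retriever fixture the test above assumes (hedged, Haystack 0.x
# API): a DPR retriever built from the standard Natural Questions checkpoints,
# attached to whatever document_store the fixture provides.
from haystack.retriever.dense import DensePassageRetriever

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=False,
)
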
def test_write_with_duplicate_doc_ids(document_store):
    doc_id = get_uuid()
    documents = [
        Document(text="Doc1", id=doc_id, embedding=np.random.rand(embedding_dim).astype(np.float32)),
        Document(text="Doc2", id=doc_id, embedding=np.random.rand(embedding_dim).astype(np.float32))
    ]
    document_store.write_documents(documents, duplicate_documents="skip")
    with pytest.raises(Exception):
        document_store.write_documents(documents, duplicate_documents="fail")

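# Usage sketch (hedged): besides the "skip" and "fail" policies exercised above,
# Haystack's document stores also accept duplicate_documents="overwrite", which
# replaces the stored document carrying the same id instead of raising.
same_id = get_uuid()
v1 = Document(text="Doc v1", id=same_id, embedding=np.random.rand(embedding_dim).astype(np.float32))
v2 = Document(text="Doc v2", id=same_id, embedding=np.random.rand(embedding_dim).astype(np.float32))
document_store.write_documents([v1])
document_store.write_documents([v2], duplicate_documents="overwrite")  # v2 replaces v1
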
def _convert_es_hit_to_document(self, hit: dict, adapt_score_for_embedding: bool = False) -> Document:
    # We put all additional data of the doc into meta_data and return it in the API
    meta_data = {
        k: v
        for k, v in hit["_source"].items()
        if k not in (self.text_field, self.faq_question_field, self.embedding_field)
    }
    name = meta_data.pop(self.name_field, None)
    if name:
        meta_data["name"] = name

    score = hit["_score"] if hit["_score"] else None
    if score:
        if adapt_score_for_embedding:
            score -= 1000  # remove the offset added at query time (ES does not allow negative scores)
            probability = (score + 1) / 2  # scaling probability from cosine similarity
        else:
            probability = float(expit(np.asarray(score / 8)))  # scaling probability from TFIDF/BM25
    else:
        probability = None

    document = Document(
        id=hit["_id"],
        text=hit["_source"].get(self.text_field),
        meta=meta_data,
        score=score,
        probability=probability,
        question=hit["_source"].get(self.faq_question_field),
        embedding=hit["_source"].get(self.embedding_field, None),
    )
    return document

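# Worked example (all values invented) of the two probability scalings above.
import numpy as np
from scipy.special import expit

# Embedding case: the raw cosine similarity is recovered by removing the
# offset, then mapped linearly from [-1, 1] onto [0, 1].
hit_score = 1000.7
raw_score = hit_score - 1000          # recovered cosine similarity: ~0.7
prob_embedding = (raw_score + 1) / 2  # -> ~0.85

# TFIDF/BM25 case: unbounded relevance scores are squashed with a logistic curve.
prob_bm25 = float(expit(np.asarray(12.3 / 8)))  # -> ~0.82
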
def test_context_window_size(reader, test_docs_xs, window_size):
    docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]

    assert isinstance(reader, FARMReader)

    old_window_size = reader.inferencer.model.prediction_heads[0].context_window_size
    reader.inferencer.model.prediction_heads[0].context_window_size = window_size

    prediction = reader.predict(query="Who lives in Berlin?", documents=docs, top_k=5)
    for answer in prediction["answers"]:
        # If the extracted answer is larger than the context window, the context window is expanded.
        # If the extracted answer is odd in length, the resulting context window is one less than
        # context_window_size due to rounding (see FARM's QACandidate).
        # TODO: currently the behaviour of context_window_size differs between FARMReader and TransformersReader
        if len(answer["answer"]) <= window_size:
            assert len(answer["context"]) in [window_size, window_size - 1]
        else:
            assert len(answer["answer"]) == len(answer["context"])

    # restore the original window size so other tests are unaffected
    reader.inferencer.model.prediction_heads[0].context_window_size = old_window_size

def test_generate_doc_id_using_custom_list():
    text1 = "text1"
    text2 = "text2"
    doc1_text1 = Document(text=text1, meta={"name": "doc1"}, id_hash_keys=["key1", text1])
    doc2_text1 = Document(text=text1, meta={"name": "doc2"}, id_hash_keys=["key1", text1])
    doc3_text2 = Document(text=text2, meta={"name": "doc3"}, id_hash_keys=["key1", text2])

    assert doc1_text1.id == doc2_text1.id
    assert doc1_text1.id != doc3_text2.id

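# Illustrative sketch of the contract both id-generation tests rely on: the id
# is a deterministic hash of the hash keys, so equal keys yield equal ids. This
# is not necessarily Haystack's exact algorithm, only the property under test.
import hashlib

def _make_id(hash_keys):
    return hashlib.sha256("".join(hash_keys).encode("utf-8")).hexdigest()

assert _make_id(["key1", "text1"]) == _make_id(["key1", "text1"])  # same keys -> same id
assert _make_id(["key1", "text1"]) != _make_id(["key1", "text2"])  # different keys -> different id
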
def _convert_weaviate_result_to_document(self, result: dict, return_embedding: bool) -> Document:
    """
    Convert a Weaviate result dict into a Haystack Document object. This is more involved because
    the Weaviate search result dict varies between the get and query interfaces.
    The Weaviate get methods return the data items under the "properties" key, whereas the query methods don't.
    """
    score = None
    probability = None
    text = ""
    question = None

    id = result.get("id")
    embedding = result.get("vector")

    # If the "properties" key is present, get all document fields from it;
    # otherwise, do a direct lookup in the result root dict.
    props = result.get("properties")
    if not props:
        props = result

    if props.get(self.text_field) is not None:
        text = str(props.get(self.text_field))

    if props.get(self.faq_question_field) is not None:
        question = props.get(self.faq_question_field)

    # Weaviate creates the "_additional" key for semantic search
    if "_additional" in props:
        if "certainty" in props["_additional"]:
            score = props["_additional"]["certainty"]
            probability = score
        if "id" in props["_additional"]:
            id = props["_additional"]["id"]
        if "vector" in props["_additional"]:
            embedding = props["_additional"]["vector"]
        props.pop("_additional", None)

    # We put all additional data of the doc into meta_data and return it in the API
    meta_data = {
        k: v
        for k, v in props.items()
        if k not in (self.text_field, self.faq_question_field, self.embedding_field)
    }

    if return_embedding and embedding:
        embedding = np.asarray(embedding, dtype=np.float32)

    document = Document(
        id=id,
        text=text,
        meta=meta_data,
        score=score,
        probability=probability,
        question=question,
        embedding=embedding,
    )
    return document

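# Hedged illustration of the two result shapes the converter handles (all
# values invented). The "get" interface nests fields under "properties"; the
# "query" interface returns them at the root, with search metadata under
# "_additional".
_get_style_result = {
    "id": "some-uuid",
    "vector": [0.1, 0.2],
    "properties": {"text": "Hello world", "name": "doc1"},
}
_query_style_result = {
    "text": "Hello world",
    "name": "doc1",
    "_additional": {"certainty": 0.91, "id": "some-uuid", "vector": [0.1, 0.2]},
}
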
def eval_data_from_file(filename: str, max_docs: Optional[int] = None) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a SQuAD-style file.
    Documents and Labels can then be indexed to the DocumentStore and be used for evaluation.

    :param filename: Path to file in SQuAD format
    :param max_docs: Maximum number of documents to load. By default, this is set to None,
                     thus reading in all available eval documents.
    :return: (List of Documents, List of Labels)
    """
    docs = []
    labels = []

    with open(filename, "r") as file:
        data = json.load(file)
        if "title" not in data["data"][0]:
            logger.warning(f"No title information found for documents in QA file: {filename}")
        for document in data["data"][:max_docs]:
            # get all extra fields from document level (e.g. title)
            meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
            for paragraph in document["paragraphs"]:
                cur_meta = {"name": document.get("title", None)}
                # all other fields from paragraph level
                meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
                cur_meta.update(meta_paragraph)
                # meta from parent document
                cur_meta.update(meta_doc)

                # Create Document
                cur_doc = Document(text=paragraph["context"], meta=cur_meta)
                docs.append(cur_doc)

                # Get Labels
                for qa in paragraph["qas"]:
                    if len(qa["answers"]) > 0:
                        for answer in qa["answers"]:
                            label = Label(
                                question=qa["question"],
                                answer=answer["text"],
                                is_correct_answer=True,
                                is_correct_document=True,
                                document_id=cur_doc.id,
                                offset_start_in_doc=answer["answer_start"],
                                no_answer=qa["is_impossible"],
                                origin="gold_label",
                            )
                            labels.append(label)
                    else:
                        label = Label(
                            question=qa["question"],
                            answer="",
                            is_correct_answer=True,
                            is_correct_document=True,
                            document_id=cur_doc.id,
                            offset_start_in_doc=0,
                            no_answer=qa["is_impossible"],
                            origin="gold_label",
                        )
                        labels.append(label)
    return docs, labels

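# Minimal usage sketch. The file path is hypothetical; write_labels() is the
# Haystack DocumentStore method for indexing evaluation labels alongside docs.
docs, labels = eval_data_from_file("data/squad_dev_sample.json", max_docs=10)
print(f"loaded {len(docs)} documents and {len(labels)} labels")
document_store.write_documents(docs)
document_store.write_labels(labels)
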
# pytest fixture providing sample docs in the three accepted input formats
# (consumed e.g. by test_context_window_size above)
@pytest.fixture
def test_docs_xs():
    return [
        # current "dict" format for a document
        {"text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1", "name": "filename1"}},
        # meta_field at the top level for backward compatibility
        {"text": "My name is Paul and I live in New York", "meta_field": "test2", "name": "filename2"},
        # Document object for a doc
        Document(text="My name is Christelle and I live in Paris", meta={"meta_field": "test3", "name": "filename3"})
    ]

def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Indexes documents for later queries in Elasticsearch.

    When using explicit document IDs, any existing document with the same ID gets updated.

    :param documents: A list of Python dictionaries or a list of Haystack Document objects.
                      For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                      Optionally: Include meta data via
                      {"text": "<the-actual-text>", "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                      It can be used for filtering and is accessible in the responses of the Finder.
                      Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                      should be changed to what you have set for self.text_field and self.name_field.
    :param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
    :return: None
    """
    if index and not self.client.indices.exists(index=index):
        self._create_document_index(index)

    if index is None:
        index = self.index

    # Make sure we comply with the Document class format
    documents_objects = [
        Document.from_dict(d, field_map=self._create_document_field_map()) if isinstance(d, dict) else d
        for d in documents
    ]

    documents_to_index = []
    for doc in documents_objects:
        _doc = {
            "_op_type": "index" if self.update_existing_documents else "create",
            "_index": index,
            **doc.to_dict(field_map=self._create_document_field_map()),
        }  # type: Dict[str, Any]

        # cast embedding type as ES cannot deal with np.array
        if _doc[self.embedding_field] is not None:
            if type(_doc[self.embedding_field]) == np.ndarray:
                _doc[self.embedding_field] = _doc[self.embedding_field].tolist()

        # rename id for elastic
        _doc["_id"] = str(_doc.pop("id"))

        # don't index query score and empty fields
        _ = _doc.pop("score", None)
        _ = _doc.pop("probability", None)
        _doc = {k: v for k, v in _doc.items() if v is not None}

        # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
        # we "unnest" all values within "meta"
        if "meta" in _doc.keys():
            for k, v in _doc["meta"].items():
                _doc[k] = v
            _doc.pop("meta")
        documents_to_index.append(_doc)

    bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)

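# Illustration (values invented) of the transformation applied per document
# before bulk indexing: "id" is renamed to "_id", "score"/"probability" and
# None-valued fields are dropped, and "meta" keys are unnested into the flat
# ES document. "_op_type" is "index" when update_existing_documents is
# enabled, "create" otherwise.
_before = {
    "id": "abc", "text": "hello", "score": None, "probability": None,
    "meta": {"name": "filename1", "author": "somebody"},
}
_after = {
    "_op_type": "index", "_index": "document", "_id": "abc",
    "text": "hello", "name": "filename1", "author": "somebody",
}
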