Example #1
def test_get_meta_values_by_key(document_store):
    documents = [
        Document(
            text="Doc1",
            meta={"meta_key_1": "1", "meta_key_2": "11"}
        ),
        Document(
            text="Doc2",
            meta={"meta_key_1": "2", "meta_key_2": "22"}
        ),
        Document(
            text="Doc3",
            meta={"meta_key_1": "3", "meta_key_2": "33"}
        )
    ]
    document_store.write_documents(documents)

    # test without filters or query
    result = document_store.get_metadata_values_by_key(key="meta_key_1")
    for bucket in result:
        assert bucket["value"] in ["1", "2", "3"]
        assert bucket["count"] == 1

    # test with filters but no query
    result = document_store.get_metadata_values_by_key(key="meta_key_1", filters={"meta_key_2": ["11", "22"]})
    for bucket in result:
        assert bucket["value"] in ["1", "2"]
        assert bucket["count"] == 1

    # test with query but no filters
    result = document_store.get_metadata_values_by_key(key="meta_key_1", query="Doc1")
    for bucket in result:
        assert bucket["value"] in ["1"]
        assert bucket["count"] == 1
Example #2
def test_write_with_duplicate_doc_ids(document_store):
    documents = [
        Document(text="Doc1", id_hash_keys=["key1"]),
        Document(text="Doc2", id_hash_keys=["key1"])
    ]
    with pytest.raises(Exception):
        document_store.write_documents(documents)
Example #3
def test_elasticsearch_update_meta(document_store):
    documents = [
        Document(text="Doc1", meta={
            "meta_key_1": "1",
            "meta_key_2": "1"
        }),
        Document(text="Doc2", meta={
            "meta_key_1": "2",
            "meta_key_2": "2"
        }),
        Document(text="Doc3", meta={
            "meta_key_1": "3",
            "meta_key_2": "3"
        })
    ]
    document_store.write_documents(documents)
    document_2 = document_store.get_all_documents(
        filters={"meta_key_2": ["2"]})[0]
    document_store.update_document_meta(document_2.id,
                                        meta={
                                            "meta_key_1": "99",
                                            "meta_key_2": "2"
                                        })
    updated_document = document_store.get_document_by_id(document_2.id)
    assert len(updated_document.meta.keys()) == 2
    assert updated_document.meta["meta_key_1"] == "99"
    assert updated_document.meta["meta_key_2"] == "2"
Example #4
def test_write_document_meta(document_store):
    documents = [
        {
            "text": "dict_without_meta",
            "id": "1"
        },
        {
            "text": "dict_with_meta",
            "meta_field": "test2",
            "name": "filename2",
            "id": "2"
        },
        Document(text="document_object_without_meta", id="3"),
        Document(text="document_object_with_meta",
                 meta={
                     "meta_field": "test4",
                     "name": "filename3"
                 },
                 id="4"),
    ]
    document_store.write_documents(documents)
    documents_in_store = document_store.get_all_documents()
    assert len(documents_in_store) == 4

    assert not document_store.get_document_by_id("1").meta
    assert document_store.get_document_by_id("2").meta["meta_field"] == "test2"
    assert not document_store.get_document_by_id("3").meta
    assert document_store.get_document_by_id("4").meta["meta_field"] == "test4"
Example #5
def test_generate_doc_id_using_text():
    text1 = "text1"
    text2 = "text2"
    doc1_text1 = Document(text=text1, meta={"name": "doc1"})
    doc2_text1 = Document(text=text1, meta={"name": "doc2"})
    doc3_text2 = Document(text=text2, meta={"name": "doc3"})

    assert doc1_text1.id == doc2_text1.id
    assert doc1_text1.id != doc3_text2.id
Example #6
def test_dpr_retrieval(document_store, retriever, return_embedding):

    documents = [
        Document(
            text="""Aaron Aaron ( or ; ""Ahärôn"") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother's spokesman (""prophet"") to the Pharaoh. Part of the Law (Torah) that Moses received from""",
            meta={"name": "0"}
        ),
        Document(
            text="""Democratic Republic of the Congo to the south. Angola's capital, Luanda, lies on the Atlantic coast in the northwest of the country. Angola, although located in a tropical zone, has a climate that is not characterized for this region, due to the confluence of three factors: As a result, Angola's climate is characterized by two seasons: rainfall from October to April and drought, known as ""Cacimbo"", from May to August, drier, as the name implies, and with lower temperatures. On the other hand, while the coastline has high rainfall rates, decreasing from North to South and from to , with""",
        ),
        Document(
            text="""Schopenhauer, describing him as an ultimately shallow thinker: ""Schopenhauer has quite a crude mind ... where real depth starts, his comes to an end."" His friend Bertrand Russell had a low opinion on the philosopher, and attacked him in his famous ""History of Western Philosophy"" for hypocritically praising asceticism yet not acting upon it. On the opposite isle of Russell on the foundations of mathematics, the Dutch mathematician L. E. J. Brouwer incorporated the ideas of Kant and Schopenhauer in intuitionism, where mathematics is considered a purely mental activity, instead of an analytic activity wherein objective properties of reality are""",
            meta={"name": "1"}
        ),
        Document(
            text="""The Dothraki vocabulary was created by David J. Peterson well in advance of the adaptation. HBO hired the Language Creatio""",
            meta={"name": "2"}
        ),
        Document(
            text="""The title of the episode refers to the Great Sept of Baelor, the main religious building in King's Landing, where the episode's pivotal scene takes place. In the world created by George R. R. Martin""",
            meta={}
        )
    ]

    document_store.return_embedding = return_embedding
    document_store.write_documents(documents)
    document_store.update_embeddings(retriever=retriever)

    time.sleep(1)

    docs_with_emb = document_store.get_all_documents()

    if return_embedding is True:
        assert (len(docs_with_emb[0].embedding) == 768)
        assert (abs(docs_with_emb[0].embedding[0] - (-0.3063)) < 0.001)
        assert (abs(docs_with_emb[1].embedding[0] - (-0.3914)) < 0.001)
        assert (abs(docs_with_emb[2].embedding[0] - (-0.2470)) < 0.001)
        assert (abs(docs_with_emb[3].embedding[0] - (-0.0802)) < 0.001)
        assert (abs(docs_with_emb[4].embedding[0] - (-0.0551)) < 0.001)

    res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?")

    assert res[0].meta["name"] == "1"

    # test embedding
    if return_embedding is True:
        assert res[0].embedding is not None
    else:
        assert res[0].embedding is None

    # test filtering
    if not isinstance(document_store, FAISSDocumentStore):
        res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?", filters={"name": ["0", "2"]})
        assert len(res) == 2
        for r in res:
            assert r.meta["name"] in ["0", "2"]
Example #7
def test_write_with_duplicate_doc_ids(document_store):
    id = get_uuid()
    documents = [
        Document(text="Doc1",
                 id=id,
                 embedding=np.random.rand(embedding_dim).astype(np.float32)),
        Document(text="Doc2",
                 id=id,
                 embedding=np.random.rand(embedding_dim).astype(np.float32))
    ]
    document_store.write_documents(documents, duplicate_documents="skip")
    with pytest.raises(Exception):
        document_store.write_documents(documents, duplicate_documents="fail")
Example #8
def test_get_all_document_filter_duplicate_value(document_store):
    documents = [
        Document(text="Doc1", meta={"f1": "0"}),
        Document(text="Doc1", meta={
            "f1": "1",
            "meta_id": "0"
        }),
        Document(text="Doc2", meta={"f3": "0"})
    ]
    document_store.write_documents(documents)
    documents = document_store.get_all_documents(filters={"f1": ["1"]})
    assert documents[0].text == "Doc1"
    assert len(documents) == 1
    assert {d.meta["meta_id"] for d in documents} == {"0"}
Example #9
    def _convert_es_hit_to_document(self, hit: dict, adapt_score_for_embedding: bool = False) -> Document:
        # We put all additional data of the doc into meta_data and return it in the API
        meta_data = {k: v for k, v in hit["_source"].items()
                     if k not in (self.text_field, self.faq_question_field, self.embedding_field)}
        name = meta_data.pop(self.name_field, None)
        if name:
            meta_data["name"] = name

        score = hit["_score"] if hit["_score"] else None
        if score:
            if adapt_score_for_embedding:
                score -= 1000
                probability = (score + 1) / 2  # scaling probability from cosine similarity
            else:
                probability = float(expit(np.asarray(score / 8)))  # scaling probability from TFIDF/BM25
        else:
            probability = None
        document = Document(
            id=hit["_id"],
            text=hit["_source"].get(self.text_field),
            meta=meta_data,
            score=score,
            probability=probability,
            question=hit["_source"].get(self.faq_question_field),
            embedding=hit["_source"].get(self.embedding_field, None)
        )
        return document
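For orientation, the raw Elasticsearch hit consumed by this converter looks roughly like the following sketch, assuming the default field names text_field="text", name_field="name" and embedding_field="embedding" (all concrete values here are made up):

# Hypothetical raw hit as returned by the Elasticsearch client.
hit = {
    "_id": "a1b2c3",
    "_score": 12.4,
    "_source": {
        "text": "Berlin is the capital of Germany.",
        "name": "wiki_berlin.txt",
        "year": "2020",  # any extra field ends up in Document.meta
    },
}
# After conversion: meta == {"name": "wiki_berlin.txt", "year": "2020"},
# score == 12.4, and probability == expit(12.4 / 8) on the TFIDF/BM25 path.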
Example #10
    def predict_on_texts(self,
                         question: str,
                         texts: List[str],
                         top_k: Optional[int] = None):
        """
        Use the loaded QA model to find answers for a question in the supplied list of texts.
        Returns dictionaries containing answers sorted by (desc.) probability.
        Example:

            {
                'question': 'Who is the father of Arya Stark?',
                'answers':[
                             {'answer': 'Eddard,',
                             'context': " She travels with her father, Eddard, to King's Landing when he is ",
                             'offset_answer_start': 147,
                             'offset_answer_end': 154,
                             'probability': 0.9787139466668613,
                             'score': None,
                             'document_id': '1337'
                             },...
                          ]
            }

        :param question: Question string
        :param texts: List of texts (plain strings) in which to search for answers
        :param top_k: The maximum number of answers to return
        :return: Dict containing question and answers
        """
        documents = []
        for text in texts:
            documents.append(Document(text=text))
        predictions = self.predict(question, documents, top_k)
        return predictions
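A minimal usage sketch, assuming an initialized FARMReader (the model name is only an example, and the import path shown is the Haystack 0.x layout):

from haystack.reader.farm import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
result = reader.predict_on_texts(
    question="Who is the father of Arya Stark?",
    texts=["Arya Stark is the daughter of Eddard Stark.",
           "Jon Snow lives at the Wall."],
    top_k=1,
)
print(result["answers"][0]["answer"])  # e.g. a span such as "Eddard Stark"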
Example #11
def test_generate_doc_id_using_custom_list():
    text1 = "text1"
    text2 = "text2"

    doc1_text1 = Document(text=text1,
                          meta={"name": "doc1"},
                          id_hash_keys=["key1", text1])
    doc2_text1 = Document(text=text1,
                          meta={"name": "doc2"},
                          id_hash_keys=["key1", text1])
    doc3_text2 = Document(text=text2,
                          meta={"name": "doc3"},
                          id_hash_keys=["key1", text2])

    assert doc1_text1.id == doc2_text1.id
    assert doc1_text1.id != doc3_text2.id
Example #12
    def _convert_weaviate_result_to_document(
            self, result: dict, return_embedding: bool) -> Document:
        """
        Convert a Weaviate result dict into a Haystack Document object. This is more involved
        because the Weaviate search result dict varies between the get and query interfaces:
        Weaviate get methods return the data items under the "properties" key, whereas the query interface does not.
        """
        score = None
        probability = None
        text = ""
        question = None

        id = result.get("id")
        embedding = result.get("vector")

        # If properties key is present, get all the document fields from it.
        # otherwise, a direct lookup in result root dict
        props = result.get("properties")
        if not props:
            props = result

        if props.get(self.text_field) is not None:
            text = str(props.get(self.text_field))

        if props.get(self.faq_question_field) is not None:
            question = props.get(self.faq_question_field)

        # Weaviate creates "_additional" key for semantic search
        if "_additional" in props:
            if "certainty" in props["_additional"]:
                score = props["_additional"]['certainty']
                probability = score
            if "id" in props["_additional"]:
                id = props["_additional"]['id']
            if "vector" in props["_additional"]:
                embedding = props["_additional"]['vector']
            props.pop("_additional", None)

        # We put all additional data of the doc into meta_data and return it in the API
        meta_data = {
            k: v
            for k, v in props.items()
            if k not in (self.text_field, self.faq_question_field,
                         self.embedding_field)
        }

        if return_embedding and embedding:
            embedding = np.asarray(embedding, dtype=np.float32)

        document = Document(
            id=id,
            text=text,
            meta=meta_data,
            score=score,
            probability=probability,
            question=question,
            embedding=embedding,
        )
        return document
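A hypothetical query-interface result and what the converter derives from it (the property names besides the text field are made up):

# Hypothetical Weaviate query result, assuming text_field="text".
result = {
    "text": "Angola's capital is Luanda.",
    "category": "geography",  # extra property -> Document.meta
    "_additional": {
        "certainty": 0.91,
        "id": "some-uuid",     # hypothetical UUID
        "vector": [0.1, 0.2],
    },
}
# After conversion: score == probability == 0.91, meta == {"category": "geography"},
# and id/embedding are taken from "_additional" in preference to the root keys.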
Example #13
    def _convert_sql_row_to_document(self, row) -> Document:
        document = Document(id=row.id,
                            text=row.text,
                            meta={meta.name: meta.value
                                  for meta in row.meta})
        if row.vector_id:
            document.meta["vector_id"] = row.vector_id
        return document
Example #14
    def query_by_embedding(
            self,
            query_emb: List[float],
            filters: Optional[Dict[str, List[str]]] = None,
            top_k: int = 10,
            index: Optional[str] = None,
            return_embedding: Optional[bool] = None) -> List[Document]:
        """
        Find the documents that are most similar to the provided `query_emb` by using a vector similarity metric.

        :param query_emb: Embedding of the query (e.g. gathered from DPR)
        :param filters: Optional filters to narrow down the search space.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param top_k: How many documents to return
        :param index: Index name for storing the docs and metadata
        :param return_embedding: To return document embedding
        :return: List of Documents sorted by similarity score in descending order
        """

        from numpy import dot
        from numpy.linalg import norm

        if filters:
            raise NotImplementedError(
                "Setting `filters` is currently not supported in "
                "InMemoryDocumentStore.query_by_embedding(). Please remove filters or "
                "use a different DocumentStore (e.g. ElasticsearchDocumentStore)."
            )

        index = index or self.index
        if return_embedding is None:
            return_embedding = self.return_embedding

        if query_emb is None:
            return []

        candidate_docs = []
        for idx, doc in self.indexes[index].items():
            curr_meta = deepcopy(doc.meta)
            new_document = Document(id=doc.id,
                                    text=doc.text,
                                    meta=curr_meta,
                                    embedding=doc.embedding)
            new_document.embedding = doc.embedding if return_embedding is True else None

            if self.similarity == "dot_product":
                score = dot(query_emb, doc.embedding) / (norm(query_emb) *
                                                         norm(doc.embedding))
            elif self.similarity == "cosine":
                # cosine similarity score = 1 - cosine distance
                score = 1 - cosine(query_emb, doc.embedding)
            new_document.score = score
            new_document.probability = (score + 1) / 2
            candidate_docs.append(new_document)

        return sorted(candidate_docs,
                      key=lambda x: x.score if x.score is not None else 0.0,
                      reverse=True)[0:top_k]
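A usage sketch for this brute-force search, assuming document_store is an InMemoryDocumentStore whose documents already carry embeddings (768 matches the DPR embedding dimension):

import numpy as np

query_emb = np.random.rand(768).astype(np.float32)
for doc in document_store.query_by_embedding(query_emb, top_k=3):
    print(f"{doc.probability:.3f}  {doc.text[:60]}")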
Example #15
def eval_data_from_file(filename: str, max_docs: Optional[int] = None) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a SQuAD-style file.
    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    :param filename: Path to file in SQuAD format
    :param max_docs: Maximum number of documents to load. Defaults to None, which reads in all available eval documents.
    :return: (List of Documents, List of Labels)
    """
    docs = []
    labels = []

    with open(filename, "r") as file:
        data = json.load(file)
        if "title" not in data["data"][0]:
            logger.warning(f"No title information found for documents in QA file: {filename}")
        for document in data["data"][:max_docs]:
            # get all extra fields from document level (e.g. title)
            meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
            for paragraph in document["paragraphs"]:
                cur_meta = {"name": document.get("title", None)}
                # all other fields from paragraph level
                meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
                cur_meta.update(meta_paragraph)
                # meta from parent document
                cur_meta.update(meta_doc)
                # Create Document
                cur_doc = Document(text=paragraph["context"], meta=cur_meta)
                docs.append(cur_doc)

                # Get Labels
                for qa in paragraph["qas"]:
                    if len(qa["answers"]) > 0:
                        for answer in qa["answers"]:
                            label = Label(
                                question=qa["question"],
                                answer=answer["text"],
                                is_correct_answer=True,
                                is_correct_document=True,
                                document_id=cur_doc.id,
                                offset_start_in_doc=answer["answer_start"],
                                no_answer=qa["is_impossible"],
                                origin="gold_label",
                            )
                            labels.append(label)
                    else:
                        label = Label(
                            question=qa["question"],
                            answer="",
                            is_correct_answer=True,
                            is_correct_document=True,
                            document_id=cur_doc.id,
                            offset_start_in_doc=0,
                            no_answer=qa["is_impossible"],
                            origin="gold_label",
                        )
                        labels.append(label)
        return docs, labels
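A usage sketch, assuming a ready document_store and a local SQuAD-style file (the filename is only an example):

docs, labels = eval_data_from_file("dev-v2.0.json", max_docs=10)
document_store.write_documents(docs)
document_store.write_labels(labels)
print(f"Indexed {len(docs)} documents and {len(labels)} labels")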
Example #16
def test_docs_xs():
    return [
        # current "dict" format for a document
        {"text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1", "name": "filename1"}},
        # meta_field at the top level for backward compatibility
        {"text": "My name is Paul and I live in New York", "meta_field": "test2", "name": "filename2"},
        # Document object for a doc
        Document(text="My name is Christelle and I live in Paris", meta={"meta_field": "test3", "name": "filename3"})
    ]
Example #17
    def get_all_documents(
            self,
            index: Optional[str] = None,
            filters: Optional[Dict[str, List[str]]] = None,
            return_embedding: Optional[bool] = None
    ) -> List[Document]:
        """
        Get documents from the document store.

        :param index: Name of the index to get the documents from. If None, the
                      DocumentStore's default index (self.index) will be used.
        :param filters: Optional filters to narrow down the documents to return.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param return_embedding: Whether to return the document embeddings.
        """

        index = index or self.index
        # ORM objects kept in memory generally cause performance issues,
        # so querying column names directly improves memory usage and performance.
        # See https://stackoverflow.com/questions/23185319/why-is-loading-sqlalchemy-objects-via-the-orm-5-8x-slower-than-rows-via-a-raw-my
        documents_query = self.session.query(
            DocumentORM.id,
            DocumentORM.text,
            DocumentORM.vector_id
        ).filter_by(index=index)

        if filters:
            documents_query = documents_query.join(MetaORM)
            for key, values in filters.items():
                documents_query = documents_query.filter(
                    MetaORM.name == key,
                    MetaORM.value.in_(values),
                    DocumentORM.id == MetaORM.document_id
                )

        documents_map = {}
        for row in documents_query.all():
            documents_map[row.id] = Document(
                id=row.id,
                text=row.text,
                meta=None if row.vector_id is None else {"vector_id": row.vector_id} # type: ignore
            )

        for doc_ids in self.chunked_iterable(documents_map.keys(), size=self.batch_size):
            meta_query = self.session.query(
                MetaORM.document_id,
                MetaORM.name,
                MetaORM.value
            ).filter(MetaORM.document_id.in_(doc_ids))

            for row in meta_query.all():
                if documents_map[row.document_id].meta is None:
                    documents_map[row.document_id].meta = {}
                documents_map[row.document_id].meta[row.name] = row.value # type: ignore

        return list(documents_map.values())
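A filtering sketch against this store, assuming documents were written with a "category" meta field (the field name and values are made up):

docs = document_store.get_all_documents(filters={"category": ["news", "blog"]})
for doc in docs:
    print(doc.id, doc.meta.get("category"))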
Example #18
    def retrieve(self,
                 query: str,
                 filters: dict = None,
                 top_k: Optional[int] = None,
                 index: str = None) -> List[Document]:
        """
        Scan through the documents in a DocumentStore and return a small number of documents
        that are most relevant to the query.

        :param query: The query
        :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
        :param top_k: How many documents to return per query.
        :param index: The name of the index in the DocumentStore from which to retrieve documents
        """
        if top_k is None:
            top_k = self.top_k
        if index is None:
            index = self.document_store.index

        body = {
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": ["text", "name"]
                }
            },
            "highlight": {
                "fields": {
                    "text": {}
                }
            },
            "_source": True
        }
        es_client = self.document_store.get_elastic_client()
        result = es_client.search(index=index, body=body)["hits"]["hits"]
        documents = []
        for hit in result:
            score = hit["_score"] if hit["_score"] else None
            if score:
                probability = float(expit(np.asarray(
                    score / 8)))  # scaling probability from TFIDF/BM25
            else:
                probability = None
            answer = hit['highlight']['text'][0]
            answer = answer.replace("<em>", "")
            answer = answer.replace("</em>", "")
            document = Document(
                id=hit["_id"],
                text=hit["_source"].get("text"),
                meta={"answer": answer},
                score=score,
                probability=probability,
                question=query,
            )
            documents.append(document)
        return documents
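A usage sketch, assuming retriever is an instance of the class above wrapping an ElasticsearchDocumentStore with indexed documents; note that the highlighted snippet ends up in doc.meta["answer"]:

docs = retriever.retrieve(query="Which philosopher attacked Schopenhauer?", top_k=3)
for doc in docs:
    print(doc.probability, doc.meta["answer"])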
Example #19
def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a SQuAD-style file.
    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    :param filename: Path to file in SQuAD format
    :return: (List of Documents, List of Labels)
    """
    docs = []
    labels = []

    with open(filename, "r") as file:
        data = json.load(file)
        for document in data["data"]:
            # get all extra fields from document level (e.g. title)
            meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
            for paragraph in document["paragraphs"]:
                cur_meta = {"name": document["title"]}
                # all other fields from paragraph level
                meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
                cur_meta.update(meta_paragraph)
                # meta from parent document
                cur_meta.update(meta_doc)
                # Create Document
                cur_doc = Document(text=paragraph["context"], meta=cur_meta)
                docs.append(cur_doc)

                # Get Labels
                for qa in paragraph["qas"]:
                    if len(qa["answers"]) > 0:
                        for answer in qa["answers"]:
                            label = Label(
                                question=qa["question"],
                                answer=answer["text"],
                                is_correct_answer=True,
                                is_correct_document=True,
                                document_id=cur_doc.id,
                                offset_start_in_doc=answer["answer_start"],
                                no_answer=qa["is_impossible"],
                                origin="gold_label",
                            )
                            labels.append(label)
                    else:
                        label = Label(
                            question=qa["question"],
                            answer="",
                            is_correct_answer=True,
                            is_correct_document=True,
                            document_id=cur_doc.id,
                            offset_start_in_doc=0,
                            no_answer=qa["is_impossible"],
                            origin="gold_label",
                        )
                        labels.append(label)
        return docs, labels
Example #20
    def _query(self,
               index: Optional[str] = None,
               filters: Optional[Dict[str, List[str]]] = None,
               vector_ids: Optional[List[str]] = None,
               only_documents_without_embedding: bool = False,
               batch_size: int = 10_000):
        """
        :param index: Name of the index to get the documents from. If None, the
                      DocumentStore's default index (self.index) will be used.
        :param filters: Optional filters to narrow down the documents to return.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param vector_ids: List of vector_id strings to filter the documents by.
        :param only_documents_without_embedding: return only documents without an embedding.
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        """
        index = index or self.index
        # ORM objects kept in memory generally cause performance issues,
        # so querying column names directly improves memory usage and performance.
        # See https://stackoverflow.com/questions/23185319/why-is-loading-sqlalchemy-objects-via-the-orm-5-8x-slower-than-rows-via-a-raw-my
        documents_query = self.session.query(
            DocumentORM.id, DocumentORM.text,
            DocumentORM.vector_id).filter_by(index=index)

        if filters:
            documents_query = documents_query.join(MetaORM)
            for key, values in filters.items():
                documents_query = documents_query.filter(
                    MetaORM.name == key, MetaORM.value.in_(values),
                    DocumentORM.id == MetaORM.document_id)
        if only_documents_without_embedding:
            documents_query = documents_query.filter(
                DocumentORM.vector_id.is_(None))
        if vector_ids:
            documents_query = documents_query.filter(
                DocumentORM.vector_id.in_(vector_ids))

        documents_map = {}

        if self.use_windowed_query:
            documents_query = self._windowed_query(documents_query,
                                                   DocumentORM.id, batch_size)

        for i, row in enumerate(documents_query, start=1):
            documents_map[row.id] = Document(id=row.id,
                                             text=row.text,
                                             meta=None if row.vector_id is None
                                             else {"vector_id": row.vector_id})
            if i % batch_size == 0:
                documents_map = self._get_documents_meta(documents_map)
                yield from documents_map.values()
                documents_map = {}
        if documents_map:
            documents_map = self._get_documents_meta(documents_map)
            yield from documents_map.values()
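The batch-then-hydrate pattern above (collect rows into a map, resolve meta once per batch, yield, repeat) can be illustrated standalone; this minimal sketch mimics the control flow with plain dicts (batched_hydrate and fetch_meta are hypothetical names):

def batched_hydrate(rows, batch_size, fetch_meta):
    buffer = {}
    for i, row in enumerate(rows, start=1):
        buffer[row["id"]] = row
        if i % batch_size == 0:
            fetch_meta(buffer)    # one meta lookup per batch, not per row
            yield from buffer.values()
            buffer = {}
    if buffer:                    # flush the final partial batch
        fetch_meta(buffer)
        yield from buffer.values()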
Example #21
    def query_by_embedding(
            self,
            query_emb: np.ndarray,
            filters: Optional[Dict[str, List[str]]] = None,
            top_k: int = 10,
            index: Optional[str] = None,
            return_embedding: Optional[bool] = None) -> List[Document]:
        """
        Find the documents that are most similar to the provided `query_emb` by using a vector similarity metric.

        :param query_emb: Embedding of the query (e.g. gathered from DPR)
        :param filters: Optional filters to narrow down the search space.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param top_k: How many documents to return
        :param index: Index name for storing the docs and metadata
        :param return_embedding: To return document embedding
        :return: List of Documents sorted by similarity score in descending order
        """

        from numpy import dot
        from numpy.linalg import norm

        index = index or self.index
        if return_embedding is None:
            return_embedding = self.return_embedding

        if query_emb is None:
            return []

        document_to_search = self.get_all_documents(index=index,
                                                    filters=filters,
                                                    return_embedding=True)
        candidate_docs = []
        for doc in document_to_search:
            curr_meta = deepcopy(doc.meta)
            new_document = Document(id=doc.id,
                                    text=doc.text,
                                    meta=curr_meta,
                                    embedding=doc.embedding)
            new_document.embedding = doc.embedding if return_embedding is True else None

            if self.similarity == "dot_product":
                score = dot(query_emb, doc.embedding) / (norm(query_emb) *
                                                         norm(doc.embedding))
            elif self.similarity == "cosine":
                # cosine similarity score = 1 - cosine distance
                score = 1 - cosine(query_emb, doc.embedding)
            new_document.score = score
            new_document.probability = (score + 1) / 2
            candidate_docs.append(new_document)

        return sorted(candidate_docs,
                      key=lambda x: x.score if x.score is not None else 0.0,
                      reverse=True)[0:top_k]
Example #22
    def retrieve(self,
                 query: str,
                 filters: dict = None,
                 top_k: Optional[int] = None,
                 index: str = None) -> List[Document]:
        """
        Scan through the documents in a DocumentStore and return a small number of documents
        that are most relevant to the query.

        :param query: The query
        :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
        :param top_k: How many documents to return per query.
        :param index: The name of the index in the DocumentStore from which to retrieve documents
        """
        if self.df is None:
            raise Exception("fit() needs to called before retrieve()")

        if filters:
            raise NotImplementedError(
                "Filters are not implemented in TfidfRetriever.")
        if index:
            raise NotImplementedError(
                "Switching index is not supported in TfidfRetriever.")

        if top_k is None:
            top_k = self.top_k
        # get scores
        indices_and_scores = self._calc_scores(query)

        # rank paragraphs
        df_sliced = self.df.loc[indices_and_scores.keys()]
        df_sliced = df_sliced[:top_k]

        logger.debug(
            f"Identified {df_sliced.shape[0]} candidates via retriever:\n {df_sliced.to_string(col_space=10, index=False)}"
        )

        # get actual content for the top candidates
        paragraphs = list(df_sliced.text.values)
        meta_data = [{
            "document_id": row["document_id"],
            "paragraph_id": row["paragraph_id"],
            "meta": row.get("meta", {})
        } for idx, row in df_sliced.iterrows()]

        documents = []
        for para, meta in zip(paragraphs, meta_data):
            documents.append(
                Document(id=meta["document_id"],
                         text=para,
                         meta=meta.get("meta", {})))

        return documents
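A usage sketch, assuming retriever is a TfidfRetriever built over a populated document store; fit() (re)builds the TF-IDF matrix that retrieve() depends on:

retriever.fit()
docs = retriever.retrieve(query="capital of Angola", top_k=3)
for doc in docs:
    print(doc.id, doc.text[:60])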
Example #23
def test_update_existing_docs(document_store, retriever):
    document_store.duplicate_documents = "overwrite"
    old_document = Document(text="text_1")
    # initial write
    document_store.write_documents([old_document])
    document_store.update_embeddings(retriever=retriever)
    old_documents_indexed = document_store.get_all_documents()
    assert len(old_documents_indexed) == 1

    # Update document data
    new_document = Document(text="text_2")
    new_document.id = old_document.id
    document_store.write_documents([new_document])
    document_store.update_embeddings(retriever=retriever)
    new_documents_indexed = document_store.get_all_documents()
    assert len(new_documents_indexed) == 1

    assert old_documents_indexed[0].id == new_documents_indexed[0].id
    assert old_documents_indexed[0].text == "text_1"
    assert new_documents_indexed[0].text == "text_2"
    assert not np.allclose(old_documents_indexed[0].embedding, new_documents_indexed[0].embedding, rtol=0.01)
Example #24
def _extract_docs_and_labels_from_dict(document_dict: Dict):
    docs = []
    labels = []

    # get all extra fields from document level (e.g. title)
    meta_doc = {
        k: v
        for k, v in document_dict.items() if k not in ("paragraphs", "title")
    }
    for paragraph in document_dict["paragraphs"]:
        cur_meta = {"name": document_dict.get("title", None)}
        # all other fields from paragraph level
        meta_paragraph = {
            k: v
            for k, v in paragraph.items() if k not in ("qas", "context")
        }
        cur_meta.update(meta_paragraph)
        # meta from parent document
        cur_meta.update(meta_doc)
        # Create Document
        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
        docs.append(cur_doc)

        # Get Labels
        for qa in paragraph["qas"]:
            if len(qa["answers"]) > 0:
                for answer in qa["answers"]:
                    label = Label(
                        question=qa["question"],
                        answer=answer["text"],
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_doc.id,
                        offset_start_in_doc=answer["answer_start"],
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)
            else:
                label = Label(
                    question=qa["question"],
                    answer="",
                    is_correct_answer=True,
                    is_correct_document=True,
                    document_id=cur_doc.id,
                    offset_start_in_doc=0,
                    no_answer=qa["is_impossible"],
                    origin="gold_label",
                )
                labels.append(label)

    return docs, labels
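A minimal SQuAD-style dict that exercises the extraction (all values are made up):

document_dict = {
    "title": "Berlin",
    "paragraphs": [{
        "context": "Berlin is the capital of Germany.",
        "qas": [{
            "question": "What is the capital of Germany?",
            "is_impossible": False,
            "answers": [{"text": "Berlin", "answer_start": 0}],
        }],
    }],
}
docs, labels = _extract_docs_and_labels_from_dict(document_dict)
assert docs[0].meta["name"] == "Berlin"
assert labels[0].answer == "Berlin" and labels[0].offset_start_in_doc == 0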
Example #25
    def predict(
            self,
            documents: List[Document],
            generate_single_summary: Optional[bool] = None) -> List[Document]:
        """
        Produce the summarization from the supplied documents.
        These documents can, for example, be retrieved via the Retriever.

        :param documents: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
        :param generate_single_summary: Whether to generate a single summary for all documents or one summary per document.
                                        If set to True, all docs will be joined to a single string that will then
                                        be summarized.
                                        Important: The summary will depend on the order of the supplied documents!
        :return: List of Documents, where Document.text contains the summarization and Document.meta["context"]
                 the original, not summarized text
        """

        if self.min_length > self.max_length:
            raise AttributeError(
                "min_length cannot be greater than max_length")

        if len(documents) == 0:
            raise AttributeError(
                "Summarizer needs at least one document to produce a summary.")

        if generate_single_summary is None:
            generate_single_summary = self.generate_single_summary

        contexts: List[str] = [doc.text for doc in documents]

        if generate_single_summary:
            # Documents order is very important to produce summary.
            # Different order of same documents produce different summary.
            contexts = [self.separator_for_single_summary.join(contexts)]

        summaries = self.summarizer(
            contexts,
            min_length=self.min_length,
            max_length=self.max_length,
            return_text=True,
            clean_up_tokenization_spaces=self.clean_up_tokenization_spaces,
        )

        result: List[Document] = []

        for context, summarized_answer in zip(contexts, summaries):
            cur_doc = Document(text=summarized_answer['summary_text'],
                               meta={"context": context})
            result.append(cur_doc)

        return result
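A usage sketch, assuming summarizer is an initialized TransformersSummarizer and Document is imported:

docs = [Document(text="The capital of Angola is Luanda, which lies on the "
                      "Atlantic coast in the northwest of the country.")]
summaries = summarizer.predict(documents=docs, generate_single_summary=False)
print(summaries[0].text)             # the generated summary
print(summaries[0].meta["context"])  # the original, unsummarized text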
Example #26
def test_get_all_document_filter_duplicate_value(document_store):
    documents = [
        Document(text="Doc1",
                 meta={"fone": "f0"},
                 id=get_uuid(),
                 embedding=np.random.rand(embedding_dim).astype(np.float32)),
        Document(text="Doc1",
                 meta={
                     "fone": "f1",
                     "metaid": "0"
                 },
                 id=get_uuid(),
                 embedding=np.random.rand(embedding_dim).astype(np.float32)),
        Document(text="Doc2",
                 meta={"fthree": "f0"},
                 id=get_uuid(),
                 embedding=np.random.rand(embedding_dim).astype(np.float32))
    ]
    document_store.write_documents(documents)
    documents = document_store.get_all_documents(filters={"fone": ["f1"]})
    assert documents[0].text == "Doc1"
    assert len(documents) == 1
    assert {d.meta["metaid"] for d in documents} == {"0"}
Example #27
    def _convert_es_hit_to_document(
        self,
        hit: dict,
        return_embedding: bool,
        adapt_score_for_embedding: bool = False,
    ) -> Document:
        # We put all additional data of the doc into meta_data and return it in the API
        meta_data = {
            k: v
            for k, v in hit["_source"].items()
            if k not in (self.text_field, self.faq_question_field,
                         self.embedding_field)
        }
        name = meta_data.pop(self.name_field, None)
        if name:
            meta_data["name"] = name

        score = hit["_score"] if hit["_score"] else None
        if score:
            if adapt_score_for_embedding:
                score = self._scale_embedding_score(score)
                if self.similarity == "cosine":
                    probability = (
                        score +
                        1) / 2  # scaling probability from cosine similarity
                elif self.similarity == "dot_product":
                    probability = float(expit(np.asarray(
                        score / 100)))  # scaling probability from dot product
            else:
                probability = float(expit(np.asarray(
                    score / 8)))  # scaling probability from TFIDF/BM25
        else:
            probability = None

        embedding = None
        if return_embedding:
            embedding_list = hit["_source"].get(self.embedding_field)
            if embedding_list:
                embedding = np.asarray(embedding_list, dtype=np.float32)

        document = Document(
            id=hit["_id"],
            text=hit["_source"].get(self.text_field),
            meta=meta_data,
            score=score,
            probability=probability,
            question=hit["_source"].get(self.faq_question_field),
            embedding=embedding,
        )
        return document
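The three scaling branches above all map raw scores into [0, 1]; the formulas can be checked standalone:

import numpy as np
from scipy.special import expit

print((0.8 + 1) / 2)                          # cosine: 0.8 -> 0.9
print(float(expit(np.asarray(120.0 / 100))))  # dot product: 120 -> ~0.77
print(float(expit(np.asarray(12.4 / 8))))     # TFIDF/BM25: 12.4 -> ~0.82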
Example #28
def test_lfqa_pipeline(document_store, retriever, eli5_generator):
    # reuse existing DOCS but regenerate embeddings with retribert
    docs: List[Document] = []
    for idx, d in enumerate(DOCS_WITH_EMBEDDINGS):
        docs.append(Document(d.text, str(idx)))
    document_store.write_documents(docs)
    document_store.update_embeddings(retriever)
    query = "Tell me about Berlin?"
    pipeline = GenerativeQAPipeline(retriever=retriever,
                                    generator=eli5_generator)
    output = pipeline.run(query=query, top_k_generator=1, top_k_retriever=1)
    answers = output["answers"]
    assert len(answers) == 1
    assert "Germany" in answers[0]
Example #29
def test_classifier(classifier):
    assert isinstance(classifier, BaseClassifier)

    query = "not used at the moment"
    docs = [
        Document(
            text=
            """Fragen und Antworten - Bitte auf Themen beschränken	welche einen Bezug zur Bahn aufweisen. Persönliche Unterhaltungen bitte per PN führen. Links bitte mit kurzer Erklärung zum verlinkten Inhalt versehen""",
            meta={"name": "0"},
            id="1",
        ),
        Document(
            text=
            """Ich liebe es wenn die Bahn selbstverschuldete unnötig lange Aufenthaltszeiten durch Verspätung wieder rausfährt.""",
            meta={"name": "1"},
            id="2",
        ),
    ]
    results = classifier.predict(query=query, documents=docs)
    expected_labels = ["neutral", "negative"]
    for i, doc in enumerate(results):
        assert doc.to_dict()["meta"]["classification"]["label"] == expected_labels[i]
Example #30
def test_write_document_meta(document_store):
    uid1 = get_uuid()
    uid2 = get_uuid()
    uid3 = get_uuid()
    uid4 = get_uuid()
    documents = [
        {
            "text": "dict_without_meta",
            "id": uid1,
            "embedding": np.random.rand(embedding_dim).astype(np.float32)
        },
        {
            "text": "dict_with_meta",
            "metafield": "test2",
            "name": "filename2",
            "id": uid2,
            "embedding": np.random.rand(embedding_dim).astype(np.float32)
        },
        Document(text="document_object_without_meta",
                 id=uid3,
                 embedding=np.random.rand(embedding_dim).astype(np.float32)),
        Document(text="document_object_with_meta",
                 meta={
                     "metafield": "test4",
                     "name": "filename3"
                 },
                 id=uid4,
                 embedding=np.random.rand(embedding_dim).astype(np.float32)),
    ]
    document_store.write_documents(documents)
    documents_in_store = document_store.get_all_documents()
    assert len(documents_in_store) == 4

    assert not document_store.get_document_by_id(uid1).meta
    assert document_store.get_document_by_id(uid2).meta["metafield"] == "test2"
    assert not document_store.get_document_by_id(uid3).meta
    assert document_store.get_document_by_id(uid4).meta["metafield"] == "test4"