Example 1
    def predict_on_texts(self,
                         question: str,
                         texts: List[str],
                         top_k: Optional[int] = None):
        documents = []
        for text in texts:
            documents.append(Document(text=text))
        predictions = self.predict(question, documents, top_k)
        return predictions
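A minimal usage sketch for this helper, assuming a reader instance (here called `reader`) that exposes `predict_on_texts`; the question and sample text are taken from the other examples:

# Hypothetical usage: wrap plain strings into Documents and run QA on them.
result = reader.predict_on_texts(
    question="Who lives in Berlin?",
    texts=["My name is Carla and I live in Berlin"],
    top_k=1,
)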
Example 2
def test_docs_xs():
    return [
        # current "dict" format for a document
        {"text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1", "name": "filename1"}},
        # meta_field at the top level for backward compatibility
        {"text": "My name is Paul and I live in New York", "meta_field": "test2", "name": "filename2"},
        # Document object for a doc
        Document(text="My name is Christelle and I live in Paris", meta={"meta_field": "test3", "name": "filename3"})
    ]
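Note the three accepted formats above: a plain dict with a nested "meta" dict, a dict with meta fields at the top level for backward compatibility, and a Document object. The later examples (e.g. Examples 3, 5, and 19) normalize all three with `Document.from_dict(d) if isinstance(d, dict) else d`.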
Example 3
def prediction(reader, test_docs_xs):
    docs = [
        Document.from_dict(d) if isinstance(d, dict) else d
        for d in test_docs_xs
    ]
    prediction = reader.predict(question="Who lives in Berlin?",
                                documents=docs,
                                top_k=5)
    return prediction
Example 4
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None):
        """
        Indexes documents for later queries in Elasticsearch.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta":{"name": "<some-document-name>, "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
                          Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                          should be changed to what you have set for self.text_field and self.name_field.
        :param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
        :return: None
        """

        if index and not self.client.indices.exists(index=index):
            self._create_document_index(index)

        if index is None:
            index = self.index

        # Make sure we comply with the Document class format
        documents_objects = [
            Document.from_dict(d) if isinstance(d, dict) else d
            for d in documents
        ]

        documents_to_index = []
        for doc in documents_objects:

            _doc = {
                "_op_type": "create",
                "_index": index,
                **doc.to_dict()
            }  # type: Dict[str, Any]

            # rename id for elastic
            _doc["_id"] = str(_doc.pop("id"))

            # don't index query score and empty fields
            _ = _doc.pop("query_score", None)
            _doc = {k: v for k, v in _doc.items() if v is not None}

            # In order to have a flat structure in Elasticsearch and behaviour similar to the other DocumentStores,
            # we "unnest" all values within "meta"
            if "meta" in _doc.keys():
                for k, v in _doc["meta"].items():
                    _doc[k] = v
                _doc.pop("meta")
            documents_to_index.append(_doc)
        bulk(self.client,
             documents_to_index,
             request_timeout=300,
             refresh="wait_for")
Example 5
def test_top_k(test_docs_xs):
    # TODO parametrize top_k and farm/transformers reader using pytest
    # TODO transformers reader was crashing when tested on this

    docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
    farm_reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", num_processes=0,
                             use_gpu=False, top_k_per_sample=4, no_ans_boost=None, top_k_per_candidate=4)
    for top_k in [2, 5, 10]:
        prediction = farm_reader.predict(question="Who lives in Berlin?", documents=docs, top_k=top_k)
        assert len(prediction["answers"]) == top_k
Example 6
def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a SQuAD-style file.
    Documents and Labels can then be indexed in the DocumentStore and used for evaluation.

    :param filename: Path to file in SQuAD format
    :return: (List of Documents, List of Labels)
    """
    docs = []
    labels = []

    with open(filename, "r") as file:
        data = json.load(file)
        for document in data["data"]:
            # get all extra fields from document level (e.g. title)
            meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
            for paragraph in document["paragraphs"]:
                cur_meta = {"name": document["title"]}
                # all other fields from paragraph level
                meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
                cur_meta.update(meta_paragraph)
                # meta from parent document
                cur_meta.update(meta_doc)
                # Create Document
                cur_doc = Document(text=paragraph["context"], meta=cur_meta)
                docs.append(cur_doc)

                # Get Labels
                for qa in paragraph["qas"]:
                    if len(qa["answers"]) > 0:
                        for answer in qa["answers"]:
                            label = Label(
                                question=qa["question"],
                                answer=answer["text"],
                                is_correct_answer=True,
                                is_correct_document=True,
                                document_id=cur_doc.id,
                                offset_start_in_doc=answer["answer_start"],
                                no_answer=qa["is_impossible"],
                                origin="gold_label",
                                )
                            labels.append(label)
                    else:
                        label = Label(
                            question=qa["question"],
                            answer="",
                            is_correct_answer=True,
                            is_correct_document=True,
                            document_id=cur_doc.id,
                            offset_start_in_doc=0,
                            no_answer=qa["is_impossible"],
                            origin="gold_label",
                        )
                        labels.append(label)
        return docs, labels
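A usage sketch under the assumption of a SQuAD-format file on disk and an existing `document_store` (both names are illustrative):

# Hypothetical usage: load gold Documents and Labels for evaluation.
docs, labels = eval_data_from_file("dev-v2.0.json")
document_store.write_documents(docs, index="eval_docs")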
Example 7
    def _convert_memory_hit_to_document(
            self,
            hit: Tuple[Any, Any],
            doc_id: Optional[str] = None) -> Document:
        document = Document(
            id=doc_id,
            text=hit[0].get('text', None),
            meta=hit[0].get('meta', {}),
            query_score=hit[1],
        )
        return document
Example 8
    def _convert_memory_hit_to_document(
            self,
            hit: Dict[str, Any],
            doc_id: Optional[str] = None) -> Document:
        document = Document(
            id=doc_id,
            text=hit.get("text", None),
            meta=hit.get("meta", {}),
            query_score=hit.get("query_score", None),
        )
        return document
Example 9
def test_dpr_inmemory_retrieval(document_store):

    documents = [
        Document(
            text=
            """Aaron Aaron ( or ; ""Ahärôn"") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother's spokesman (""prophet"") to the Pharaoh. Part of the Law (Torah) that Moses received from""",
            meta={"name": "0"}),
        Document(
            text=
            """Schopenhauer, describing him as an ultimately shallow thinker: ""Schopenhauer has quite a crude mind ... where real depth starts, his comes to an end."" His friend Bertrand Russell had a low opinion on the philosopher, and attacked him in his famous ""History of Western Philosophy"" for hypocritically praising asceticism yet not acting upon it. On the opposite isle of Russell on the foundations of mathematics, the Dutch mathematician L. E. J. Brouwer incorporated the ideas of Kant and Schopenhauer in intuitionism, where mathematics is considered a purely mental activity, instead of an analytic activity wherein objective properties of reality are""",
            meta={"name": "1"}),
        Document(
            text=
            """Democratic Republic of the Congo to the south. Angola's capital, Luanda, lies on the Atlantic coast in the northwest of the country. Angola, although located in a tropical zone, has a climate that is not characterized for this region, due to the confluence of three factors: As a result, Angola's climate is characterized by two seasons: rainfall from October to April and drought, known as ""Cacimbo"", from May to August, drier, as the name implies, and with lower temperatures. On the other hand, while the coastline has high rainfall rates, decreasing from North to South and from to , with""",
        )
    ]

    document_store.write_documents(documents, index="test_dpr")
    retriever = DensePassageRetriever(document_store=document_store,
                                      embedding_model="dpr-bert-base-nq",
                                      use_gpu=False,
                                      embed_title=True)
    document_store.update_embeddings(retriever=retriever, index="test_dpr")
    time.sleep(2)

    docs_with_emb = document_store.get_all_documents(index="test_dpr")

    # FAISSDocumentStore doesn't return embeddings, so these tests only work with ElasticsearchDocumentStore
    if isinstance(document_store, ElasticsearchDocumentStore):
        assert (len(docs_with_emb[0].embedding) == 768)
        assert (abs(docs_with_emb[0].embedding[0] - (-0.30634)) < 0.001)
        assert (abs(docs_with_emb[1].embedding[0] - (-0.24695)) < 0.001)
        assert (abs(docs_with_emb[2].embedding[0] - (-0.37449)) < 0.001)

    res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?",
                             index="test_dpr")
    assert res[0].meta["name"] == "1"

    # clean up
    document_store.delete_all_documents(index="test_dpr")
Example 10
    def _convert_es_hit_to_document(self, hit, score_adjustment=0) -> Document:
        # We put all additional data of the doc into meta_data and return it in the API
        meta_data = {k: v for k, v in hit["_source"].items() if k not in (self.text_field, self.external_source_id_field)}
        meta_data["name"] = meta_data.pop(self.name_field, None)

        document = Document(
            id=hit["_id"],
            text=hit["_source"][self.text_field],
            external_source_id=hit["_source"].get(self.external_source_id_field),
            meta=meta_data,
            query_score=hit["_score"] + score_adjustment if hit["_score"] else None,
        )
        return document
Example 11
    def _convert_es_hit_to_document(self, hit: dict, score_adjustment: int = 0) -> Document:
        # We put all additional data of the doc into meta_data and return it in the API
        meta_data = {k: v for k, v in hit["_source"].items() if k not in (self.text_field, self.faq_question_field, self.embedding_field)}
        name = meta_data.pop(self.name_field, None)
        if name:
            meta_data["name"] = name

        document = Document(
            id=hit["_id"],
            text=hit["_source"].get(self.text_field),
            meta=meta_data,
            query_score=hit["_score"] + score_adjustment if hit["_score"] else None,
            question=hit["_source"].get(self.faq_question_field),
            embedding=hit["_source"].get(self.embedding_field)
        )
        return document
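For reference, a sketch of the kind of Elasticsearch hit this method consumes, assuming the default field names (text_field="text", name_field="name"); all values are illustrative:

# Hypothetical Elasticsearch hit; values are illustrative.
hit = {
    "_id": "a1b2c3",
    "_score": 7.34,
    "_source": {
        "text": "My name is Carla and I live in Berlin",
        "name": "filename1",
        "meta_field": "test1",
    },
}
# document = document_store._convert_es_hit_to_document(hit)
# document.meta would then be {"name": "filename1", "meta_field": "test1"}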
Example 12
def test_context_window_size(test_docs_xs):
    # TODO parametrize window_size and farm/transformers reader using pytest
    docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
    for window_size in [10, 15, 20]:
        farm_reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", num_processes=0,
                              use_gpu=False, top_k_per_sample=5, no_ans_boost=None, context_window_size=window_size)
        prediction = farm_reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)
        for answer in prediction["answers"]:
            # If the extracted answer is larger than the context window, the context window is expanded.
            # If the extracted answer is odd in length, the resulting context window is one less than context_window_size
            # due to rounding (See FARM's QACandidate)
            # TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different
            if len(answer["answer"]) <= window_size:
                assert len(answer["context"]) in [window_size, window_size-1]
            else:
                assert len(answer["answer"]) == len(answer["context"])
Example 13
def test_top_k(test_docs_xs):
    # TODO parametrize top_k and farm/transformers reader using pytest
    # TODO transformers reader was crashing when tested on this
    docs = []
    for d in test_docs_xs:
        doc = Document(id=d["meta"]["name"], text=d["text"], meta=d["meta"])
        docs.append(doc)
    farm_reader = FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        num_processes=0,
        use_gpu=False,
        top_k_per_sample=4,
        no_ans_boost=None,
        top_k_per_candidate=4)
    for top_k in [2, 5, 10]:
        prediction = farm_reader.predict(question="Who lives in Berlin?",
                                         documents=docs,
                                         top_k=top_k)
        assert len(prediction["answers"]) == top_k
Example 14
def test_output(reader, test_docs_xs):
    docs = []
    for d in test_docs_xs:
        doc = Document(id=d["name"], text=d["text"], meta=d["meta"])
        docs.append(doc)
    results = reader.predict(question="Who lives in Berlin?",
                             documents=docs,
                             top_k=5)
    assert results is not None
    assert results["question"] == "Who lives in Berlin?"
    assert results["answers"][0]["answer"] == "Carla"
    assert results["answers"][0]["offset_start"] == 11
    # TODO: enable again when FARM is upgraded, incl. the new offset calculation
    # assert results["answers"][0]["offset_end"] == 16
    assert results["answers"][0]["probability"] <= 1
    assert results["answers"][0]["probability"] >= 0
    assert results["answers"][0][
        "context"] == "My name is Carla and I live in Berlin"
    assert results["answers"][0]["document_id"] == "filename1"
    assert len(results["answers"]) == 5
Example 15
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None):
        if self.faiss_index is not None:
            raise Exception(
                "Addition of more data in an existing index is not supported.")

        faiss_index = self._create_new_index(vector_size=self.vector_size)
        index = index or self.index
        document_objects = [
            Document.from_dict(d) if isinstance(d, dict) else d
            for d in documents
        ]

        add_vectors = document_objects[0].embedding is not None

        if add_vectors:
            phi = self._get_phi(document_objects)

        for i in range(0, len(document_objects), self.index_buffer_size):
            if add_vectors:
                embeddings = [
                    doc.embedding
                    for doc in document_objects[i:i + self.index_buffer_size]
                ]
                hnsw_vectors = self._get_hnsw_vectors(embeddings=embeddings,
                                                      phi=phi)
                faiss_index.add(hnsw_vectors)

            docs_to_write_in_sql = []
            # vector ids must be global across batches: faiss_index.add()
            # appends vectors sequentially over all calls
            for vector_id, doc in enumerate(
                    document_objects[i:i + self.index_buffer_size], start=i):
                meta = doc.meta
                if add_vectors:
                    meta["vector_id"] = vector_id
                docs_to_write_in_sql.append(doc)

            super(FAISSDocumentStore,
                  self).write_documents(docs_to_write_in_sql, index=index)
        self.faiss_index = faiss_index
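Design note: the loop processes documents in chunks of `self.index_buffer_size`, adding each chunk's vectors to FAISS while the corresponding rows are persisted via the parent class's `write_documents`; the `vector_id` stored in each document's meta is what later links a SQL row back to its vector's position in the FAISS index.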
Example 16
    def retrieve(self,
                 query: str,
                 filters: dict = None,
                 top_k: int = 10,
                 index: str = None) -> List[Document]:
        if filters:
            raise NotImplementedError(
                "Filters are not implemented in TfidfRetriever.")
        if index:
            raise NotImplementedError(
                "Switching index is not supported in TfidfRetriever.")

        # get scores
        indices_and_scores = self._calc_scores(query)

        # rank paragraphs
        df_sliced = self.df.loc[indices_and_scores.keys()]  # type: ignore
        df_sliced = df_sliced[:top_k]

        logger.debug(
            f"Identified {df_sliced.shape[0]} candidates via retriever:\n {df_sliced.to_string(col_space=10, index=False)}"
        )

        # get actual content for the top candidates
        paragraphs = list(df_sliced.text.values)
        meta_data = [{
            "document_id": row["document_id"],
            "paragraph_id": row["paragraph_id"],
            "meta": row.get("meta", {})
        } for idx, row in df_sliced.iterrows()]

        documents = []
        for para, meta in zip(paragraphs, meta_data):
            documents.append(
                Document(id=meta["paragraph_id"],
                         text=para,
                         meta=meta.get("meta", {})))

        return documents
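A minimal call sketch, assuming a `retriever` instance built on this class:

# Hypothetical usage: fetch the top-10 TF-IDF candidates for a query.
top_docs = retriever.retrieve(query="Who lives in Berlin?", top_k=10)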
Example 17
    def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
        """
        Indexes documents for later queries.


        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally, you can also supply "tags": ["one-tag", "another-one"]
                          or additional meta data via "meta": {"name": "<some-document-name>, "author": "someone", "url":"some-url" ...}
        :param index: write documents to a custom namespace. For instance, documents for evaluation can be indexed in a
                      separate index from the documents used for search.
        :return: None
        """
        index = index or self.index

        documents_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]

        for document in documents_objects:
            self.indexes[index][document.id] = document

            # TODO: fix tags after id refactoring
            tags = document.tags
            self._map_tags_to_ids(document.id, tags)
Example 18
def test_document_data_access():
    doc = Document(id=1, text="test")
    assert doc.text == "test"
Example 19
def no_answer_prediction(no_answer_reader, test_docs_xs):
    docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
    prediction = no_answer_reader.predict(question="What is the meaning of life?", documents=docs, top_k=5)
    return prediction
Example 20
    def predict_on_texts(self, question: str, texts: List[str], top_k: Optional[int] = None):
        documents = []
        for i, text in enumerate(texts):
            documents.append(Document(id=i, text=text))
        predictions = self.predict(question, documents, top_k)
        return predictions
Example 21
    def get_all_documents(self) -> List[Document]:
        return [
            Document(id=item[0],
                     text=item[1]['text'],
                     name=item[1]['name'],
                     meta=item[1].get('meta', {}))
            for item in self.docs.items()
        ]
Example 22
    def get_all_documents(self):
        return [
            Document(id=item[0], text=item[1]['text'], name=item[1]['name'])
            for item in self.docs.items()
        ]
Example 23
    def _convert_sql_row_to_document(self, row) -> Document:
        document = Document(id=row.id,
                            text=row.text,
                            meta=row.meta_data,
                            tags=row.tags)
        return document
Example 24
    def _convert_sql_row_to_document(self, row) -> Document:
        document = Document(id=row.id,
                            text=row.text,
                            meta={meta.name: meta.value
                                  for meta in row.meta})
        return document