Example #1
 def _convert_sql_row_to_document(self, row) -> Document:
     document = Document(id=row.id,
                         text=row.text,
                         meta={meta.name: meta.value
                               for meta in row.meta})
     if row.vector_id is not None:  # `is not None` so that a valid vector_id of 0 is kept
         document.meta["vector_id"] = row.vector_id
     return document
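A minimal sketch of the row shape this converter expects; `SimpleNamespace` stands in for a real SQLAlchemy result row, and the field names follow the code above:

```python
from types import SimpleNamespace

row = SimpleNamespace(
    id="doc-1",
    text="Berlin is the capital of Germany.",
    meta=[SimpleNamespace(name="author", value="somebody")],  # MetaORM-like rows
    vector_id=0,  # 0 is a valid FAISS position, hence the `is not None` check above
)
```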
Example #2
    def query_by_embedding(self,
                           query_emb: List[float],
                           filters: Optional[Dict[str, List[str]]] = None,
                           top_k: int = 10,
                           index: Optional[str] = None,
                           return_embedding: Optional[bool] = None) -> List[Document]:

        """
        Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.

        :param query_emb: Embedding of the query (e.g. gathered from DPR)
        :param filters: Optional filters to narrow down the search space.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param top_k: How many documents to return
        :param index: Index name for storing the docs and metadata
        :param return_embedding: To return document embedding
        :return:
        """

        from numpy import dot
        from numpy.linalg import norm
        from scipy.spatial.distance import cosine

        if filters:
            raise NotImplementedError("Setting `filters` is currently not supported in "
                                      "InMemoryDocumentStore.query_by_embedding(). Please remove filters or "
                                      "use a different DocumentStore (e.g. ElasticsearchDocumentStore).")

        index = index or self.index
        if return_embedding is None:
            return_embedding = self.return_embedding

        if query_emb is None:
            return []

        candidate_docs = []
        for doc in self.indexes[index].values():
            curr_meta = deepcopy(doc.meta)
            new_document = Document(
                id=doc.id,
                text=doc.text,
                meta=curr_meta,
                embedding=doc.embedding if return_embedding else None
            )

            if self.similarity == "dot_product":
                score = dot(query_emb, doc.embedding) / (
                    norm(query_emb) * norm(doc.embedding)
                )
            elif self.similarity == "cosine":
                # cosine similarity score = 1 - cosine distance
                score = 1 - cosine(query_emb, doc.embedding)
            new_document.score = score
            new_document.probability = (score + 1) / 2
            candidate_docs.append(new_document)

        return sorted(candidate_docs, key=lambda x: x.score if x.score is not None else 0.0, reverse=True)[0:top_k]
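Both branches above produce a norm-scaled score in [-1, 1], which `(score + 1) / 2` maps into a [0, 1] probability. A self-contained sketch of that math with illustrative vectors:

```python
import numpy as np

query_emb = np.array([0.1, 0.9, 0.0])
doc_emb = np.array([0.2, 0.8, 0.1])

# norm-scaled dot product == cosine similarity, in [-1, 1]
score = np.dot(query_emb, doc_emb) / (np.linalg.norm(query_emb) * np.linalg.norm(doc_emb))
probability = (score + 1) / 2  # rescaled to [0, 1]
print(f"score={score:.4f}, probability={probability:.4f}")
```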
Example #3
    def predict_on_texts(self,
                         question: str,
                         texts: List[str],
                         top_k: Optional[int] = None):
        """
        Use loaded QA model to find answers for a question in the supplied list of Document.
        Returns dictionaries containing answers sorted by (desc.) probability.
        Example:
         ```python
            |{
            |    'question': 'Who is the father of Arya Stark?',
            |    'answers':[
            |                 {'answer': 'Eddard,',
            |                 'context': " She travels with her father, Eddard, to King's Landing when he is ",
            |                 'offset_answer_start': 147,
            |                 'offset_answer_end': 154,
            |                 'probability': 0.9787139466668613,
            |                 'score': None,
            |                 'document_id': '1337'
            |                 },...
            |              ]
            |}
         ```

        :param question: Question string
        :param texts: List of texts to search for answers, each as a plain string
        :param top_k: The maximum number of answers to return
        :return: Dict containing question and answers
        """
        documents = [Document(text=text) for text in texts]
        predictions = self.predict(question, documents, top_k)
        return predictions
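A hypothetical usage sketch; the reader setup is an assumption (the import path follows Haystack 0.x, and any reader exposing `predict_on_texts()` would do):

```python
from haystack.reader.farm import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
result = reader.predict_on_texts(
    question="Who is the father of Arya Stark?",
    texts=["She travels with her father, Eddard, to King's Landing."],
    top_k=1,
)
print(result["answers"][0]["answer"])
```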
Example #4
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None):
        """
        Indexes documents for later queries.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
        :param index: add an optional index attribute to documents. It can be later used for filtering. For instance,
                      documents for evaluation can be indexed in a separate index than the documents for search.

        :return: None
        """

        index = index or self.index
        if len(documents) == 0:
            return
        # Make sure we comply to Document class format
        document_objects = [
            Document.from_dict(d) if isinstance(d, dict) else d
            for d in documents
        ]

        for i in range(0, len(document_objects), self.batch_size):
            for doc in document_objects[i:i + self.batch_size]:
                meta_fields = doc.meta or {}
                vector_id = meta_fields.get("vector_id")
                meta_orms = [
                    MetaORM(name=key, value=value)
                    for key, value in meta_fields.items()
                ]
                doc_orm = DocumentORM(id=doc.id,
                                      text=doc.text,
                                      vector_id=vector_id,
                                      meta=meta_orms,
                                      index=index)
                if self.update_existing_documents:
                    # First old meta data cleaning is required
                    self.session.query(MetaORM).filter_by(
                        document_id=doc.id).delete()
                    self.session.merge(doc_orm)
                else:
                    self.session.add(doc_orm)
            try:
                self.session.commit()
            except Exception as ex:
                logger.error(f"Transaction rollback: {ex.__cause__}")
                # Rollback is important here otherwise self.session will be in inconsistent state and next call will fail
                self.session.rollback()
                raise ex
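A sketch of how this method could be called; `store` stands for an already configured SQLDocumentStore instance (an assumption, not shown above):

```python
docs = [
    {"text": "Berlin is the capital of Germany.",
     "meta": {"name": "geo.txt", "author": "somebody"}},
    {"text": "Paris is the capital of France."},  # meta is optional
]
store.write_documents(docs)                 # dicts are converted via Document.from_dict()
store.write_documents(docs, index="eval")   # keep evaluation docs in a separate index
```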
Example #5
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None):
        """
        Add new documents to the DocumentStore.

        :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
                          them right away in FAISS. If not, you can later call update_embeddings() to create & index them.
        :param index: (SQL) index name for storing the docs and metadata
        :return: None
        """
        # vector index
        if not self.faiss_index:
            raise ValueError(
                "Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ..."
            )

        # doc + metadata index
        index = index or self.index
        field_map = self._create_document_field_map()
        document_objects = [
            Document.from_dict(d, field_map=field_map)
            if isinstance(d, dict) else d for d in documents
        ]

        add_vectors = document_objects[0].embedding is not None

        if self.update_existing_documents and add_vectors:
            logger.warning(
                "You have enabled the `update_existing_documents` feature, but "
                "`FAISSDocumentStore` does not support updating vectors in an existing `faiss_index`.\n"
                "Please call the `update_embeddings` method to repopulate the `faiss_index`."
            )

        for i in range(0, len(document_objects), self.index_buffer_size):
            vector_id = self.faiss_index.ntotal
            if add_vectors:
                embeddings = [
                    doc.embedding
                    for doc in document_objects[i:i + self.index_buffer_size]
                ]
                embeddings = np.array(embeddings, dtype="float32")
                self.faiss_index.add(embeddings)

            docs_to_write_in_sql = []
            for doc in document_objects[i:i + self.index_buffer_size]:
                meta = doc.meta
                if add_vectors:
                    meta["vector_id"] = vector_id
                    vector_id += 1
                docs_to_write_in_sql.append(doc)

            super(FAISSDocumentStore,
                  self).write_documents(docs_to_write_in_sql, index=index)
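The vector_id bookkeeping above relies on FAISS assigning positions 0..ntotal-1 in insertion order. A minimal sketch with illustrative numbers:

```python
# Recording ntotal before each batch and incrementing per document
# keeps the SQL rows aligned with the FAISS vector positions.
ntotal_before = 100                      # vectors already in the index
batch = ["doc_a", "doc_b"]               # stand-ins for Document objects
vector_ids = {doc: ntotal_before + i for i, doc in enumerate(batch)}
print(vector_ids)  # {'doc_a': 100, 'doc_b': 101}; ntotal becomes 102 after add()
```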
Example #6
def _extract_docs_and_labels_from_dict(document_dict: Dict):
    docs = []
    labels = []

    # get all extra fields from document level (e.g. title)
    meta_doc = {
        k: v
        for k, v in document_dict.items() if k not in ("paragraphs", "title")
    }
    for paragraph in document_dict["paragraphs"]:
        cur_meta = {"name": document_dict.get("title", None)}
        # all other fields from paragraph level
        meta_paragraph = {
            k: v
            for k, v in paragraph.items() if k not in ("qas", "context")
        }
        cur_meta.update(meta_paragraph)
        # meta from parent document
        cur_meta.update(meta_doc)
        # Create Document
        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
        docs.append(cur_doc)

        # Get Labels
        for qa in paragraph["qas"]:
            if len(qa["answers"]) > 0:
                for answer in qa["answers"]:
                    label = Label(
                        question=qa["question"],
                        answer=answer["text"],
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_doc.id,
                        offset_start_in_doc=answer["answer_start"],
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)
            else:
                label = Label(
                    question=qa["question"],
                    answer="",
                    is_correct_answer=True,
                    is_correct_document=True,
                    document_id=cur_doc.id,
                    offset_start_in_doc=0,
                    no_answer=qa["is_impossible"],
                    origin="gold_label",
                )
                labels.append(label)

    return docs, labels
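A minimal SQuAD-style dict in the shape this function consumes (field names follow the SQuAD format; `answer_start` is a character offset into `context`):

```python
document_dict = {
    "title": "Arya Stark",
    "paragraphs": [{
        "context": "She travels with her father, Eddard, to King's Landing.",
        "qas": [{
            "question": "Who is the father of Arya Stark?",
            "is_impossible": False,
            "answers": [{"text": "Eddard", "answer_start": 29}],
        }],
    }],
}
docs, labels = _extract_docs_and_labels_from_dict(document_dict)
```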
Example #7
    def get_all_documents(
            self,
            index: Optional[str] = None,
            filters: Optional[Dict[str, List[str]]] = None,
            return_embedding: Optional[bool] = None) -> List[Document]:
        """
        Get documents from the document store.

        :param index: Name of the index to get the documents from. If None, the
                      DocumentStore's default index (self.index) will be used.
        :param filters: Optional filters to narrow down the documents to return.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param return_embedding: Whether to return the document embeddings.
        """

        index = index or self.index
        # ORM objects kept in memory generally cause performance issues,
        # so querying the column values directly improves both memory use and speed.
        # See https://stackoverflow.com/questions/23185319/why-is-loading-sqlalchemy-objects-via-the-orm-5-8x-slower-than-rows-via-a-raw-my
        documents_query = self.session.query(
            DocumentORM.id, DocumentORM.text,
            DocumentORM.vector_id).filter_by(index=index)

        if filters:
            documents_query = documents_query.join(MetaORM)
            for key, values in filters.items():
                documents_query = documents_query.filter(
                    MetaORM.name == key, MetaORM.value.in_(values),
                    DocumentORM.id == MetaORM.document_id)

        documents_map = {}
        for row in documents_query.all():
            documents_map[row.id] = Document(
                id=row.id,
                text=row.text,
                meta=None if row.vector_id is None else {"vector_id": row.vector_id}  # type: ignore
            )

        for doc_ids in self.chunked_iterable(documents_map.keys(),
                                             size=self.batch_size):
            meta_query = self.session.query(
                MetaORM.document_id, MetaORM.name,
                MetaORM.value).filter(MetaORM.document_id.in_(doc_ids))

            for row in meta_query.all():
                if documents_map[row.document_id].meta is None:
                    documents_map[row.document_id].meta = {}
                documents_map[row.document_id].meta[row.name] = row.value  # type: ignore

        return list(documents_map.values())
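`chunked_iterable` is not shown above; a hedged sketch of what such a helper could look like (Python 3.8+ for the walrus operator):

```python
from itertools import islice
from typing import Iterable, Iterator, Tuple

def chunked_iterable(iterable: Iterable, size: int) -> Iterator[Tuple]:
    """Yield successive tuples of at most `size` items from `iterable`."""
    it = iter(iterable)
    while chunk := tuple(islice(it, size)):
        yield chunk
```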
Example #8
    def _convert_es_hit_to_document(
        self,
        hit: dict,
        return_embedding: bool,
        adapt_score_for_embedding: bool = False,
    ) -> Document:
        # We put all additional data of the doc into meta_data and return it in the API
        meta_data = {
            k: v
            for k, v in hit["_source"].items()
            if k not in (self.text_field, self.faq_question_field,
                         self.embedding_field)
        }
        name = meta_data.pop(self.name_field, None)
        if name:
            meta_data["name"] = name

        score = hit["_score"] if hit["_score"] else None
        if score:
            if adapt_score_for_embedding:
                score = self._scale_embedding_score(score)
                if self.similarity == "cosine":
                    # scaling probability from cosine similarity
                    probability = (score + 1) / 2
                elif self.similarity == "dot_product":
                    # scaling probability from dot product
                    probability = float(expit(np.asarray(score / 100)))
                else:
                    # unknown similarity metric: leave probability unset
                    # rather than crash on an unbound local below
                    probability = None
            else:
                # scaling probability from TFIDF/BM25
                probability = float(expit(np.asarray(score / 8)))
        else:
            probability = None

        embedding = None
        if return_embedding:
            embedding_list = hit["_source"].get(self.embedding_field)
            if embedding_list:
                embedding = np.asarray(embedding_list, dtype=np.float32)

        document = Document(
            id=hit["_id"],
            text=hit["_source"].get(self.text_field),
            meta=meta_data,
            score=score,
            probability=probability,
            question=hit["_source"].get(self.faq_question_field),
            embedding=embedding,
        )
        return document
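The `expit` calls above squash unbounded scores into (0, 1); the divisors 100 and 8 are heuristics used by this method, not universal constants. A self-contained sketch:

```python
import numpy as np
from scipy.special import expit  # logistic function: 1 / (1 + exp(-x))

bm25_score = 12.0
dot_product_score = 250.0
print(float(expit(np.asarray(bm25_score / 8))))           # TFIDF/BM25 scaling
print(float(expit(np.asarray(dot_product_score / 100))))  # dot-product scaling
```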
Example #9
    def predict(self,
                documents: List[Document],
                generate_single_summary: bool = False) -> List[Document]:
        """
        Produce the summarization from the supplied documents.
        These documents can, for example, be retrieved via the Retriever.

        :param documents: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
        :param generate_single_summary: Whether to generate a single summary for all documents or one summary per document.
                                        If set to `True`, all docs will be joined to a single string that will then
                                        be summarized.
                                        Important: The summary will depend on the order of the supplied documents!
        :return: List of Documents, where Document.text contains the summarization and Document.meta["context"]
                 the original, unsummarized text
        """

        if self.min_length > self.max_length:
            raise AttributeError(
                "min_length cannot be greater than max_length")

        if len(documents) == 0:
            raise AttributeError(
                "Summarizer needs at least one document to produce a summary.")

        contexts: List[str] = [doc.text for doc in documents]

        if generate_single_summary:
            # Document order matters for a single summary:
            # a different order of the same documents produces a different summary.
            contexts = [self.separator_for_single_summary.join(contexts)]

        summaries = self.summarizer(
            contexts,
            min_length=self.min_length,
            max_length=self.max_length,
            return_text=True,
            clean_up_tokenization_spaces=self.clean_up_tokenization_spaces,
        )

        result: List[Document] = []

        for context, summarized_answer in zip(contexts, summaries):
            cur_doc = Document(text=summarized_answer['summary_text'],
                               meta={"context": context})
            result.append(cur_doc)

        return result
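A hypothetical usage sketch; `summarizer` stands for a configured summarizer instance and is an assumption, not shown above:

```python
docs = [Document(text="PG&E stated it scheduled the blackouts in response to "
                      "forecasts for high winds amid dry conditions.")]
summaries = summarizer.predict(documents=docs, generate_single_summary=False)
print(summaries[0].text)             # the generated summary
print(summaries[0].meta["context"])  # the original, unsummarized text
```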
Example #10
    def retrieve(self, query: str, filters: dict = None, top_k: int = 10, index: str = None) -> List[Document]:
        """
        Scan through documents in DocumentStore and return a small number of documents
        that are most relevant to the query.

        :param query: The query
        :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
        :param top_k: How many documents to return per query.
        :param index: The name of the index in the DocumentStore from which to retrieve documents
        """
        if self.df is None:
            raise Exception("fit() needs to be called before retrieve()")

        if filters:
            raise NotImplementedError("Filters are not implemented in TfidfRetriever.")
        if index:
            raise NotImplementedError("Switching index is not supported in TfidfRetriever.")

        # get scores
        indices_and_scores = self._calc_scores(query)

        # rank paragraphs
        df_sliced = self.df.loc[indices_and_scores.keys()]
        df_sliced = df_sliced[:top_k]

        logger.debug(
            f"Identified {df_sliced.shape[0]} candidates via retriever:\n {df_sliced.to_string(col_space=10, index=False)}"
        )

        # get actual content for the top candidates
        paragraphs = list(df_sliced.text.values)
        meta_data = [{"document_id": row["document_id"], "paragraph_id": row["paragraph_id"],  "meta": row.get("meta", {})}
                     for idx, row in df_sliced.iterrows()]

        documents = []
        for para, meta in zip(paragraphs, meta_data):
            documents.append(
                Document(
                    id=meta["document_id"],
                    text=para,
                    meta=meta.get("meta", {})
                ))

        return documents
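`_calc_scores()` is not shown above. A self-contained sketch of TF-IDF ranking in the same spirit, using scikit-learn rather than the retriever's internals:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

paragraphs = ["Berlin is the capital of Germany.",
              "Paris is the capital of France."]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(paragraphs)           # one row per paragraph
query_vec = vectorizer.transform(["capital of France"])
scores = cosine_similarity(query_vec, matrix)[0]
print(scores.argsort()[::-1])                           # indices ranked by relevance
```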
Example #11
    def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
        """
        Indexes documents for later queries.


       :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>, "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
        :param index: write documents to a custom namespace. For instance, documents for evaluation can be indexed in a
                      separate index than the documents for search.
        :return: None
        """
        index = index or self.index

        field_map = self._create_document_field_map()
        documents_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]

        for document in documents_objects:
            self.indexes[index][document.id] = document
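The `self.indexes[index][document.id]` access implies a two-level mapping from index name to document id to Document. A sketch of that layout, assuming (not shown above) that `indexes` is a `defaultdict`:

```python
from collections import defaultdict

indexes: dict = defaultdict(dict)  # index name -> {document id -> Document}
indexes["document"]["some-id"] = "stand-in for a Document object"
indexes["eval"]["other-id"] = "a write to a new index name creates it on the fly"
```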
Example #12
    def train_index(self,
                    documents: Optional[Union[List[dict], List[Document]]],
                    embeddings: Optional[np.ndarray] = None):
        """
        Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors before you can add your final vectors.
        The train vectors should come from the same distribution as your final ones.
        You can pass either documents (incl. embeddings) or just the plain embeddings that the index shall be trained on.

        :param documents: Documents (incl. the embeddings)
        :param embeddings: Plain embeddings
        :return: None
        """

        # `is not None` checks: the truth value of a multi-element numpy array is ambiguous
        if embeddings is not None and documents is not None:
            raise ValueError(
                "Either pass `documents` or `embeddings`. You passed both.")
        if documents:
            document_objects = [
                Document.from_dict(d) if isinstance(d, dict) else d
                for d in documents
            ]
            embeddings = [doc.embedding for doc in document_objects]
            embeddings = np.array(embeddings, dtype="float32")
        self.faiss_index.train(embeddings)
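A sketch of why training matters for IVF indices, using the faiss library directly; the index string and sizes are illustrative:

```python
import faiss
import numpy as np

dim = 128
train_vectors = np.random.rand(10_000, dim).astype("float32")
index = faiss.index_factory(dim, "IVF256,Flat")  # 256 coarse clusters
assert not index.is_trained
index.train(train_vectors)   # learns the coarse quantizer's centroids
assert index.is_trained
index.add(train_vectors)     # vectors can only be added after training
```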
Example #13
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None,
                        batch_size: Optional[int] = None):
        """
        Indexes documents for later queries in Elasticsearch.

        Behaviour if a document with the same ID already exists in ElasticSearch:
        a) (Default) Throw Elastic's standard error message for duplicate IDs.
        b) If `self.update_existing_documents=True` for DocumentStore: Overwrite existing documents.
        (This is only relevant if you pass your own ID when initializing a `Document`.
        If you don't set custom IDs for your Documents or just pass a list of dictionaries here,
        they will automatically get UUIDs assigned. See the `Document` class for details.)

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
                          Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                          should be changed to what you have set for self.text_field and self.name_field.
        :param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
        :param batch_size: Number of documents that are passed to Elasticsearch's bulk function at a time.
                           If `None`, all documents will be passed to bulk at once.
        :return: None
        """

        if index and not self.client.indices.exists(index=index):
            self._create_document_index(index)

        if index is None:
            index = self.index

        documents_to_index = []
        for document in documents:
            # Make sure we comply to Document class format
            if isinstance(document, dict):
                doc = Document.from_dict(
                    document, field_map=self._create_document_field_map())
            else:
                doc = document

            _doc = {
                "_op_type": "index" if self.update_existing_documents else "create",
                "_index": index,
                **doc.to_dict(field_map=self._create_document_field_map())
            }  # type: Dict[str, Any]

            # cast embedding type as Elasticsearch cannot deal with np.ndarray
            if isinstance(_doc.get(self.embedding_field), np.ndarray):
                _doc[self.embedding_field] = _doc[self.embedding_field].tolist()

            # rename id for elastic
            _doc["_id"] = str(_doc.pop("id"))

            # don't index query score and empty fields
            _ = _doc.pop("score", None)
            _ = _doc.pop("probability", None)
            _doc = {k: v for k, v in _doc.items() if v is not None}

            # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
            # we "unnest" all value within "meta"
            if "meta" in _doc.keys():
                for k, v in _doc["meta"].items():
                    _doc[k] = v
                _doc.pop("meta")
            documents_to_index.append(_doc)

            if batch_size is not None and len(documents_to_index) >= batch_size:
                # Pass batch_size documents at a time to bulk
                bulk(self.client,
                     documents_to_index,
                     request_timeout=300,
                     refresh=self.refresh_type)
                documents_to_index = []

        if documents_to_index:
            bulk(self.client,
                 documents_to_index,
                 request_timeout=300,
                 refresh=self.refresh_type)
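For reference, the shape of a single bulk action built above; values are illustrative. An `_op_type` of `"create"` fails on duplicate IDs while `"index"` overwrites, and meta fields are flattened to the top level:

```python
action = {
    "_op_type": "create",   # or "index" when update_existing_documents is True
    "_index": "document",
    "_id": "some-uuid",
    "text": "Berlin is the capital of Germany.",
    "name": "geo.txt",      # un-nested from "meta"
}
```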