Example #1
 def _convert_sql_row_to_document(self, row) -> Document:
     """Map a DocumentORM row (and its MetaORM children) back to a Document."""
     document = Document(id=row.id,
                         text=row.text,
                         meta={meta.name: meta.value
                               for meta in row.meta})
     if row.vector_id:
         document.meta["vector_id"] = row.vector_id  # type: ignore
     return document
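A minimal usage sketch, assuming a row object exposing the id, text, meta, and vector_id attributes the converter reads (`store` stands in for an already-initialised SQLDocumentStore):

from types import SimpleNamespace

# stub row mimicking the ORM attributes accessed above
row = SimpleNamespace(
    id="42",
    text="Arya Stark is Eddard's daughter.",
    meta=[SimpleNamespace(name="author", value="somebody")],
    vector_id="7",
)
doc = store._convert_sql_row_to_document(row)
assert doc.meta == {"author": "somebody", "vector_id": "7"}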
Example #2
    def _convert_es_hit_to_document(self, hit: dict, adapt_score_for_embedding: bool = False) -> Document:
        # We put all additional data of the doc into meta_data and return it in the API
        meta_data = {k: v for k, v in hit["_source"].items()
                     if k not in (self.text_field, self.faq_question_field, self.embedding_field)}
        name = meta_data.pop(self.name_field, None)
        if name:
            meta_data["name"] = name

        score = hit["_score"] if hit["_score"] else None
        if score:
            if adapt_score_for_embedding:
                score -= 1  # Elasticsearch returns cosine similarity shifted by +1; undo the shift
                probability = (score + 1) / 2  # map cosine in [-1, 1] onto a [0, 1] probability
            else:
                probability = float(expit(np.asarray(score / 8)))  # scaling probability from TFIDF/BM25
        else:
            probability = None
        document = Document(
            id=hit["_id"],
            text=hit["_source"].get(self.text_field),
            meta=meta_data,
            score=score,
            probability=probability,
            question=hit["_source"].get(self.faq_question_field),
            embedding=hit["_source"].get(self.embedding_field)
        )
        return document
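A rough sketch of the hit shape this converter expects, using a hand-built dict rather than a real Elasticsearch response (`store` is assumed to be configured with text_field="text" and name_field="name"):

hit = {
    "_id": "1",
    "_score": 12.3,
    "_source": {"text": "some passage", "name": "doc1", "author": "somebody"},
}
doc = store._convert_es_hit_to_document(hit)
# BM25/TFIDF scores are squashed into (0, 1) via expit(score / 8);
# doc.meta now holds {"name": "doc1", "author": "somebody"}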
Example #3
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None):
        """
        Indexes documents for later queries.
        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
        :param index: add an optional index attribute to documents. It can be later used for filtering. For instance,
                      documents for evaluation can be indexed in a separate index than the documents for search.
        :return: None
        """

        # Make sure we comply with the Document class format
        document_objects = [
            Document.from_dict(d) if isinstance(d, dict) else d
            for d in documents
        ]
        index = index or self.index
        for doc in document_objects:
            meta_fields = doc.meta or {}
            vector_id = meta_fields.get("vector_id")
            meta_orms = [
                MetaORM(name=key, value=value)
                for key, value in meta_fields.items()
            ]
            doc_orm = DocumentORM(id=doc.id,
                                  text=doc.text,
                                  vector_id=vector_id,
                                  meta=meta_orms,
                                  index=index)
            self.session.add(doc_orm)
        self.session.commit()
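For illustration, a couple of documents written as plain dicts (the index name is hypothetical):

store.write_documents(
    [
        {"text": "Arya Stark is Eddard's daughter.",
         "meta": {"name": "got.txt", "author": "somebody"}},
        {"text": "Jon Snow was raised at Winterfell."},
    ],
    index="eval_docs",
)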
Example #4
    def predict_on_texts(self,
                         question: str,
                         texts: List[str],
                         top_k: Optional[int] = None):
        """
        Use loaded QA model to find answers for a question in the supplied texts.
        Returns dictionaries containing answers sorted by (desc.) probability.
        Example:

            {
                'question': 'Who is the father of Arya Stark?',
                'answers':[
                             {'answer': 'Eddard,',
                             'context': " She travels with her father, Eddard, to King's Landing when he is ",
                             'offset_answer_start': 147,
                             'offset_answer_end': 154,
                             'probability': 0.9787139466668613,
                             'score': None,
                             'document_id': '1337'
                             },...
                          ]
            }

        :param question: Question string
        :param texts: List of texts to search, each as a plain string
        :param top_k: The maximum number of answers to return
        :return: Dict containing question and answers
        """
        documents = [Document(text=text) for text in texts]
        predictions = self.predict(question, documents, top_k)
        return predictions
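A usage sketch, assuming `reader` is an already-loaded reader exposing this method:

prediction = reader.predict_on_texts(
    question="Who is the father of Arya Stark?",
    texts=["She travels with her father, Eddard, to King's Landing."],
    top_k=1,
)
print(prediction["answers"][0]["answer"])  # e.g. "Eddard"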
Example #5
def orconvqa_build_corpus(filename: str, limit_lines: int = 0) -> List[Document]:
    """
    :param filename: Name of a JSON Lines file containing the text blocks; each line should be valid JSON with
        at least a 'text' and an 'id' entry
    :param limit_lines: For testing purposes, use only the first N lines instead of the entire corpus.
        This is because creating embeddings for the entire development corpus takes a lot of time. The corpus
        contains ca. 65000 lines. Set the limit to 0 (zero) to use everything.
    :return: List of Documents
    """

    docs = []
    with open(filename, 'r') as file:
        for idx, block in enumerate(file):
            if 0 < limit_lines <= idx:
                # stop reading lines.
                break
            try:
                block = json.loads(block)
            except json.JSONDecodeError as e:
                raise ValueError(f'Error occurred reading json block on line {idx} of file: {filename}') from e

            cur_meta = {"name": block["title"]}
            # all other fields on block level, e.g. id, aid, bid
            block_meta = {k: v for k, v in block.items() if k not in ['text', 'id']}
            cur_meta.update(block_meta)

            # Create Document
            cur_doc = Document(id=block['id'], text=block["text"], meta=cur_meta)
            docs.append(cur_doc)
    return docs
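A quick sketch of loading only the first 100 blocks while testing (the filename is illustrative):

docs = orconvqa_build_corpus("dev_blocks.txt", limit_lines=100)
print(len(docs), docs[0].meta["name"])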
Example #6
    def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
        """
        Indexes documents for later queries in Elasticsearch.

        When using explicit document IDs, any existing document with the same ID gets updated.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta":{"name": "<some-document-name>, "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
                          Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                          should be changed to what you have set for self.text_field and self.name_field.
        :param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
        :return: None
        """

        if index and not self.client.indices.exists(index=index):
            self._create_document_index(index)

        if index is None:
            index = self.index

        # Make sure we comply to Document class format
        documents_objects = [Document.from_dict(d, field_map=self._create_document_field_map())
                             if isinstance(d, dict) else d for d in documents]

        documents_to_index = []
        for doc in documents_objects:

            _doc = {
                "_op_type": "index" if self.update_existing_documents else "create",
                "_index": index,
                **doc.to_dict(field_map=self._create_document_field_map())
            }  # type: Dict[str, Any]

            # cast embedding to a plain list as Elasticsearch cannot serialize np.ndarray
            if isinstance(_doc.get(self.embedding_field), np.ndarray):
                _doc[self.embedding_field] = _doc[self.embedding_field].tolist()

            # rename id for elastic
            _doc["_id"] = str(_doc.pop("id"))

            # don't index query score and empty fields
            _ = _doc.pop("score", None)
            _ = _doc.pop("probability", None)
            _doc = {k: v for k, v in _doc.items() if v is not None}

            # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
            # we "unnest" all values within "meta"
            if "meta" in _doc:
                _doc.update(_doc.pop("meta"))
            documents_to_index.append(_doc)
        bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)
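Because `_op_type` is "index" whenever `self.update_existing_documents` is true, re-writing a document with the same explicit id overwrites the earlier version. A sketch, assuming the store accepts an "id" key in the dict:

store.write_documents([{"id": "1", "text": "first draft"}])
store.write_documents([{"id": "1", "text": "revised text"}])  # replaces the first version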
Example #7
def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a SQuAD-style file.
    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    :param filename: Path to file in SQuAD format
    :return: (List of Documents, List of Labels)
    """
    docs = []
    labels = []

    with open(filename, "r") as file:
        data = json.load(file)
        for document in data["data"]:
            # get all extra fields from document level (e.g. title)
            meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
            for paragraph in document["paragraphs"]:
                cur_meta = {"name": document["title"]}
                # all other fields from paragraph level
                meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
                cur_meta.update(meta_paragraph)
                # meta from parent document
                cur_meta.update(meta_doc)
                # Create Document
                cur_doc = Document(text=paragraph["context"], meta=cur_meta)
                docs.append(cur_doc)

                # Get Labels
                for qa in paragraph["qas"]:
                    if len(qa["answers"]) > 0:
                        for answer in qa["answers"]:
                            label = Label(
                                question=qa["question"],
                                answer=answer["text"],
                                is_correct_answer=True,
                                is_correct_document=True,
                                document_id=cur_doc.id,
                                offset_start_in_doc=answer["answer_start"],
                                no_answer=qa["is_impossible"],
                                origin="gold_label",
                            )
                            labels.append(label)
                    else:
                        label = Label(
                            question=qa["question"],
                            answer="",
                            is_correct_answer=True,
                            is_correct_document=True,
                            document_id=cur_doc.id,
                            offset_start_in_doc=0,
                            no_answer=qa["is_impossible"],
                            origin="gold_label",
                        )
                        labels.append(label)
    return docs, labels
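Typical use pairs the loaded data with a document store for evaluation (the filename is illustrative, and `write_labels` is assumed to exist alongside `write_documents`):

docs, labels = eval_data_from_file("dev-v2.0.json")
document_store.write_documents(docs)
document_store.write_labels(labels, index="label")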
Example #8
def CoQA_read_file(filename: str) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a CoQA style file
    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    :param filename: Path to file in CoQA format
    :return: (List of Documents, List of Labels)
    """
    docs = []
    labels = []

    with open(filename, "r") as file:
        data = json.load(file)
        for document in data["data"]:
            # get all extra fields from document level (e.g. title)
            meta_doc = {k: v for k, v in document.items() if k not in ("questions", "answers")}
            cur_doc = Document(id=document['id'], text=document["story"], meta=meta_doc)

            docs.append(cur_doc)
            # Get Labels
            for q, a in zip(document["questions"], document['answers']):

                label = Label(
                    question=q["input_text"],
                    # TODO these are very short answers and may not always match with span_start
                    # The answer retrieved via span_text is longer; input_text is derived from it
                    answer=a['input_text'],
                    is_correct_answer=True,
                    is_correct_document=True,
                    # We do not do an extra check if the document id exists in the corpus, this may cause issues later
                    document_id=cur_doc.id,
                    offset_start_in_doc=a["span_start"],
                    origin=filename,
                    previous_questions_in_conversation=[pq['input_text'] for pq in document['questions'] if pq['turn_id'] < q['turn_id']]
                )
                labels.append(label)

    return docs, labels
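The CoQA loader follows the same pattern (filename illustrative); note that each Label's `origin` is the filename, so results can be traced back to their source file:

docs, labels = CoQA_read_file("coqa-dev-v1.0.json")
document_store.write_documents(docs)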
Example #9
    def train_index(self,
                    documents: Optional[Union[List[dict], List[Document]]],
                    embeddings: Optional[np.ndarray] = None):
        """
        Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors before you can add your final vectors.
        The train vectors should come from the same distribution as your final ones.
        You can pass either documents (incl. embeddings) or just the plain embeddings that the index shall be trained on.

        :param documents: Documents (incl. the embeddings)
        :param embeddings: Plain embeddings
        :return: None
        """

        if embeddings is not None and documents is not None:
            raise ValueError(
                "Either pass `documents` or `embeddings`. You passed both.")
        if documents:
            document_objects = [
                Document.from_dict(d) if isinstance(d, dict) else d
                for d in documents
            ]
            embeddings = [doc.embedding for doc in document_objects]
            embeddings = np.array(embeddings, dtype="float32")
        self.faiss_index.train(embeddings)
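A sketch of training on plain vectors drawn from the target distribution (the dimensionality of 768 is an assumption and must match the index):

import numpy as np

sample_vectors = np.random.rand(10_000, 768).astype("float32")
store.train_index(documents=None, embeddings=sample_vectors)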