Beispiel #1
0
def eval_data_from_file(filename: str, max_docs: Union[int, bool]=None) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a SQuAD-style file.
    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    One Document is created per paragraph and one Label per gold answer. A question
    without any answer yields a single Label with an empty answer string.

    :param filename: Path to file in SQuAD format
    :param max_docs: This sets the number of documents that will be loaded. By default, this is set to None, thus reading in all available eval documents.
    :return: (List of Documents, List of Labels). Both lists are empty if the file contains no documents.
    """
    docs = []
    labels = []

    # JSON text is expected to be UTF-8; do not depend on the platform's default encoding.
    with open(filename, "r", encoding="utf-8") as file:
        data = json.load(file)

    squad_documents = data["data"]
    # Only peek at the first entry if there is one, so an empty file yields ([], []) instead of an IndexError.
    if squad_documents and "title" not in squad_documents[0]:
        logger.warning(f"No title information found for documents in QA file: {filename}")

    for document in squad_documents[:max_docs]:
        # get all extra fields from document level (e.g. title)
        meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
        for paragraph in document["paragraphs"]:
            cur_meta = {"name": document.get("title", None)}
            # all other fields from paragraph level
            meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
            cur_meta.update(meta_paragraph)
            # meta from parent document (applied last, so it wins on key clashes)
            cur_meta.update(meta_doc)
            # Create Document
            cur_doc = Document(text=paragraph["context"], meta=cur_meta)
            docs.append(cur_doc)

            # Get Labels
            for qa in paragraph["qas"]:
                # "is_impossible" is optional so that files without this flag can be read too
                no_answer = qa.get("is_impossible", False)
                if len(qa["answers"]) > 0:
                    for answer in qa["answers"]:
                        label = Label(
                            question=qa["question"],
                            answer=answer["text"],
                            is_correct_answer=True,
                            is_correct_document=True,
                            document_id=cur_doc.id,
                            offset_start_in_doc=answer["answer_start"],
                            no_answer=no_answer,
                            origin="gold_label",
                        )
                        labels.append(label)
                else:
                    # No gold answer: store one placeholder label with an empty answer at offset 0
                    label = Label(
                        question=qa["question"],
                        answer="",
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_doc.id,
                        offset_start_in_doc=0,
                        no_answer=no_answer,
                        origin="gold_label",
                    )
                    labels.append(label)
    return docs, labels
Beispiel #2
0
def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a SQuAD-style file.
    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    One Document is created per paragraph and one Label per gold answer. A question
    without any answer yields a single Label with an empty answer string.

    :param filename: Path to file in SQuAD format
    :return: (List of Documents, List of Labels)
    """
    docs = []
    labels = []

    # JSON text is expected to be UTF-8; do not depend on the platform's default encoding.
    with open(filename, "r", encoding="utf-8") as file:
        data = json.load(file)

    for document in data["data"]:
        # get all extra fields from document level (e.g. title)
        meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
        for paragraph in document["paragraphs"]:
            # The title is optional; documents without one get name=None instead of raising KeyError
            cur_meta = {"name": document.get("title", None)}
            # all other fields from paragraph level
            meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
            cur_meta.update(meta_paragraph)
            # meta from parent document (applied last, so it wins on key clashes)
            cur_meta.update(meta_doc)
            # Create Document
            cur_doc = Document(text=paragraph["context"], meta=cur_meta)
            docs.append(cur_doc)

            # Get Labels
            for qa in paragraph["qas"]:
                # "is_impossible" is optional so that files without this flag can be read too
                no_answer = qa.get("is_impossible", False)
                if len(qa["answers"]) > 0:
                    for answer in qa["answers"]:
                        label = Label(
                            question=qa["question"],
                            answer=answer["text"],
                            is_correct_answer=True,
                            is_correct_document=True,
                            document_id=cur_doc.id,
                            offset_start_in_doc=answer["answer_start"],
                            no_answer=no_answer,
                            origin="gold_label",
                        )
                        labels.append(label)
                else:
                    # No gold answer: store one placeholder label with an empty answer at offset 0
                    label = Label(
                        question=qa["question"],
                        answer="",
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_doc.id,
                        offset_start_in_doc=0,
                        no_answer=no_answer,
                        origin="gold_label",
                    )
                    labels.append(label)
    return docs, labels
Beispiel #3
0
def _extract_docs_and_labels_from_dict(document_dict: Dict):
    docs = []
    labels = []

    # get all extra fields from document level (e.g. title)
    meta_doc = {
        k: v
        for k, v in document_dict.items() if k not in ("paragraphs", "title")
    }
    for paragraph in document_dict["paragraphs"]:
        cur_meta = {"name": document_dict.get("title", None)}
        # all other fields from paragraph level
        meta_paragraph = {
            k: v
            for k, v in paragraph.items() if k not in ("qas", "context")
        }
        cur_meta.update(meta_paragraph)
        # meta from parent document
        cur_meta.update(meta_doc)
        # Create Document
        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
        docs.append(cur_doc)

        # Get Labels
        for qa in paragraph["qas"]:
            if len(qa["answers"]) > 0:
                for answer in qa["answers"]:
                    label = Label(
                        question=qa["question"],
                        answer=answer["text"],
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_doc.id,
                        offset_start_in_doc=answer["answer_start"],
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)
            else:
                label = Label(
                    question=qa["question"],
                    answer="",
                    is_correct_answer=True,
                    is_correct_document=True,
                    document_id=cur_doc.id,
                    offset_start_in_doc=0,
                    no_answer=qa["is_impossible"],
                    origin="gold_label",
                )
                labels.append(label)

    return docs, labels
Beispiel #4
0
    def write_labels(self,
                     labels: Union[List[Label], List[dict]],
                     index: Optional[str] = None):
        """
        Write annotation labels to the label index via the bulk helper.

        :param labels: Label objects or dicts; dicts are converted with `Label.from_dict`.
        :param index: Target index. Falls back to `self.label_index` when not given.
        """
        index = index or self.label_index
        index_missing = index and not self.client.indices.exists(index=index)
        if index_missing:
            self._create_label_index(index)

        # Make sure we comply to Label class format
        normalized = []
        for entry in labels:
            normalized.append(
                Label.from_dict(entry) if isinstance(entry, dict) else entry)

        # One bulk action per label; the label's own fields come last in the dict
        actions = [
            {
                "_op_type":
                "index" if self.update_existing_documents else "create",
                "_index": index,
                **current.to_dict()
            }  # type: Dict[str, Any]
            for current in normalized
        ]

        bulk(self.client,
             actions,
             request_timeout=300,
             refresh=self.refresh_type)
Beispiel #5
0
    def write_labels(self,
                     labels: Union[List[dict], List[Label]],
                     index: Optional[str] = None):
        """
        Write annotation labels into document store.

        Labels are stored under `self.indexes[index][label.id]`, so a label whose id is
        already present replaces the stored one; a warning listing such ids is logged.

        :param labels: Label objects or dicts; dicts are converted with `Label.from_dict`.
        :param index: Target index. Falls back to `self.label_index` when not given.
        """
        index = index or self.label_index
        converted = [
            Label.from_dict(item) if isinstance(item, dict) else item
            for item in labels
        ]

        duplicate_ids: list = [
            dup.id
            for dup in self._get_duplicate_labels(converted, index=index)
        ]
        if duplicate_ids:
            logger.warning(
                f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
                f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
                f" the answer annotation and not the question."
                f" Problematic ids: {','.join(duplicate_ids)}")

        for current in converted:
            # Fill in missing timestamps; updated_at defaults to the creation time
            if not current.created_at:
                current.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
            if not current.updated_at:
                current.updated_at = current.created_at
            self.indexes[index][current.id] = current
Beispiel #6
0
 def get_all_labels(
         self,
         index: Optional[str] = None,
         filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
     """
     Return all labels stored in an index.

     :param index: Index to read from. Falls back to `self.label_index` when not given.
     :param filters: Passed through unchanged to `get_all_documents_in_index`.
     :return: One Label per hit, built from the hit's "_source" field.
     """
     hits = self.get_all_documents_in_index(index=index or self.label_index,
                                            filters=filters)
     return [Label.from_dict(raw_hit["_source"]) for raw_hit in hits]
Beispiel #7
0
    def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
        """
        Write annotation labels into document store.

        Every label is stored under a freshly generated uuid4 key.

        :param labels: Label objects or dicts; dicts are converted with `Label.from_dict`.
        :param index: Target index. Falls back to `self.label_index` when not given.
        """
        index = index or self.label_index

        # Convert everything first so that a bad dict fails before anything is stored
        converted = []
        for item in labels:
            converted.append(Label.from_dict(item) if isinstance(item, dict) else item)

        for current in converted:
            generated_key = str(uuid4())
            self.indexes[index][generated_key] = current
Beispiel #8
0
 def get_all_labels(
     self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, batch_size: int = 10_000
 ) -> List[Label]:
     """
     Return all labels in the document store

     :param index: Index to read from. Falls back to `self.label_index` when not given.
     :param filters: Passed through unchanged to `_get_all_documents_in_index`.
     :param batch_size: Passed through unchanged to `_get_all_documents_in_index`.
     :return: One Label per hit, built from the hit's "_source" field.
     """
     index = index or self.label_index
     hits = self._get_all_documents_in_index(index=index, filters=filters, batch_size=batch_size)
     # Convert hits while iterating instead of first copying all raw hits into a throwaway list
     return [Label.from_dict(hit["_source"]) for hit in hits]
Beispiel #9
0
    def write_labels(self,
                     labels: Union[List[Label], List[dict]],
                     index: Optional[str] = None,
                     batch_size: int = 10_000):
        """Write annotation labels into document store.

        Labels without `created_at` / `updated_at` get these fields set on the label object itself.

        :param labels: A list of Python dictionaries or a list of Haystack Label objects.
        :param index: Target index. Falls back to `self.label_index` when not given.
        :param batch_size: Number of labels that are passed to Elasticsearch's bulk function at a time.
        """
        index = index or self.label_index
        index_missing = index and not self.client.indices.exists(index=index)
        if index_missing:
            self._create_label_index(index)

        pending = []
        for raw in labels:
            # Make sure we comply to Label class format
            label = Label.from_dict(raw) if isinstance(raw, dict) else raw

            # create timestamps if not available yet
            if not label.created_at:
                label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
            if not label.updated_at:
                label.updated_at = label.created_at

            action = {
                "_op_type":
                "index" if self.update_existing_documents else "create",
                "_index": index,
                **label.to_dict()
            }  # type: Dict[str, Any]

            # Elasticsearch expects the document id under "_id" rather than "id"
            if label.id is not None:
                action["_id"] = str(action.pop("id"))

            pending.append(action)

            # Flush once batch_size actions have been collected, then start a new batch
            if len(pending) % batch_size == 0:
                bulk(self.client,
                     pending,
                     request_timeout=300,
                     refresh=self.refresh_type)
                pending = []

        # Send whatever is left over after the last full batch
        if pending:
            bulk(self.client,
                 pending,
                 request_timeout=300,
                 refresh=self.refresh_type)
Beispiel #10
0
    def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
        """
        Write annotation labels into document store.

        Every label is stored under a freshly generated uuid4 key. Labels without
        `created_at` / `updated_at` get these fields set on the label object itself.

        :param labels: Label objects or dicts; dicts are converted with `Label.from_dict`.
        :param index: Target index. Falls back to `self.label_index` when not given.
        """
        index = index or self.label_index

        # Convert everything first so that a bad dict fails before anything is stored
        converted = []
        for item in labels:
            converted.append(Label.from_dict(item) if isinstance(item, dict) else item)

        for current in converted:
            generated_key = str(uuid4())
            # Fill in missing timestamps; updated_at defaults to the creation time
            if not current.created_at:
                current.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
            if not current.updated_at:
                current.updated_at = current.created_at
            self.indexes[index][generated_key] = current
Beispiel #11
0
 def _convert_sql_row_to_label(self, row) -> Label:
     """
     Build a Label from a row object by copying the label attributes one-to-one.

     :param row: Object exposing the label fields as attributes.
     :return: Label carrying the same field values as the row.
     """
     return Label(
         document_id=row.document_id,
         no_answer=row.no_answer,
         origin=row.origin,
         question=row.question,
         is_correct_answer=row.is_correct_answer,
         is_correct_document=row.is_correct_document,
         answer=row.answer,
         offset_start_in_doc=row.offset_start_in_doc,
         model_id=row.model_id,
     )
Beispiel #12
0
def test_labels(document_store):
    """A label written to a named index is returned for that index only, not for the default one."""
    test_index = "haystack_test_label"
    gold_label = Label(
        question="question",
        answer="answer",
        is_correct_answer=True,
        is_correct_document=True,
        document_id="123",
        offset_start_in_doc=12,
        no_answer=False,
        origin="gold_label",
    )
    document_store.write_labels([gold_label], index=test_index)

    stored_in_test_index = document_store.get_all_labels(index=test_index)
    assert len(stored_in_test_index) == 1

    # The default label index must stay untouched
    stored_in_default_index = document_store.get_all_labels()
    assert len(stored_in_default_index) == 0
Beispiel #13
0
    def write_labels(self, labels, index=None):
        """
        Write annotation labels to the session and commit them in one go.

        :param labels: Label objects or dicts; dicts are converted with `Label.from_dict`.
        :param index: Index stored with every row. Falls back to `self.label_index` when not given.
        """
        converted = [Label.from_dict(item) if isinstance(item, dict) else item for item in labels]
        index = index or self.label_index

        for current in converted:
            row = LabelORM(
                document_id=current.document_id,
                no_answer=current.no_answer,
                origin=current.origin,
                question=current.question,
                is_correct_answer=current.is_correct_answer,
                is_correct_document=current.is_correct_document,
                answer=current.answer,
                offset_start_in_doc=current.offset_start_in_doc,
                model_id=current.model_id,
                index=index,
            )
            self.session.add(row)

        # Single commit after all rows have been added
        self.session.commit()
Beispiel #14
0
    def write_labels(self, labels, index=None):
        """Write annotation labels into document store.

        Labels whose id is reported by `self._get_duplicate_labels` are merged into the
        session (overwriting the stored row); all other labels are added as new rows.
        Everything is committed once at the end.

        :param labels: Label objects or dicts; dicts are converted with `Label.from_dict`.
        :param index: Index stored with every row. Falls back to `self.label_index` when not given.
        """

        labels = [
            Label.from_dict(l) if isinstance(l, dict) else l for l in labels
        ]
        index = index or self.label_index

        duplicate_ids: list = [
            label.id
            for label in self._get_duplicate_labels(labels, index=index)
        ]
        if len(duplicate_ids) > 0:
            logger.warning(
                f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
                f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
                f" the answer annotation and not the question."
                f" Problematic ids: {','.join(duplicate_ids)}")

        # Set for O(1) membership tests in the loop below; the list above keeps the order for the warning
        duplicate_id_lookup = set(duplicate_ids)

        # TODO: Use batch_size
        for label in labels:
            label_orm = LabelORM(
                id=label.id,
                document_id=label.document_id,
                no_answer=label.no_answer,
                origin=label.origin,
                question=label.question,
                is_correct_answer=label.is_correct_answer,
                is_correct_document=label.is_correct_document,
                answer=label.answer,
                offset_start_in_doc=label.offset_start_in_doc,
                model_id=label.model_id,
                index=index,
            )
            if label.id in duplicate_id_lookup:
                # id already stored: merge so the existing row is overwritten
                self.session.merge(label_orm)
            else:
                self.session.add(label_orm)
        self.session.commit()
Beispiel #15
0
def test_multilabel(document_store):
    """Labels sharing one question are aggregated into a single MultiLabel with three answers."""
    test_index = "haystack_test_multilabel"
    # Fields shared by every label in this test
    shared = {
        "question": "question",
        "is_correct_document": True,
        "origin": "gold_label",
    }
    labels = [
        Label(answer="answer1", is_correct_answer=True, document_id="123",
              offset_start_in_doc=12, no_answer=False, **shared),
        # different answer in same doc
        Label(answer="answer2", is_correct_answer=True, document_id="123",
              offset_start_in_doc=42, no_answer=False, **shared),
        # answer in different doc
        Label(answer="answer3", is_correct_answer=True, document_id="321",
              offset_start_in_doc=7, no_answer=False, **shared),
        # 'no answer', should be excluded from MultiLabel
        Label(answer="", is_correct_answer=True, document_id="777",
              offset_start_in_doc=0, no_answer=True, **shared),
        # is_correct_answer=False, should be excluded from MultiLabel
        Label(answer="answer5", is_correct_answer=False, document_id="123",
              offset_start_in_doc=99, no_answer=True, **shared),
    ]
    document_store.write_labels(labels, index=test_index)
    aggregated = document_store.get_all_labels_aggregated(index=test_index)
    stored = document_store.get_all_labels(index=test_index)

    assert len(aggregated) == 1
    assert len(stored) == 5

    merged = aggregated[0]
    assert len(merged.multiple_answers) == 3
    # answers, document ids and offsets must stay aligned
    assert len(merged.multiple_answers) \
           == len(merged.multiple_document_ids) \
           == len(merged.multiple_offset_start_in_docs)

    # The default label index must stay untouched
    aggregated_default = document_store.get_all_labels_aggregated()
    assert len(aggregated_default) == 0

    # clean up
    document_store.delete_all_documents(index=test_index)
Beispiel #16
0
def _extract_docs_and_labels_from_dict(document_dict: Dict,
                                       preprocessor: PreProcessor = None):
    docs = []
    labels = []

    # get all extra fields from document level (e.g. title)
    meta_doc = {
        k: v
        for k, v in document_dict.items() if k not in ("paragraphs", "title")
    }
    for paragraph in document_dict["paragraphs"]:
        ## Create Metadata
        cur_meta = {"name": document_dict.get("title", None)}
        # all other fields from paragraph level
        meta_paragraph = {
            k: v
            for k, v in paragraph.items() if k not in ("qas", "context")
        }
        cur_meta.update(meta_paragraph)
        # meta from parent document
        cur_meta.update(meta_doc)

        ## Create Document
        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
        if preprocessor is not None:
            splits_dicts = preprocessor.process(cur_doc.to_dict())
            # we need to pull in _split_id into the document id for unique reference in labels
            # todo: PreProcessor should work on Documents instead of dicts
            splits = []
            offset = 0
            for d in splits_dicts:
                id = f"{d['id']}-{d['meta']['_split_id']}"
                d["meta"]["_split_offset"] = offset
                offset += len(d["text"])
                # offset correction based on splitting method
                if preprocessor.split_by == "word":
                    offset += 1
                elif preprocessor.split_by == "passage":
                    offset += 2
                else:
                    raise NotImplementedError
                mydoc = Document(text=d["text"], id=id, meta=d["meta"])
                splits.append(mydoc)
        else:
            splits = [cur_doc]
        docs.extend(splits)

        ## Assign Labels to corresponding documents
        for qa in paragraph["qas"]:
            if not qa["is_impossible"]:
                for answer in qa["answers"]:
                    ans = answer["text"]
                    ans_position = cur_doc.text[
                        answer["answer_start"]:answer["answer_start"] +
                        len(ans)]
                    if ans != ans_position:
                        logger.warning(
                            f"Answer Text and Answer position mismatch. Skipping Answer"
                        )
                        break
                    # find corresponding document or split
                    if len(splits) == 1:
                        cur_id = splits[0].id
                        cur_ans_start = answer["answer_start"]
                    else:
                        for s in splits:
                            # If answer start offset is contained in passage we assign the label to that passage
                            if (answer["answer_start"] >=
                                    s.meta["_split_offset"]) and (
                                        answer["answer_start"] <
                                        (s.meta["_split_offset"] +
                                         len(s.text))):
                                cur_id = s.id
                                cur_ans_start = answer[
                                    "answer_start"] - s.meta["_split_offset"]
                                # If a document is splitting an answer we add the whole answer text to the document
                                if s.text[cur_ans_start:cur_ans_start +
                                          len(ans)] != ans:
                                    s.text = s.text[:cur_ans_start] + ans
                                break
                    label = Label(
                        question=qa["question"],
                        answer=ans,
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_id,
                        offset_start_in_doc=cur_ans_start,
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)
            else:
                # for no_answer we need to assign each split as not fitting to the question
                for s in splits:
                    label = Label(
                        question=qa["question"],
                        answer="",
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=s.id,
                        offset_start_in_doc=0,
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)

    return docs, labels
Beispiel #17
0
def _extract_docs_and_labels_from_dict(document_dict: Dict,
                                       preprocessor: PreProcessor = None,
                                       open_domain: bool = False):
    """Set open_domain to True if you are trying to load open_domain labels (i.e. labels without doc id or start idx)"""
    docs = []
    labels = []
    problematic_ids = []

    # get all extra fields from document level (e.g. title)
    meta_doc = {
        k: v
        for k, v in document_dict.items() if k not in ("paragraphs", "title")
    }
    for paragraph in document_dict["paragraphs"]:
        ## Create Metadata
        cur_meta = {"name": document_dict.get("title", None)}
        # all other fields from paragraph level
        meta_paragraph = {
            k: v
            for k, v in paragraph.items() if k not in ("qas", "context")
        }
        cur_meta.update(meta_paragraph)
        # meta from parent document
        cur_meta.update(meta_doc)

        ## Create Document
        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
        if preprocessor is not None:
            splits_dicts = preprocessor.process(cur_doc.to_dict())
            # we need to pull in _split_id into the document id for unique reference in labels
            # todo: PreProcessor should work on Documents instead of dicts
            splits: List[Document] = []
            offset = 0
            for d in splits_dicts:
                id = f"{d['id']}-{d['meta']['_split_id']}"
                d["meta"]["_split_offset"] = offset
                offset += len(d["text"])
                # offset correction based on splitting method
                if preprocessor.split_by == "word":
                    offset += 1
                elif preprocessor.split_by == "passage":
                    offset += 2
                else:
                    raise NotImplementedError
                mydoc = Document(text=d["text"], id=id, meta=d["meta"])
                splits.append(mydoc)
        else:
            splits = [cur_doc]
        docs.extend(splits)

        ## Assign Labels to corresponding documents
        for qa in paragraph["qas"]:
            if not qa.get("is_impossible", False):
                for answer in qa["answers"]:
                    ans = answer["text"]
                    cur_ans_start = None
                    # TODO The following block of code means that answer_start is never calculated
                    #  and cur_id is always None for open_domain
                    #  This can be rewritten so that this function could try to calculate offsets
                    #  and populate id in open_domain mode
                    if open_domain:
                        cur_ans_start = answer.get("answer_start", 0)
                        cur_id = '0'
                    else:
                        ans_position = cur_doc.text[
                            answer["answer_start"]:answer["answer_start"] +
                            len(ans)]
                        if ans != ans_position:
                            # do not use answer
                            problematic_ids.append(qa.get("id", "missing"))
                            break
                        # find corresponding document or split
                        if len(splits) == 1:
                            cur_id = splits[0].id
                            cur_ans_start = answer["answer_start"]
                        else:
                            for s in splits:
                                # If answer start offset is contained in passage we assign the label to that passage
                                if (answer["answer_start"] >=
                                        s.meta["_split_offset"]) and (
                                            answer["answer_start"] <
                                            (s.meta["_split_offset"] +
                                             len(s.text))):
                                    cur_id = s.id
                                    cur_ans_start = answer[
                                        "answer_start"] - s.meta[
                                            "_split_offset"]
                                    # If a document is splitting an answer we add the whole answer text to the document
                                    if s.text[cur_ans_start:cur_ans_start +
                                              len(ans)] != ans:
                                        s.text = s.text[:cur_ans_start] + ans
                                    break
                    label = Label(
                        question=qa["question"],
                        answer=ans,
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_id,
                        offset_start_in_doc=cur_ans_start,
                        no_answer=qa.get("is_impossible", False),
                        origin="gold_label",
                    )
                    labels.append(label)
            else:
                # for no_answer we need to assign each split as not fitting to the question
                for s in splits:
                    label = Label(
                        question=qa["question"],
                        answer="",
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=s.id,
                        offset_start_in_doc=0,
                        no_answer=qa.get("is_impossible", False),
                        origin="gold_label",
                    )
                    labels.append(label)

    return docs, labels, problematic_ids
Beispiel #18
0
def test_multilabel_no_answer(document_store):
    """'No answer' labels for one question are aggregated into a single MultiLabel covering two documents."""
    test_index = "haystack_test_multilabel_no_answer"
    # Fields shared by every label in this test
    shared = {
        "question": "question",
        "answer": "",
        "is_correct_document": True,
        "offset_start_in_doc": 0,
        "no_answer": True,
        "origin": "gold_label",
    }
    labels = [
        Label(is_correct_answer=True, document_id="777", **shared),
        # no answer in different doc
        Label(is_correct_answer=True, document_id="123", **shared),
        # no answer in same doc, should be excluded
        Label(is_correct_answer=True, document_id="777", **shared),
        # no answer with is_correct_answer=False, should be excluded
        Label(is_correct_answer=False, document_id="321", **shared),
    ]

    document_store.write_labels(labels, index=test_index)
    aggregated = document_store.get_all_labels_aggregated(index=test_index)
    stored = document_store.get_all_labels(index=test_index)

    assert len(aggregated) == 1
    assert len(stored) == 4

    merged = aggregated[0]
    assert len(merged.multiple_document_ids) == 2
    # answers, document ids and offsets must stay aligned
    assert len(merged.multiple_answers) \
           == len(merged.multiple_document_ids) \
           == len(merged.multiple_offset_start_in_docs)

    # clean up
    document_store.delete_all_documents(index=test_index)