def eval_data_from_file(filename: str, max_docs: Union[int, bool] = None) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a SQuAD-style file.

    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    :param filename: Path to file in SQuAD format
    :param max_docs: This sets the number of documents that will be loaded. It is used as the upper bound of a
                     slice over the file's "data" list. By default, this is set to None, thus reading in all
                     available eval documents.
    :return: (List of Documents, List of Labels)
    """
    # JSON is UTF-8 encoded; do not depend on the platform's locale encoding.
    with open(filename, "r", encoding="utf-8") as file:
        data = json.load(file)

    entries = data["data"]
    # Only peek at the first entry when there is one, so that a file with an
    # empty "data" list yields empty results instead of an IndexError.
    if entries and "title" not in entries[0]:
        logger.warning(f"No title information found for documents in QA file: {filename}")

    docs = []
    labels = []
    for document in entries[:max_docs]:
        # get all extra fields from document level (everything but paragraphs and title)
        meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
        for paragraph in document["paragraphs"]:
            cur_meta = {"name": document.get("title", None)}
            # all other fields from paragraph level
            cur_meta.update({k: v for k, v in paragraph.items() if k not in ("qas", "context")})
            # meta from parent document (applied last, so it wins on key clashes)
            cur_meta.update(meta_doc)

            # Create Document
            cur_doc = Document(text=paragraph["context"], meta=cur_meta)
            docs.append(cur_doc)

            # Get Labels
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                # "is_impossible" is absent from SQuAD v1.1 style files; fall back to
                # "no answer" exactly when the question carries no answers.
                no_answer = qa.get("is_impossible", not answers)
                # A question without answers still gets one label with an empty answer at offset 0.
                gold_spans = [(a["text"], a["answer_start"]) for a in answers] or [("", 0)]
                for answer_text, answer_start in gold_spans:
                    labels.append(
                        Label(
                            question=qa["question"],
                            answer=answer_text,
                            is_correct_answer=True,
                            is_correct_document=True,
                            document_id=cur_doc.id,
                            offset_start_in_doc=answer_start,
                            no_answer=no_answer,
                            origin="gold_label",
                        )
                    )

    return docs, labels
def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a SQuAD-style file.

    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    :param filename: Path to file in SQuAD format
    :return: (List of Documents, List of Labels)
    """
    # JSON is UTF-8 encoded; do not depend on the platform's locale encoding.
    with open(filename, "r", encoding="utf-8") as file:
        data = json.load(file)

    docs = []
    labels = []
    for document in data["data"]:
        # get all extra fields from document level (everything but paragraphs and title)
        meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
        for paragraph in document["paragraphs"]:
            # Documents without a title get name=None instead of raising a KeyError.
            cur_meta = {"name": document.get("title", None)}
            # all other fields from paragraph level
            cur_meta.update({k: v for k, v in paragraph.items() if k not in ("qas", "context")})
            # meta from parent document (applied last, so it wins on key clashes)
            cur_meta.update(meta_doc)

            # Create Document
            cur_doc = Document(text=paragraph["context"], meta=cur_meta)
            docs.append(cur_doc)

            # Get Labels
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                # "is_impossible" is absent from SQuAD v1.1 style files; fall back to
                # "no answer" exactly when the question carries no answers.
                no_answer = qa.get("is_impossible", not answers)
                # A question without answers still gets one label with an empty answer at offset 0.
                gold_spans = [(a["text"], a["answer_start"]) for a in answers] or [("", 0)]
                for answer_text, answer_start in gold_spans:
                    labels.append(
                        Label(
                            question=qa["question"],
                            answer=answer_text,
                            is_correct_answer=True,
                            is_correct_document=True,
                            document_id=cur_doc.id,
                            offset_start_in_doc=answer_start,
                            no_answer=no_answer,
                            origin="gold_label",
                        )
                    )

    return docs, labels
def _extract_docs_and_labels_from_dict(document_dict: Dict): docs = [] labels = [] # get all extra fields from document level (e.g. title) meta_doc = { k: v for k, v in document_dict.items() if k not in ("paragraphs", "title") } for paragraph in document_dict["paragraphs"]: cur_meta = {"name": document_dict.get("title", None)} # all other fields from paragraph level meta_paragraph = { k: v for k, v in paragraph.items() if k not in ("qas", "context") } cur_meta.update(meta_paragraph) # meta from parent document cur_meta.update(meta_doc) # Create Document cur_doc = Document(text=paragraph["context"], meta=cur_meta) docs.append(cur_doc) # Get Labels for qa in paragraph["qas"]: if len(qa["answers"]) > 0: for answer in qa["answers"]: label = Label( question=qa["question"], answer=answer["text"], is_correct_answer=True, is_correct_document=True, document_id=cur_doc.id, offset_start_in_doc=answer["answer_start"], no_answer=qa["is_impossible"], origin="gold_label", ) labels.append(label) else: label = Label( question=qa["question"], answer="", is_correct_answer=True, is_correct_document=True, document_id=cur_doc.id, offset_start_in_doc=0, no_answer=qa["is_impossible"], origin="gold_label", ) labels.append(label) return docs, labels
def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = None):
    """
    Write annotation labels to the label index with a single bulk request.

    :param labels: Label objects or dicts; dicts are converted with Label.from_dict.
    :param index: Target index. Falls back to self.label_index when not given.
    """
    index = index or self.label_index
    if index and not self.client.indices.exists(index=index):
        self._create_label_index(index)

    # Convert everything up front so that all entries comply with the Label class format
    # before any bulk action is assembled.
    label_objects = []
    for candidate in labels:
        if isinstance(candidate, dict):
            candidate = Label.from_dict(candidate)
        label_objects.append(candidate)

    # The label's own fields are spread last, after the bulk metadata keys.
    actions = [
        {
            "_op_type": "index" if self.update_existing_documents else "create",
            "_index": index,
            **label.to_dict(),
        }
        for label in label_objects
    ]
    bulk(self.client, actions, request_timeout=300, refresh=self.refresh_type)
def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
    """
    Write annotation labels into document store.

    Labels are stored under their own id, so a label whose id is already present
    replaces the stored one; a warning listing such ids is logged first.

    :param labels: Label objects or dicts; dicts are converted with Label.from_dict.
    :param index: Target index. Falls back to self.label_index when not given.
    """
    index = index or self.label_index
    label_objects = [
        Label.from_dict(entry) if isinstance(entry, dict) else entry
        for entry in labels
    ]

    duplicate_ids: list = [
        duplicate.id
        for duplicate in self._get_duplicate_labels(label_objects, index=index)
    ]
    if duplicate_ids:
        logger.warning(
            "Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
            " This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
            " the answer annotation and not the question."
            f" Problematic ids: {','.join(duplicate_ids)}"
        )

    for label in label_objects:
        # create timestamps if not available yet
        if not label.created_at:
            label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
        if not label.updated_at:
            label.updated_at = label.created_at
        self.indexes[index][label.id] = label
def get_all_labels(
    self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None
) -> List[Label]:
    """
    Return all labels stored in an index.

    :param index: Index to read from. Falls back to self.label_index when not given.
    :param filters: Passed through unchanged to get_all_documents_in_index.
    :return: One Label per returned hit, built from the hit's "_source".
    """
    hits = self.get_all_documents_in_index(index=index or self.label_index, filters=filters)
    return [Label.from_dict(hit["_source"]) for hit in hits]
def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
    """
    Write annotation labels into document store.

    Every label is stored under a freshly generated uuid4 string key.

    :param labels: Label objects or dicts; dicts are converted with Label.from_dict.
    :param index: Target index. Falls back to self.label_index when not given.
    """
    target_index = index or self.label_index

    # Convert everything first so a failing conversion leaves the store untouched.
    converted = []
    for candidate in labels:
        if isinstance(candidate, dict):
            candidate = Label.from_dict(candidate)
        converted.append(candidate)

    for label in converted:
        generated_key = str(uuid4())
        self.indexes[target_index][generated_key] = label
def get_all_labels(
    self,
    index: Optional[str] = None,
    filters: Optional[Dict[str, List[str]]] = None,
    batch_size: int = 10_000,
) -> List[Label]:
    """
    Return all labels in the document store

    :param index: Index to read from. Falls back to self.label_index when not given.
    :param filters: Passed through unchanged to _get_all_documents_in_index.
    :param batch_size: Passed through unchanged to _get_all_documents_in_index.
    :return: One Label per returned hit, built from the hit's "_source".
    """
    # Drain the hit iterable completely before converting anything.
    hits = [
        *self._get_all_documents_in_index(
            index=index or self.label_index, filters=filters, batch_size=batch_size
        )
    ]
    return [Label.from_dict(hit["_source"]) for hit in hits]
def write_labels(
    self, labels: Union[List[Label], List[dict]], index: Optional[str] = None, batch_size: int = 10_000
):
    """Write annotation labels into document store.

    :param labels: A list of Python dictionaries or a list of Haystack Label objects.
    :param index: Target index. Falls back to self.label_index when not given.
    :param batch_size: Number of labels that are passed to Elasticsearch's bulk function at a time.
    """
    index = index or self.label_index
    if index and not self.client.indices.exists(index=index):
        self._create_label_index(index)

    def _send(actions):
        # One bulk request for the labels collected so far.
        bulk(self.client, actions, request_timeout=300, refresh=self.refresh_type)

    pending = []
    for candidate in labels:
        # Make sure we comply to Label class format
        label = Label.from_dict(candidate) if isinstance(candidate, dict) else candidate

        # create timestamps if not available yet
        if not label.created_at:
            label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
        if not label.updated_at:
            label.updated_at = label.created_at

        action = {
            "_op_type": "index" if self.update_existing_documents else "create",
            "_index": index,
            **label.to_dict(),
        }  # type: Dict[str, Any]

        # rename id for elastic
        if label.id is not None:
            action["_id"] = str(action.pop("id"))

        pending.append(action)

        # Pass batch_size number of labels to bulk
        if len(pending) % batch_size == 0:
            _send(pending)
            pending = []

    # Send whatever is left over from the last, incomplete batch.
    if pending:
        _send(pending)
def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
    """
    Write annotation labels into document store.

    Every label is stored under a freshly generated uuid4 string key. Missing
    created_at / updated_at values are filled in before the label is stored.

    :param labels: Label objects or dicts; dicts are converted with Label.from_dict.
    :param index: Target index. Falls back to self.label_index when not given.
    """
    target_index = index or self.label_index

    # Convert everything first so a failing conversion leaves the store untouched.
    converted = []
    for candidate in labels:
        if isinstance(candidate, dict):
            candidate = Label.from_dict(candidate)
        converted.append(candidate)

    for label in converted:
        generated_key = str(uuid4())
        # create timestamps if not available yet
        if not label.created_at:
            label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
        if not label.updated_at:
            label.updated_at = label.created_at
        self.indexes[target_index][generated_key] = label
def _convert_sql_row_to_label(self, row) -> Label:
    """
    Build a Label from the attributes of a SQL result row.

    :param row: Object exposing the label fields listed below as attributes.
    :return: Label populated with those attribute values.
    """
    # Label keyword names equal the row attribute names one to one.
    copied_fields = (
        "document_id",
        "no_answer",
        "origin",
        "question",
        "is_correct_answer",
        "is_correct_document",
        "answer",
        "offset_start_in_doc",
        "model_id",
    )
    return Label(**{field: getattr(row, field) for field in copied_fields})
def test_labels(document_store):
    """A label written to a named index is returned for that index only, not for the default one."""
    test_index = "haystack_test_label"
    gold_label = Label(
        question="question",
        answer="answer",
        is_correct_answer=True,
        is_correct_document=True,
        document_id="123",
        offset_start_in_doc=12,
        no_answer=False,
        origin="gold_label",
    )
    document_store.write_labels([gold_label], index=test_index)

    in_test_index = document_store.get_all_labels(index=test_index)
    assert len(in_test_index) == 1

    # The default label index must not have received the label.
    in_default_index = document_store.get_all_labels()
    assert len(in_default_index) == 0
def write_labels(self, labels, index=None):
    """
    Write annotation labels into the SQL document store.

    One LabelORM row is added to the session per label and everything is committed at the end.

    :param labels: Label objects or dicts; dicts are converted with Label.from_dict.
    :param index: Index name stored on each row. Falls back to self.label_index when not given.
    """
    # Convert everything first so a failing conversion adds nothing to the session.
    converted = []
    for candidate in labels:
        if isinstance(candidate, dict):
            candidate = Label.from_dict(candidate)
        converted.append(candidate)

    target_index = index or self.label_index

    # LabelORM column names equal the Label attribute names one to one.
    copied_fields = (
        "document_id",
        "no_answer",
        "origin",
        "question",
        "is_correct_answer",
        "is_correct_document",
        "answer",
        "offset_start_in_doc",
        "model_id",
    )
    for label in converted:
        row_values = {field: getattr(label, field) for field in copied_fields}
        self.session.add(LabelORM(index=target_index, **row_values))
    self.session.commit()
def write_labels(self, labels, index=None):
    """
    Write annotation labels into document store.

    Labels whose id is reported by self._get_duplicate_labels are merged into the
    session (after a warning is logged); all others are added. Everything is
    committed at the end.

    :param labels: Label objects or dicts; dicts are converted with Label.from_dict.
    :param index: Index name stored on each row. Falls back to self.label_index when not given.
    """
    converted = []
    for candidate in labels:
        if isinstance(candidate, dict):
            candidate = Label.from_dict(candidate)
        converted.append(candidate)

    target_index = index or self.label_index

    duplicate_ids: list = [
        duplicate.id
        for duplicate in self._get_duplicate_labels(converted, index=target_index)
    ]
    if duplicate_ids:
        logger.warning(
            "Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
            " This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
            " the answer annotation and not the question."
            f" Problematic ids: {','.join(duplicate_ids)}"
        )

    # LabelORM column names equal the Label attribute names one to one.
    copied_fields = (
        "id",
        "document_id",
        "no_answer",
        "origin",
        "question",
        "is_correct_answer",
        "is_correct_document",
        "answer",
        "offset_start_in_doc",
        "model_id",
    )
    # TODO: Use batch_size
    for label in converted:
        row_values = {field: getattr(label, field) for field in copied_fields}
        label_orm = LabelORM(index=target_index, **row_values)
        # merge() is used for ids flagged as duplicates, add() for all other labels.
        persist = self.session.merge if label.id in duplicate_ids else self.session.add
        persist(label_orm)
    self.session.commit()
def test_multilabel(document_store):
    """Labels for the same question are aggregated into one multi label, leaving out excluded ones."""
    test_index = "haystack_test_multilabel"

    # (answer, is_correct_answer, document_id, offset_start_in_doc, no_answer)
    label_specs = [
        ("answer1", True, "123", 12, False),
        # different answer in same doc
        ("answer2", True, "123", 42, False),
        # answer in different doc
        ("answer3", True, "321", 7, False),
        # 'no answer', should be excluded from MultiLabel
        ("", True, "777", 0, True),
        # is_correct_answer=False, should be excluded from MultiLabel
        ("answer5", False, "123", 99, True),
    ]
    labels = [
        Label(
            question="question",
            answer=answer,
            is_correct_answer=is_correct_answer,
            is_correct_document=True,
            document_id=document_id,
            offset_start_in_doc=offset,
            no_answer=no_answer,
            origin="gold_label",
        )
        for answer, is_correct_answer, document_id, offset, no_answer in label_specs
    ]
    document_store.write_labels(labels, index=test_index)

    multi_labels = document_store.get_all_labels_aggregated(index=test_index)
    labels = document_store.get_all_labels(index=test_index)

    assert len(multi_labels) == 1
    assert len(labels) == 5

    aggregated = multi_labels[0]
    assert len(aggregated.multiple_answers) == 3
    assert (
        len(aggregated.multiple_answers)
        == len(aggregated.multiple_document_ids)
        == len(aggregated.multiple_offset_start_in_docs)
    )

    # The default label index must not have received anything.
    multi_labels = document_store.get_all_labels_aggregated()
    assert len(multi_labels) == 0

    # clean up
    document_store.delete_all_documents(index=test_index)
def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PreProcessor = None): docs = [] labels = [] # get all extra fields from document level (e.g. title) meta_doc = { k: v for k, v in document_dict.items() if k not in ("paragraphs", "title") } for paragraph in document_dict["paragraphs"]: ## Create Metadata cur_meta = {"name": document_dict.get("title", None)} # all other fields from paragraph level meta_paragraph = { k: v for k, v in paragraph.items() if k not in ("qas", "context") } cur_meta.update(meta_paragraph) # meta from parent document cur_meta.update(meta_doc) ## Create Document cur_doc = Document(text=paragraph["context"], meta=cur_meta) if preprocessor is not None: splits_dicts = preprocessor.process(cur_doc.to_dict()) # we need to pull in _split_id into the document id for unique reference in labels # todo: PreProcessor should work on Documents instead of dicts splits = [] offset = 0 for d in splits_dicts: id = f"{d['id']}-{d['meta']['_split_id']}" d["meta"]["_split_offset"] = offset offset += len(d["text"]) # offset correction based on splitting method if preprocessor.split_by == "word": offset += 1 elif preprocessor.split_by == "passage": offset += 2 else: raise NotImplementedError mydoc = Document(text=d["text"], id=id, meta=d["meta"]) splits.append(mydoc) else: splits = [cur_doc] docs.extend(splits) ## Assign Labels to corresponding documents for qa in paragraph["qas"]: if not qa["is_impossible"]: for answer in qa["answers"]: ans = answer["text"] ans_position = cur_doc.text[ answer["answer_start"]:answer["answer_start"] + len(ans)] if ans != ans_position: logger.warning( f"Answer Text and Answer position mismatch. 
Skipping Answer" ) break # find corresponding document or split if len(splits) == 1: cur_id = splits[0].id cur_ans_start = answer["answer_start"] else: for s in splits: # If answer start offset is contained in passage we assign the label to that passage if (answer["answer_start"] >= s.meta["_split_offset"]) and ( answer["answer_start"] < (s.meta["_split_offset"] + len(s.text))): cur_id = s.id cur_ans_start = answer[ "answer_start"] - s.meta["_split_offset"] # If a document is splitting an answer we add the whole answer text to the document if s.text[cur_ans_start:cur_ans_start + len(ans)] != ans: s.text = s.text[:cur_ans_start] + ans break label = Label( question=qa["question"], answer=ans, is_correct_answer=True, is_correct_document=True, document_id=cur_id, offset_start_in_doc=cur_ans_start, no_answer=qa["is_impossible"], origin="gold_label", ) labels.append(label) else: # for no_answer we need to assign each split as not fitting to the question for s in splits: label = Label( question=qa["question"], answer="", is_correct_answer=True, is_correct_document=True, document_id=s.id, offset_start_in_doc=0, no_answer=qa["is_impossible"], origin="gold_label", ) labels.append(label) return docs, labels
def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PreProcessor = None, open_domain: bool = False): """Set open_domain to True if you are trying to load open_domain labels (i.e. labels without doc id or start idx)""" docs = [] labels = [] problematic_ids = [] # get all extra fields from document level (e.g. title) meta_doc = { k: v for k, v in document_dict.items() if k not in ("paragraphs", "title") } for paragraph in document_dict["paragraphs"]: ## Create Metadata cur_meta = {"name": document_dict.get("title", None)} # all other fields from paragraph level meta_paragraph = { k: v for k, v in paragraph.items() if k not in ("qas", "context") } cur_meta.update(meta_paragraph) # meta from parent document cur_meta.update(meta_doc) ## Create Document cur_doc = Document(text=paragraph["context"], meta=cur_meta) if preprocessor is not None: splits_dicts = preprocessor.process(cur_doc.to_dict()) # we need to pull in _split_id into the document id for unique reference in labels # todo: PreProcessor should work on Documents instead of dicts splits: List[Document] = [] offset = 0 for d in splits_dicts: id = f"{d['id']}-{d['meta']['_split_id']}" d["meta"]["_split_offset"] = offset offset += len(d["text"]) # offset correction based on splitting method if preprocessor.split_by == "word": offset += 1 elif preprocessor.split_by == "passage": offset += 2 else: raise NotImplementedError mydoc = Document(text=d["text"], id=id, meta=d["meta"]) splits.append(mydoc) else: splits = [cur_doc] docs.extend(splits) ## Assign Labels to corresponding documents for qa in paragraph["qas"]: if not qa.get("is_impossible", False): for answer in qa["answers"]: ans = answer["text"] cur_ans_start = None # TODO The following block of code means that answer_start is never calculated # and cur_id is always None for open_domain # This can be rewritten so that this function could try to calculate offsets # and populate id in open_domain mode if open_domain: cur_ans_start = 
answer.get("answer_start", 0) cur_id = '0' else: ans_position = cur_doc.text[ answer["answer_start"]:answer["answer_start"] + len(ans)] if ans != ans_position: # do not use answer problematic_ids.append(qa.get("id", "missing")) break # find corresponding document or split if len(splits) == 1: cur_id = splits[0].id cur_ans_start = answer["answer_start"] else: for s in splits: # If answer start offset is contained in passage we assign the label to that passage if (answer["answer_start"] >= s.meta["_split_offset"]) and ( answer["answer_start"] < (s.meta["_split_offset"] + len(s.text))): cur_id = s.id cur_ans_start = answer[ "answer_start"] - s.meta[ "_split_offset"] # If a document is splitting an answer we add the whole answer text to the document if s.text[cur_ans_start:cur_ans_start + len(ans)] != ans: s.text = s.text[:cur_ans_start] + ans break label = Label( question=qa["question"], answer=ans, is_correct_answer=True, is_correct_document=True, document_id=cur_id, offset_start_in_doc=cur_ans_start, no_answer=qa.get("is_impossible", False), origin="gold_label", ) labels.append(label) else: # for no_answer we need to assign each split as not fitting to the question for s in splits: label = Label( question=qa["question"], answer="", is_correct_answer=True, is_correct_document=True, document_id=s.id, offset_start_in_doc=0, no_answer=qa.get("is_impossible", False), origin="gold_label", ) labels.append(label) return docs, labels, problematic_ids
def test_multilabel_no_answer(document_store):
    """'No answer' labels for one question are aggregated into one multi label, leaving out excluded ones."""
    test_index = "haystack_test_multilabel_no_answer"

    # (is_correct_answer, document_id)
    label_specs = [
        (True, "777"),
        # no answer in different doc
        (True, "123"),
        # no answer in same doc, should be excluded
        (True, "777"),
        # no answer with is_correct_answer=False, should be excluded
        (False, "321"),
    ]
    labels = [
        Label(
            question="question",
            answer="",
            is_correct_answer=is_correct_answer,
            is_correct_document=True,
            document_id=document_id,
            offset_start_in_doc=0,
            no_answer=True,
            origin="gold_label",
        )
        for is_correct_answer, document_id in label_specs
    ]
    document_store.write_labels(labels, index=test_index)

    multi_labels = document_store.get_all_labels_aggregated(index=test_index)
    labels = document_store.get_all_labels(index=test_index)

    assert len(multi_labels) == 1
    assert len(labels) == 4

    aggregated = multi_labels[0]
    assert len(aggregated.multiple_document_ids) == 2
    assert (
        len(aggregated.multiple_answers)
        == len(aggregated.multiple_document_ids)
        == len(aggregated.multiple_offset_start_in_docs)
    )

    # clean up
    document_store.delete_all_documents(index=test_index)