def _convert_sql_row_to_document(self, row) -> Document:
    document = Document(
        id=row.id,
        text=row.text,
        meta={meta.name: meta.value for meta in row.meta}
    )
    if row.vector_id:
        document.meta["vector_id"] = row.vector_id  # type: ignore
    return document
def _convert_es_hit_to_document(self, hit: dict, adapt_score_for_embedding: bool = False) -> Document:
    # We put all additional data of the doc into meta_data and return it in the API
    meta_data = {
        k: v
        for k, v in hit["_source"].items()
        if k not in (self.text_field, self.faq_question_field, self.embedding_field)
    }
    name = meta_data.pop(self.name_field, None)
    if name:
        meta_data["name"] = name

    score = hit["_score"] if hit["_score"] else None
    if score:
        if adapt_score_for_embedding:
            score -= 1
            probability = (score + 1) / 2  # scaling probability from cosine similarity
        else:
            probability = float(expit(np.asarray(score / 8)))  # scaling probability from TFIDF/BM25
    else:
        probability = None

    document = Document(
        id=hit["_id"],
        text=hit["_source"].get(self.text_field),
        meta=meta_data,
        score=score,
        probability=probability,
        question=hit["_source"].get(self.faq_question_field),
        embedding=hit["_source"].get(self.embedding_field),
    )
    return document
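
# Illustrative sketch (added, not from the original source): the minimal shape of an Elasticsearch
# hit that _convert_es_hit_to_document consumes. "_id", "_score" and "_source" are standard keys of
# an Elasticsearch search response; the concrete source field names depend on how the store was
# configured (self.text_field etc.), so the "text"/"name" keys used here are assumptions.
# `_example_convert_hit` and its `document_store` argument are hypothetical, for illustration only.
def _example_convert_hit(document_store) -> Document:
    hit = {
        "_id": "a1b2c3",
        "_score": 12.3,
        "_source": {
            "text": "Arya Stark is the daughter of Eddard Stark.",
            "name": "got_wiki_arya",
            "author": "somebody",  # any extra field ends up in Document.meta
        },
    }
    # default adapt_score_for_embedding=False -> BM25/TFIDF score is squashed into a probability via expit
    return document_store._convert_es_hit_to_document(hit)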
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Indexes documents for later queries.

    :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                      For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                      Optionally: Include meta data via {"text": "<the-actual-text>",
                      "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                      It can be used for filtering and is accessible in the responses of the Finder.
    :param index: add an optional index attribute to documents. It can be later used for filtering. For instance,
                  documents for evaluation can be indexed in a separate index than the documents for search.

    :return: None
    """
    # Make sure we comply to Document class format
    document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
    index = index or self.index
    for doc in document_objects:
        meta_fields = doc.meta or {}
        vector_id = meta_fields.get("vector_id")
        meta_orms = [MetaORM(name=key, value=value) for key, value in meta_fields.items()]
        doc_orm = DocumentORM(id=doc.id, text=doc.text, vector_id=vector_id, meta=meta_orms, index=index)
        self.session.add(doc_orm)
    self.session.commit()
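
# Hedged usage sketch (added for illustration): writing dictionaries to a SQL-backed store through the
# write_documents method above. It assumes a SQLDocumentStore-like instance has been constructed
# elsewhere and is passed in; the dict format follows the docstring above. `_example_write_to_sql`
# and the index name "eval_docs" are hypothetical, not part of the library.
def _example_write_to_sql(document_store) -> None:
    document_store.write_documents(
        [
            {"text": "Arya Stark is the daughter of Eddard Stark.",
             "meta": {"name": "got_wiki_arya", "author": "somebody"}},
            {"text": "Jon Snow joined the Night's Watch.",
             "meta": {"name": "got_wiki_jon"}},
        ],
        index="eval_docs",  # optional: keep evaluation documents separate from search documents
    )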
def predict_on_texts(self, question: str, texts: List[str], top_k: Optional[int] = None):
    """
    Use loaded QA model to find answers for a question in the supplied list of texts.
    Returns dictionaries containing answers sorted by (desc.) probability.

    Example:
        {
            'question': 'Who is the father of Arya Stark?',
            'answers': [
                {'answer': 'Eddard,',
                 'context': " She travels with her father, Eddard, to King's Landing when he is ",
                 'offset_answer_start': 147,
                 'offset_answer_end': 154,
                 'probability': 0.9787139466668613,
                 'score': None,
                 'document_id': '1337'
                 }, ...
            ]
        }

    :param question: Question string
    :param texts: List of texts (plain strings) in which to search for answers
    :param top_k: The maximum number of answers to return

    :return: Dict containing question and answers
    """
    documents = []
    for text in texts:
        documents.append(Document(text=text))
    predictions = self.predict(question, documents, top_k)
    return predictions
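
# Hedged usage sketch (added for illustration): calling predict_on_texts on an already loaded QA
# reader (e.g. a FARM- or Transformers-based reader). How the reader is constructed is assumed to
# happen elsewhere; `_example_predict_on_texts` and the sample texts are hypothetical.
def _example_predict_on_texts(reader) -> dict:
    texts = [
        "Arya Stark is the daughter of Eddard Stark.",
        "Jon Snow joined the Night's Watch.",
    ]
    result = reader.predict_on_texts(question="Who is the father of Arya Stark?", texts=texts, top_k=3)
    # result["answers"] is sorted by descending probability, as in the docstring example above
    return result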
def orconvqa_build_corpus(filename: str, limit_lines: int = 0) -> List[Document]:
    """
    :param filename: Name of a json file containing the text blocks. Each line should be in json format and
                     contain at least a 'text' and an 'id' entry.
    :param limit_lines: For testing purposes, use only N lines instead of the entire corpus.
                        This is because creating embeddings for the entire development corpus takes a lot of time.
                        The corpus contains ca. 65000 lines. Make the limit 0 (=zero) to use everything.
    :return: List of Documents
    """
    docs = []
    with open(filename, 'r') as file:
        for idx, block in enumerate(file.readlines()):
            if 0 < limit_lines <= idx:
                # stop reading lines.
                break
            try:
                block = json.loads(block)
            except json.JSONDecodeError:
                raise ValueError(f'Error occurred reading json block on line {idx} of file: {filename}')

            cur_meta = {"name": block["title"]}
            # all other fields on block level, e.g. id, aid, bid
            block_meta = {k: v for k, v in block.items() if k not in ['text', 'id']}
            cur_meta.update(block_meta)

            # Create Document
            cur_doc = Document(id=block['id'], text=block["text"], meta=cur_meta)
            docs.append(cur_doc)
    return docs
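
# Hedged usage sketch (added for illustration): building a tiny corpus with orconvqa_build_corpus.
# Each line of the input file is a standalone JSON object; besides 'text' and 'id', the function
# above also reads 'title', and any remaining fields (e.g. 'aid', 'bid') are copied into meta.
# The file name and its contents are made up for illustration.
def _example_build_orconvqa_corpus() -> List[Document]:
    sample_lines = [
        '{"id": "block_0", "title": "Arya Stark", "text": "Arya Stark is the daughter of Eddard Stark.", "aid": 0, "bid": 0}',
        '{"id": "block_1", "title": "Jon Snow", "text": "Jon Snow joined the Night\'s Watch.", "aid": 1, "bid": 0}',
    ]
    with open("orconvqa_sample.jsonl", "w") as f:
        f.write("\n".join(sample_lines))
    return orconvqa_build_corpus("orconvqa_sample.jsonl", limit_lines=0)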
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Indexes documents for later queries in Elasticsearch.

    When using explicit document IDs, any existing document with the same ID gets updated.

    :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                      For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                      Optionally: Include meta data via {"text": "<the-actual-text>",
                      "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                      It can be used for filtering and is accessible in the responses of the Finder.
                      Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                      should be changed to what you have set for self.text_field and self.name_field.
    :param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
    :return: None
    """
    if index and not self.client.indices.exists(index=index):
        self._create_document_index(index)

    if index is None:
        index = self.index

    # Make sure we comply to Document class format
    documents_objects = [Document.from_dict(d, field_map=self._create_document_field_map())
                         if isinstance(d, dict) else d for d in documents]

    documents_to_index = []
    for doc in documents_objects:
        _doc = {
            "_op_type": "index" if self.update_existing_documents else "create",
            "_index": index,
            **doc.to_dict(field_map=self._create_document_field_map())
        }  # type: Dict[str, Any]

        # cast embedding type as ES cannot deal with np.array
        if _doc[self.embedding_field] is not None:
            if type(_doc[self.embedding_field]) == np.ndarray:
                _doc[self.embedding_field] = _doc[self.embedding_field].tolist()

        # rename id for elastic
        _doc["_id"] = str(_doc.pop("id"))

        # don't index query score and empty fields
        _ = _doc.pop("score", None)
        _ = _doc.pop("probability", None)
        _doc = {k: v for k, v in _doc.items() if v is not None}

        # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
        # we "unnest" all values within "meta"
        if "meta" in _doc.keys():
            for k, v in _doc["meta"].items():
                _doc[k] = v
            _doc.pop("meta")
        documents_to_index.append(_doc)
    bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)
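
# Hedged usage sketch (added for illustration): writing a Document with a numpy embedding through the
# Elasticsearch write_documents above; the method converts the np.ndarray to a plain list before bulk
# indexing. Assumes an ElasticsearchDocumentStore-like instance with a configured embedding field;
# `_example_write_with_embedding` and the 768-dim embedding size are illustrative assumptions.
def _example_write_with_embedding(document_store) -> None:
    doc = Document(
        text="Arya Stark is the daughter of Eddard Stark.",
        meta={"name": "got_wiki_arya"},
        embedding=np.random.rand(768).astype("float32"),
    )
    document_store.write_documents([doc])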
def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a SQuAD-style file.
    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    :param filename: Path to file in SQuAD format
    :return: (List of Documents, List of Labels)
    """
    docs = []
    labels = []

    with open(filename, "r") as file:
        data = json.load(file)
        for document in data["data"]:
            # get all extra fields from document level (e.g. title)
            meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
            for paragraph in document["paragraphs"]:
                cur_meta = {"name": document["title"]}
                # all other fields from paragraph level
                meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
                cur_meta.update(meta_paragraph)
                # meta from parent document
                cur_meta.update(meta_doc)

                # Create Document
                cur_doc = Document(text=paragraph["context"], meta=cur_meta)
                docs.append(cur_doc)

                # Get Labels
                for qa in paragraph["qas"]:
                    if len(qa["answers"]) > 0:
                        for answer in qa["answers"]:
                            label = Label(
                                question=qa["question"],
                                answer=answer["text"],
                                is_correct_answer=True,
                                is_correct_document=True,
                                document_id=cur_doc.id,
                                offset_start_in_doc=answer["answer_start"],
                                no_answer=qa["is_impossible"],
                                origin="gold_label",
                            )
                            labels.append(label)
                    else:
                        label = Label(
                            question=qa["question"],
                            answer="",
                            is_correct_answer=True,
                            is_correct_document=True,
                            document_id=cur_doc.id,
                            offset_start_in_doc=0,
                            no_answer=qa["is_impossible"],
                            origin="gold_label",
                        )
                        labels.append(label)
    return docs, labels
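
# Hedged usage sketch (added for illustration): loading a SQuAD-style dev file and pushing the
# resulting Documents and Labels into a document store for evaluation. The file path, the index
# names, and the store's write_labels method are assumptions about the surrounding library;
# `_example_load_squad_eval_data` is a hypothetical helper.
def _example_load_squad_eval_data(document_store) -> None:
    docs, labels = eval_data_from_file("dev-v2.0.json")  # hypothetical path to a SQuAD-format file
    document_store.write_documents(docs, index="eval_document")
    document_store.write_labels(labels, index="label")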
def CoQA_read_file(filename: str) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a CoQA-style file.
    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    :param filename: Path to file in CoQA format
    :return: (List of Documents, List of Labels)
    """
    docs = []
    labels = []

    with open(filename, "r") as file:
        data = json.load(file)
        for document in data["data"]:
            # get all extra fields from document level (e.g. title)
            meta_doc = {k: v for k, v in document.items() if k not in ("questions", "answers")}

            cur_doc = Document(id=document['id'], text=document["story"], meta=meta_doc)
            docs.append(cur_doc)

            # Get Labels
            for q, a in zip(document["questions"], document['answers']):
                label = Label(
                    question=q["input_text"],
                    # TODO these are very short answers and may not always match with the span_start.
                    # The answer retrieved via span_text is longer; input_text is taken from that answer.
                    answer=a['input_text'],
                    is_correct_answer=True,
                    is_correct_document=True,
                    # We do not do an extra check if the document id exists in the corpus, this may cause issues later
                    document_id=cur_doc.id,
                    offset_start_in_doc=a["span_start"],
                    origin=filename,
                    previous_questions_in_conversation=[
                        pq['input_text'] for pq in document['questions'] if pq['turn_id'] < q['turn_id']
                    ]
                )
                labels.append(label)
    return docs, labels
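
# Hedged usage sketch (added for illustration): CoQA_read_file is called like eval_data_from_file,
# but each Label additionally carries previous_questions_in_conversation for conversational QA.
# The file name is made up for illustration.
def _example_load_coqa_eval_data() -> Tuple[List[Document], List[Label]]:
    docs, labels = CoQA_read_file("coqa-dev-v1.0.json")
    return docs, labels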
def train_index(self, documents: Optional[Union[List[dict], List[Document]]], embeddings: Optional[np.ndarray] = None):
    """
    Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors before you can add your final vectors.
    The train vectors should come from the same distribution as your final ones.

    You can pass either documents (incl. embeddings) or just the plain embeddings that the index shall be trained on.

    :param documents: Documents (incl. the embeddings)
    :param embeddings: Plain embeddings
    :return: None
    """
    if embeddings is not None and documents is not None:
        raise ValueError("Either pass `documents` or `embeddings`. You passed both.")
    if documents:
        document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
        embeddings = [doc.embedding for doc in document_objects]
        embeddings = np.array(embeddings, dtype="float32")
    self.faiss_index.train(embeddings)
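
# Hedged usage sketch (added for illustration): training an IVF-style FAISS index on a sample of
# embeddings before adding the final vectors. Assumes a FAISSDocumentStore-like instance whose
# faiss_index actually requires training; the sample size, the 768-dim embedding size, and
# `_example_train_faiss_index` are illustrative assumptions.
def _example_train_faiss_index(document_store) -> None:
    sample_embeddings = np.random.rand(10_000, 768).astype("float32")
    document_store.train_index(documents=None, embeddings=sample_embeddings)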