# Imports assumed by this snippet: `functools` and a logging implementation
# (the TyDi baseline uses absl logging), plus the project-local `data` module
# that provides the byte/candidate helpers used below.
import functools

from absl import logging

import data


def add_candidate_types_and_positions(json_dict, max_position):
  """Adds type and position info to each candidate in the document."""
  count = 0
  for _, cand in data.candidates_iter(json_dict):
    # Cap the position index at `max_position`; later candidates reuse it.
    if count < max_position:
      count += 1
    cand["type_and_position"] = "[Paragraph=%d]" % count
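
# Illustrative only, not part of the pipeline: a self-contained sketch of the
# "[Paragraph=N]" capping behavior above, without relying on the `data`
# module. The helper name `_sketch_paragraph_markers` is made up for this
# example.
def _sketch_paragraph_markers(num_candidates, max_position):
  markers = []
  count = 0
  for _ in range(num_candidates):
    if count < max_position:
      count += 1
    markers.append("[Paragraph=%d]" % count)
  return markers

# _sketch_paragraph_markers(5, max_position=3) returns:
#   ["[Paragraph=1]", "[Paragraph=2]", "[Paragraph=3]",
#    "[Paragraph=3]", "[Paragraph=3]"]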

def create_entry_from_json(json_dict, max_passages, max_position,
                           fail_on_invalid):
  """Creates a TyDi 'entry' from the raw JSON.

  The 'TyDiEntry' dict is an intermediate format that is later converted into
  the main `TyDiExample` format.

  This function looks up the chunks of text that are candidates for the
  passage answer task, inserts special context tokens such as "[ContextId=0]",
  and creates a byte index to byte index mapping between the document
  plaintext and the concatenation of the passage candidates (these could
  potentially exclude parts of the plaintext document and also include the
  special tokens).

  In the returned entry, `contexts` includes only the candidate passages and
  has special tokens such as [ContextId=0] added. `span_start` and `span_end`
  are byte-wise indices into `contexts` (not the original corpus plaintext).

  Args:
    json_dict: A single JSONL line, deserialized into a dict.
    max_passages: see FLAGS.max_passages.
    max_position: see FLAGS.max_position.
    fail_on_invalid: Immediately stop if an error is found?

  Returns:
    If a failure was encountered and `fail_on_invalid=False`, then returns
    an empty `dict`. Otherwise returns:
    'TyDiEntry' type: a dict-based format consumed by downstream functions:
    entry = {
        "name": str,
        "id": str,
        "language": str,
        "question": {"input_text": str},
        "answer": {
            "candidate_id": annotated_idx,
            "span_text": "",
            "span_start": -1,
            "span_end": -1,
            "input_text": "passage",
        },
        "has_correct_context": bool,
        # Includes special tokens appended.
        "contexts": str,
        # Context index to byte offset in `contexts`.
        "context_to_plaintext_offset": Dict[int, int],
        "plaintext": json_dict["document_plaintext"],
    }
  """
  add_candidate_types_and_positions(json_dict, max_position)
  for passage_answer in json_dict["passage_answer_candidates"]:
    if (passage_answer["plaintext_start_byte"] == -1 or
        passage_answer["plaintext_end_byte"] == -1):
      return {}

  # annotated_idx: index of the first annotated context, -1 if null.
  # annotated_min_ans: minimal answer start and end char offsets,
  #                    (-1, -1) if null.
  annotation, annotated_idx, annotated_min_ans = data.get_first_annotation(
      json_dict, max_passages)
  question = {"input_text": json_dict["question_text"]}
  answer = {
      "candidate_id": annotated_idx,
      "span_text": "",
      "span_start": -1,
      "span_end": -1,
      "input_text": "passage",
  }

  # Yes/no answers are added in the input text.
  if annotation is not None:
    assert annotation["yes_no_answer"] in ("YES", "NO", "NONE")
    if annotation["yes_no_answer"] in ("YES", "NO"):
      answer["input_text"] = annotation["yes_no_answer"].lower()

  # Add a minimal answer if one was found.
  if annotated_min_ans != (-1, -1):
    answer["input_text"] = "minimal"
    span_text = data.get_candidate_text(json_dict, annotated_idx).text
    try:
      answer["span_text"] = data.byte_slice(span_text, annotated_min_ans[0],
                                            annotated_min_ans[1])
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    # Local (within-passage) byte offsets.
    answer["span_start"] = annotated_min_ans[0]
    answer["span_end"] = annotated_min_ans[1]
    try:
      expected_answer_text = data.get_text_span(
          json_dict, {
              "plaintext_start_byte":
                  annotation["minimal_answer"]["plaintext_start_byte"],
              "plaintext_end_byte":
                  annotation["minimal_answer"]["plaintext_end_byte"],
          }).text
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    if expected_answer_text != answer["span_text"]:
      error_message = ("Extracted answer did not match expected answer: "
                       "'{}' vs '{}'".format(expected_answer_text,
                                             answer["span_text"]))
      if fail_on_invalid:
        raise ValueError(error_message)
      else:
        logging.warn(error_message)
        return {}
  # Add a passage answer if one was found.
  elif annotation and annotation["passage_answer"]["candidate_index"] >= 0:
    answer["input_text"] = "passage"
    answer["span_text"] = data.get_candidate_text(json_dict,
                                                  annotated_idx).text
    answer["span_start"] = 0
    answer["span_end"] = data.byte_len(answer["span_text"])

  context_idxs = []
  context_list = []
  for idx, _ in data.candidates_iter(json_dict):
    context = {
        "id": idx,
        "type": get_candidate_type_and_position(json_dict, idx),
    }
    # Get the list of all byte positions of the candidate and its plaintext.
    # Unpack the `TextSpan` tuple.
    context["text_map"], context["text"] = data.get_candidate_text(
        json_dict, idx)
    if not context["text"]:
      logging.error("ERROR: Found example with empty context %d.", idx)
      if fail_on_invalid:
        raise ValueError(
            "ERROR: Found example with empty context {}.".format(idx))
      return {}
    context_idxs.append(idx)
    context_list.append(context)
    if len(context_list) >= max_passages:
      break

  # Assemble the entry to be returned.
  entry = {
      "name": json_dict["document_title"],
      "id": str(json_dict["example_id"]),
      "language": json_dict["language"],
      "question": question,
      "answer": answer,
      "has_correct_context": annotated_idx in context_idxs,
  }

  all_contexts_with_tokens = []
  # `offset` is a byte offset relative to `contexts` (concatenated candidate
  # passages with special tokens added).
  offset = 0
  context_to_plaintext_offset = []
  for idx, context in zip(context_idxs, context_list):
    special_token = "[ContextId={}]".format(context["id"])
    all_contexts_with_tokens.append(special_token)
    context_to_plaintext_offset.append([-1] * data.byte_len(special_token))
    # Account for the special token and its trailing space (due to the join
    # operation below).
    offset += data.byte_len(special_token) + 1
    if context["id"] == annotated_idx:
      answer["span_start"] += offset
      answer["span_end"] += offset
    if context["text"]:
      all_contexts_with_tokens.append(context["text"])
      # Account for the text and its trailing space (due to the join
      # operation below).
      offset += data.byte_len(context["text"]) + 1
      context_to_plaintext_offset.append(context["text_map"])
    else:
      if fail_on_invalid:
        raise ValueError("Found example with empty context.")

  # When we join the contexts together with spaces below, we'll add an extra
  # byte to each one, so we have to account for these by adding a -1 (no
  # assigned wordpiece) index at each *boundary*.
  # It's easier to do this here than above since we don't want to accidentally
  # add extra indices after the last context.
  context_to_plaintext_offset = functools.reduce(
      lambda a, b: a + [-1] + b, context_to_plaintext_offset)

  entry["contexts"] = " ".join(all_contexts_with_tokens)
  entry["context_to_plaintext_offset"] = context_to_plaintext_offset
  entry["plaintext"] = json_dict["document_plaintext"]

  if annotated_idx in context_idxs:
    try:
      expected = data.byte_slice(entry["contexts"], answer["span_start"],
                                 answer["span_end"])
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    # Sanity check: the computed start and end indices must point at the
    # reported span text. If this check fails, it is likely a bug in the data
    # preparation code above.
    if expected != answer["span_text"]:
      logging.warn("*** pruned example id: %d ***", json_dict["example_id"])
      logging.warn("*** %s, %s ***", expected, answer["span_text"])
      return {}
  return entry
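
# Illustrative only, not part of the pipeline: a self-contained sketch of the
# offset bookkeeping performed in `create_entry_from_json` above. It assumes
# ASCII passages (so byte and character offsets coincide) and fakes the
# per-passage `text_map` that `data.get_candidate_text` normally provides
# (one plaintext byte offset per byte of the candidate text). The helper name
# and arguments are made up for this example.
def _sketch_context_offsets(passages, plaintext_starts):
  pieces = []
  offset_maps = []
  for idx, (text, start) in enumerate(zip(passages, plaintext_starts)):
    token = "[ContextId={}]".format(idx)
    pieces.append(token)
    # Special-token bytes map to no plaintext byte.
    offset_maps.append([-1] * len(token))
    pieces.append(text)
    # Each context byte maps back to its plaintext byte.
    offset_maps.append(list(range(start, start + len(text))))
  contexts = " ".join(pieces)
  # The join inserts one space byte at each boundary; mark it unmapped (-1),
  # exactly as the functools.reduce in the real code does.
  context_to_plaintext_offset = functools.reduce(
      lambda a, b: a + [-1] + b, offset_maps)
  assert len(context_to_plaintext_offset) == len(contexts)
  return contexts, context_to_plaintext_offset

# With a plaintext of "Cats purr. Dogs bark.",
# _sketch_context_offsets(["Cats purr.", "Dogs bark."], [0, 11]) yields
#   contexts = "[ContextId=0] Cats purr. [ContextId=1] Dogs bark."
# with -1 entries for the special tokens and the joining spaces, and
# plaintext offsets 0..9 and 11..20 for the two passages' bytes.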

def create_entry_from_json(json_obj, max_passages=45, max_position=45):
  """Builds a simplified TyDi 'entry' dict from one raw JSON example."""
  entry = {
      'document_title': json_obj['document_title'],
      'id': json_obj['example_id'],
      'language': json_obj['language'],
      'question': json_obj['question_text'],
  }

  annotation, candidate_idx, annotated_start_end = data.get_first_annotation(
      json_obj, max_passages)
  answer = {
      'candidate_id': candidate_idx,
      'type': 'passage',
      'span': '',
      'start': -1,
      'end': -1,
  }

  # If the example was annotated:
  if annotation is not None:
    # Yes/no answers are recorded in the answer type.
    if annotation['yes_no_answer'] != 'NONE':
      answer['type'] = annotation['yes_no_answer'].lower()

    # A minimal answer span was found.
    if annotated_start_end != (-1, -1):
      answer['type'] = 'minimal'
      start = annotated_start_end[0]
      end = annotated_start_end[1]
      text = data.get_candidate_text(json_obj, candidate_idx).text
      answer['span'] = data.byte_slice(text, start, end)
      answer['start'] = start
      answer['end'] = end
    # Otherwise a passage was selected as the answer.
    elif annotation['passage_answer']['candidate_index'] >= 0:
      answer['span'] = data.get_candidate_text(json_obj, candidate_idx).text
      answer['start'] = 0
      answer['end'] = data.byte_len(answer['span'])
  entry['answer'] = answer

  paragraph_idx = []
  paragraph_context = []
  # Add candidate paragraph types and positions:
  # ct = 0
  # for _, candidate in data.candidates_iter(json_obj):
  #   if ct < max_position:
  #     ct += 1
  #   candidate["type_and_position"] = "[Paragraph=%d]" % ct
  for idx, _ in data.candidates_iter(json_obj):
    # Get the list of all byte positions of the candidate and its plaintext.
    # `res` is a `TextSpan` tuple: (byte offsets, text).
    res = data.get_candidate_text(json_obj, idx)
    context = {
        "id": idx,
        # "type": "[NoLongAnswer]" if idx == -1 else
        #     json_obj["passage_answer_candidates"][idx]["type_and_position"],
        "text_range": res[0],
        "text": res[1],
    }
    paragraph_idx.append(idx)
    paragraph_context.append(context)
    if len(paragraph_idx) >= max_passages:
      break
  # entry['has_correct_context'] = candidate_idx in paragraph_idx

  all_contexts_with_tokens = []
  # `offset` is a byte offset relative to `contexts` (the concatenated
  # candidate passages with special tokens added).
  offset = 0
  context_to_plaintext_offset = []
  for idx, context in zip(paragraph_idx, paragraph_context):
    special_token = "[ContextId={}]".format(idx)
    all_contexts_with_tokens.append(special_token)
    context_to_plaintext_offset.append([-1] * data.byte_len(special_token))
    # Account for the special token and its trailing space (due to the join
    # operation below).
    offset += data.byte_len(special_token) + 1
    if context["id"] == candidate_idx:
      answer["start"] += offset
      answer["end"] += offset
    if context["text"]:
      all_contexts_with_tokens.append(context["text"])
      # Account for the text and its trailing space (due to the join
      # operation below).
      offset += data.byte_len(context["text"]) + 1
      context_to_plaintext_offset.append(context["text_range"])

  # When we join the contexts together with spaces below, we'll add an extra
  # byte to each one, so we have to account for these by adding a -1 (no
  # assigned wordpiece) index at each *boundary*. It's easier to do this here
  # than above since we don't want to accidentally add extra indices after the
  # last context.
  context_to_plaintext_offset = functools.reduce(
      lambda a, b: a + [-1] + b, context_to_plaintext_offset)

  entry["contexts"] = " ".join(all_contexts_with_tokens)
  entry["context_to_plaintext_offset"] = context_to_plaintext_offset
  entry["plaintext"] = json_obj["document_plaintext"]
  return entry
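
# Illustrative only, not part of the pipeline: the `data` module is not shown
# in this file. The sketches below capture the behavior the code above relies
# on from its byte helpers, namely that all span offsets are UTF-8 *byte*
# offsets rather than character offsets. This is an assumption about the
# helpers' behavior, not a copy of their implementation; the `_sketch_*`
# names are made up for this example.
def _sketch_byte_len(text):
  return len(text.encode("utf-8"))

def _sketch_byte_slice(text, start, end):
  # Slice on byte boundaries, then decode back to a string. Multi-byte
  # characters are why byte offsets can differ from character offsets.
  return text.encode("utf-8")[start:end].decode("utf-8", errors="replace")

# Example: "né" is three UTF-8 bytes ("n" plus the two-byte "é"), so
# _sketch_byte_len("né") == 3 and _sketch_byte_slice("né", 1, 3) == "é".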