def test_byte_slice(self):
  # 작 -- 3 UTF-8 bytes
  s = "[Q] 작가는 만화를 그리기 시작했나요?"
  q = data.byte_slice(s, 0, 3)
  self.assertEqual(q, "[Q]")
  one_char = data.byte_slice(s, 4, 7)
  self.assertEqual(one_char, "작")
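# The test above exercises `data.byte_len` / `data.byte_slice`, which index
# strings by UTF-8 *byte* offsets rather than by code points. The helpers
# below are a minimal sketch of that behavior for illustration only; they are
# an assumption about the interface, not the actual `data` module
# implementation (the real `byte_slice` also accepts an `errors=` argument
# used elsewhere in this file).


def sketch_byte_len(text):
  """Returns the length of `text` in UTF-8 bytes."""
  return len(text.encode("utf-8"))


def sketch_byte_slice(text, begin, end, errors="strict"):
  """Slices `text` by UTF-8 byte offsets [begin, end) and decodes back."""
  return text.encode("utf-8")[begin:end].decode("utf-8", errors=errors)


# "[Q]" spans bytes 0..3 and "작" occupies the 3 bytes at offsets 4..7, which
# is exactly what the assertions above check.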
def log_debug_info(filename, line_no, entry, debug_info, reverse_vocab_table):
  """Logs `debug_info` for debugging purposes."""

  # Enable when debugging experimental new things.
  extremely_verbose = False

  def sanitize_char(c):
    """Optionally normalize chars we don't want in log messages."""
    # Don't like having too many newlines in your debugging log output?
    # Change this.
    remove_newlines = False
    if c == "\r":
      if remove_newlines:
        return " "
      return "\r"
    if c == "\n":
      if remove_newlines:
        return " "
      return "\n"
    return c

  def sanitize(s):
    return "".join(sanitize_char(c) for c in s)

  doc = entry["plaintext"]

  if "json" in debug_info:
    json_elem = debug_info["json"]
  else:
    json_elem = None
    logging.info("No 'json' key in `debug_info`.")

  if "tydi_example" in debug_info:
    tydi_example = debug_info["tydi_example"]
  else:
    tydi_example = None
    logging.info("No 'tydi_example' key in `debug_info`.")

  offset_to_wp = None
  doc_wp = None
  logging.info("=== Logging example %s:%d ===", filename, line_no)

  window = 20
  for i in range(0, data.byte_len(entry["contexts"]), window):
    span_text = data.byte_slice(
        entry["contexts"], i, i + window, errors="replace")
    doc_offsets = entry["context_to_plaintext_offset"][i:i + window]
    # Now double-check that those doc offsets actually match the text we
    # expect.
    recovered_doc = [
        data.byte_slice(doc, i, i + 1, errors="replace")
        for i in doc_offsets
        if i != -1
    ]
    if extremely_verbose:
      logging.info("context_to_doc: %d: %s (%s) %s", i, sanitize(span_text),
                   " ".join(str(x) for x in doc_offsets),
                   sanitize(recovered_doc))

  for key, value in debug_info.items():
    if key == "offset_to_wp":
      offset_to_wp = value
      continue
    # Convert wordpiece vocab IDs back into readable text.
    if is_int_list(value) and "wp_ids" in key:
      value = [reverse_vocab_table[word_id] for word_id in value]
    # Convert Unicode escapes to readable text.
    if is_unicode_list(value):
      value = [word.encode("utf-8") for word in value]
    if key == "all_doc_wp_ids":
      doc_wp = value
    # Represent lists as plaintext.
    if isinstance(value, list):
      value = " ".join(str(item) for item in value)
    value = str(value)
    logging.info("%s: %s", key, value)

  if offset_to_wp is not None:
    for i in range(0, data.byte_len(entry["contexts"]), window):
      wp_slice = []
      for byte_offset in range(i, i + window):
        if byte_offset in offset_to_wp:
          wp_offset = offset_to_wp[byte_offset]
          wp_slice.append(doc_wp[wp_offset])
        else:
          wp_slice.append("-1")
      context_slice = data.byte_slice(
          entry["contexts"], i, i + window, errors="replace")
      logging.info("context_to_wp: %d: %s (%s)", i, sanitize(context_slice),
                   " ".join(str(x) for x in wp_slice))

  if "searched_offset_to_wp" in debug_info:
    logging.info(
        "searched_offset_to_wp: %s",
        " ".join(str(i) for i in debug_info["searched_offset_to_wp"]))

  if json_elem:
    logging.info(
        "json.annotations[0].minimal_answer.plaintext_start_byte: %d",
        json_elem["annotations"][0]["minimal_answer"]["plaintext_start_byte"])
    logging.info(
        "json.annotations[0].minimal_answer.plaintext_end_byte: %d",
        json_elem["annotations"][0]["minimal_answer"]["plaintext_end_byte"])
    min_ans_sp = json_elem["annotations"][0]["minimal_answer"]
    min_ans_text = data.byte_slice(
        json_elem["document_plaintext"],
        min_ans_sp["plaintext_start_byte"],
        min_ans_sp["plaintext_end_byte"],
        errors="replace")
    min_ans_text_in_context = data.byte_slice(
        json_elem["document_plaintext"],
        min_ans_sp["plaintext_start_byte"] - 100,
        min_ans_sp["plaintext_end_byte"] + 100,
        errors="replace")
    logging.info("minimal answer text (from json): %s", min_ans_text)
    logging.info("minimal answer text in context: %s", min_ans_text_in_context)

  logging.info("entry.answer.span_start: %d", entry["answer"]["span_start"])
  logging.info("entry.answer.span_end: %d", entry["answer"]["span_end"])
  logging.info("entry.answer.span_text: %s", entry["answer"]["span_text"])

  if tydi_example:
    # Non-train examples may not have offsets.
    if tydi_example.start_byte_offset:
      logging.info("tydi_example.start_byte_offset: %d",
                   tydi_example.start_byte_offset)
      logging.info("tydi_example.end_byte_offset: %d",
                   tydi_example.end_byte_offset)
      tydi_example_min_ans_text = data.byte_slice(
          entry["contexts"],
          tydi_example.start_byte_offset,
          tydi_example.end_byte_offset,
          errors="replace")
      logging.info(
          "minimal answer text (from TyDiExample byte offsets in `contexts`): %s",
          tydi_example_min_ans_text)
  logging.info("^^^ End example ^^^")
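# `log_debug_info` only logs the keys it finds, so callers can pass a sparse
# dict. The value below is a hypothetical illustration of a `debug_info`
# payload, inferred solely from the keys referenced above; it is shown for
# structure only (the concrete values are made up, and real callers may
# populate additional fields).
_example_debug_info = {
    # Raw deserialized JSONL line and converted TyDiExample, when available.
    "json": None,
    "tydi_example": None,
    # Byte offset within `contexts` -> index into the wordpiece sequence.
    "offset_to_wp": {0: 0, 1: 0, 2: 1},
    # Wordpiece vocab IDs; any key containing "wp_ids" is mapped back to
    # readable text via `reverse_vocab_table` before logging.
    "all_doc_wp_ids": [101, 7592, 102],
    # Byte offsets that were probed while searching for the answer span.
    "searched_offset_to_wp": [0, 2],
}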
def create_entry_from_json(json_dict, max_passages, max_position,
                           fail_on_invalid):
  """Creates a TyDi 'entry' from the raw JSON.

  The 'TyDiEntry' dict is an intermediate format that is later converted into
  the main `TyDiExample` format.

  This function looks up the chunks of text that are candidates for the
  passage answer task, inserts special context tokens such as "[ContextId=0]",
  and creates a byte index to byte index mapping between the document
  plaintext and the concatenation of the passage candidates (these could
  potentially exclude parts of the plaintext document and also include the
  special tokens).

  In the returned entry, `contexts` includes only the candidate passages and
  has special tokens such as [ContextId=0] added. `span_start` and `span_end`
  are byte-wise indices into `contexts` (not the original corpus plaintext).

  Args:
    json_dict: A single JSONL line, deserialized into a dict.
    max_passages: see FLAGS.max_passages.
    max_position: see FLAGS.max_position.
    fail_on_invalid: Immediately stop if an error is found?

  Returns:
    If a failure was encountered and `fail_on_invalid=False`, then returns
    an empty `dict`. Otherwise returns:
    'TyDiEntry' type: a dict-based format consumed by downstream functions:
    entry = {
        "name": str,
        "id": str,
        "language": str,
        "question": {"input_text": str},
        "answer": {
            "candidate_id": annotated_idx,
            "span_text": "",
            "span_start": -1,
            "span_end": -1,
            "input_text": "passage",
        }
        "has_correct_context": bool,
        # Includes special tokens appended.
        "contexts": str,
        # Context index to byte offset in `contexts`.
        "context_to_plaintext_offset": Dict[int, int],
        "plaintext": str,  # json_dict["document_plaintext"]
    }
  """
  add_candidate_types_and_positions(json_dict, max_position)
  for passage_answer in json_dict["passage_answer_candidates"]:
    if (passage_answer["plaintext_start_byte"] == -1 or
        passage_answer["plaintext_end_byte"] == -1):
      return {}

  # annotated_idx: index of the first annotated context, -1 if null.
  # annotated_min_ans: minimal answer start and end char offsets,
  #   (-1, -1) if null.
  annotation, annotated_idx, annotated_min_ans = data.get_first_annotation(
      json_dict, max_passages)
  question = {"input_text": json_dict["question_text"]}
  answer = {
      "candidate_id": annotated_idx,
      "span_text": "",
      "span_start": -1,
      "span_end": -1,
      "input_text": "passage",
  }

  # Yes/no answers are added in the input text.
  if annotation is not None:
    assert annotation["yes_no_answer"] in ("YES", "NO", "NONE")
    if annotation["yes_no_answer"] in ("YES", "NO"):
      answer["input_text"] = annotation["yes_no_answer"].lower()

  # Add a minimal answer if one was found.
  if annotated_min_ans != (-1, -1):
    answer["input_text"] = "minimal"
    span_text = data.get_candidate_text(json_dict, annotated_idx).text
    try:
      answer["span_text"] = data.byte_slice(span_text, annotated_min_ans[0],
                                            annotated_min_ans[1])
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    # Local (passage) byte offsets.
    answer["span_start"] = annotated_min_ans[0]
    answer["span_end"] = annotated_min_ans[1]
    try:
      expected_answer_text = data.get_text_span(
          json_dict, {
              "plaintext_start_byte":
                  annotation["minimal_answer"]["plaintext_start_byte"],
              "plaintext_end_byte":
                  annotation["minimal_answer"]["plaintext_end_byte"],
          }).text
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    if expected_answer_text != answer["span_text"]:
      error_message = ("Extracted answer did not match expected answer: "
                       "'{}' vs '{}'".format(expected_answer_text,
                                             answer["span_text"]))
      if fail_on_invalid:
        raise ValueError(error_message)
      else:
        logging.warn(error_message)
        return {}
  # Add a passage answer if one was found.
  elif annotation and annotation["passage_answer"]["candidate_index"] >= 0:
    answer["input_text"] = "passage"
    answer["span_text"] = data.get_candidate_text(json_dict, annotated_idx).text
    answer["span_start"] = 0
    answer["span_end"] = data.byte_len(answer["span_text"])

  context_idxs = []
  context_list = []
  for idx, _ in data.candidates_iter(json_dict):
    context = {
        "id": idx,
        "type": get_candidate_type_and_position(json_dict, idx)
    }
    # Get list of all byte positions of the candidate and its plaintext.
    # Unpack `TextSpan` tuple.
    context["text_map"], context["text"] = data.get_candidate_text(
        json_dict, idx)
    if not context["text"]:
      logging.error("ERROR: Found example with empty context %d.", idx)
      if fail_on_invalid:
        raise ValueError(
            "ERROR: Found example with empty context {}.".format(idx))
      return {}
    context_idxs.append(idx)
    context_list.append(context)
    if len(context_list) >= max_passages:
      break

  # Assemble the entry to be returned.
  entry = {
      "name": json_dict["document_title"],
      "id": str(json_dict["example_id"]),
      "language": json_dict["language"],
      "question": question,
      "answer": answer,
      "has_correct_context": annotated_idx in context_idxs
  }

  all_contexts_with_tokens = []
  # `offset` is a byte offset relative to `contexts` (concatenated candidate
  # passages with special tokens added).
  offset = 0
  context_to_plaintext_offset = []
  for idx, context in zip(context_idxs, context_list):
    special_token = "[ContextId={}]".format(context["id"])
    all_contexts_with_tokens.append(special_token)
    context_to_plaintext_offset.append([-1] * data.byte_len(special_token))
    # Account for the special token and its trailing space (due to the join
    # operation below).
    offset += data.byte_len(special_token) + 1

    if context["id"] == annotated_idx:
      answer["span_start"] += offset
      answer["span_end"] += offset
    if context["text"]:
      all_contexts_with_tokens.append(context["text"])
      # Account for the text and its trailing space (due to the join
      # operation below).
      offset += data.byte_len(context["text"]) + 1
      context_to_plaintext_offset.append(context["text_map"])
    else:
      if fail_on_invalid:
        raise ValueError("Found example with empty context.")

  # When we join the contexts together with spaces below, we'll add an extra
  # byte to each one, so we have to account for these by adding a -1 (no
  # assigned wordpiece) index at each *boundary*. It's easier to do this here
  # than above since we don't want to accidentally add extra indices after the
  # last context.
  context_to_plaintext_offset = functools.reduce(lambda a, b: a + [-1] + b,
                                                 context_to_plaintext_offset)

  entry["contexts"] = " ".join(all_contexts_with_tokens)
  entry["context_to_plaintext_offset"] = context_to_plaintext_offset
  entry["plaintext"] = json_dict["document_plaintext"]

  if annotated_idx in context_idxs:
    try:
      expected = data.byte_slice(entry["contexts"], answer["span_start"],
                                 answer["span_end"])
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    # This is a sanity check to ensure that the calculated start and end
    # indices match the reported span text. If this check fails, it is likely
    # a bug in the data preparation code above.
    if expected != answer["span_text"]:
      logging.warn("*** pruned example id: %d ***", json_dict["example_id"])
      logging.warn("*** %s, %s ***", expected, answer["span_text"])
      return {}
  return entry
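# A small, self-contained illustration (not part of the pipeline) of the
# boundary handling above: `functools.reduce` joins the per-context offset
# lists with a single -1 standing in for the space that " ".join() inserts
# between contexts, so byte i of `contexts` maps to plaintext byte
# context_to_plaintext_offset[i], with -1 meaning "no plaintext byte"
# (special tokens and joining spaces). The toy lengths below are arbitrary.
import functools

_per_context_maps = [
    [-1, -1, -1],  # bytes of a "[ContextId=0]"-style special token (toy length)
    [10, 11, 12],  # plaintext byte offsets of the first candidate passage
    [40, 41],      # plaintext byte offsets of the second candidate passage
]
_joined = functools.reduce(lambda a, b: a + [-1] + b, _per_context_maps)
assert _joined == [-1, -1, -1, -1, 10, 11, 12, -1, 40, 41]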
def test_offset_wp_mapping(self):
  """Test the mapping from wordpiece to plaintext offsets."""
  testdata = os.path.join(
      FLAGS.test_srcdir, ".//"
      "small_gold_annotation.jsonl")
  vocab_file = self._get_vocab_file()
  examples = preproc.read_tydi_examples(
      testdata,
      is_training=False,
      max_passages=45,
      max_position=45,
      fail_on_invalid=False,
      open_fn=tf_io.gopen)
  tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
  for tydi_example in examples:
    wordpieces, start_offsets, end_offsets, offset_to_wp = (
        tokenizer.tokenize_with_offsets(tydi_example.contexts))

    # Check invariants.
    for i in start_offsets:
      if i > 0:
        self.assertLess(
            i, len(tydi_example.context_to_plaintext_offset),
            "Expected start offset {} to be in `context_to_plaintext_offset` "
            "byte_len(contexts)={} Context@{}='{}' Have={}".format(
                i, data.byte_len(tydi_example.contexts), i,
                data.byte_slice(
                    tydi_example.contexts, i, i + 100,
                    errors="ignore").encode("utf8"),
                tydi_example.context_to_plaintext_offset))
    for i in end_offsets:
      if i > 0:
        self.assertLess(
            i, len(tydi_example.context_to_plaintext_offset),
            "Expected end offset {} to be in `context_to_plaintext_offset` "
            "byte_len(contexts)={} Have={}".format(
                i, data.byte_len(tydi_example.contexts),
                tydi_example.context_to_plaintext_offset))

    wp_start_offsets, wp_end_offsets = (
        preproc.create_mapping(start_offsets, end_offsets,
                               tydi_example.context_to_plaintext_offset))

    wp_count = 0
    for wp_s, wp_e in zip(wp_start_offsets, wp_end_offsets):
      if wp_s >= 0 or wp_e >= 0 and wp_count < 20:
        wp_txt = wordpieces[wp_count]
        if isinstance(wp_txt, str):
          if "##" not in wp_txt and wp_txt != "[UNK]":
            self.assertEqual(tydi_example.plaintext[wp_s:wp_e + 1], wp_txt)
      wp_count += 1

    for offset in offset_to_wp:
      self.assertLess(offset, data.byte_len(tydi_example.contexts))
      self.assertGreaterEqual(offset, 0)
      matching_wp = offset_to_wp[offset]
      if matching_wp == -1:
        continue
      if wp_end_offsets[matching_wp] == -1:
        continue
      if wp_start_offsets[matching_wp] == -1:
        continue
      self.assertGreaterEqual(wp_end_offsets[matching_wp],
                              wp_start_offsets[matching_wp])
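# The invariants above only make sense if `create_mapping` translates each
# wordpiece's byte offsets in `contexts` into plaintext byte offsets via
# `context_to_plaintext_offset`. The function below is a hedged sketch of
# that translation, written for illustration under the assumption of a
# straight table lookup with -1 meaning "no corresponding plaintext byte";
# it is not the actual `preproc.create_mapping`.


def sketch_create_mapping(start_offsets, end_offsets,
                          context_to_plaintext_offset):
  """Maps wordpiece offsets in `contexts` to plaintext offsets (sketch)."""

  def lookup(i):
    if 0 <= i < len(context_to_plaintext_offset):
      return context_to_plaintext_offset[i]
    return -1

  return ([lookup(i) for i in start_offsets],
          [lookup(i) for i in end_offsets])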
def create_entry_from_json(json_obj, max_passages=45, max_position=45):
  entry = {
      'document_title': json_obj['document_title'],
      'id': json_obj['example_id'],
      'language': json_obj['language'],
      'question': json_obj['question_text']
  }
  annotation, candidate_idx, annotated_start_end = data.get_first_annotation(
      json_obj, max_passages)
  answer = {
      'candidate_id': candidate_idx,
      'type': 'passage',
      'span': '',
      'start': -1,
      'end': -1
  }

  # If the example is annotated:
  if annotation is not None:
    # Yes/no answers are recorded in `type`.
    if annotation['yes_no_answer'] != 'NONE':
      answer['type'] = annotation['yes_no_answer'].lower()
    # A minimal answer span was found.
    if annotated_start_end != (-1, -1):
      answer['type'] = 'minimal'
      start = annotated_start_end[0]
      end = annotated_start_end[1]
      text = data.get_candidate_text(json_obj, candidate_idx).text
      answer['span'] = data.byte_slice(text, start, end)
      answer['start'] = start
      answer['end'] = end
    # Otherwise, only a passage was selected.
    elif annotation['passage_answer']['candidate_index'] >= 0:
      answer['span'] = data.get_candidate_text(json_obj, candidate_idx).text
      answer['start'] = 0
      answer['end'] = data.byte_len(answer['span'])
  entry['answer'] = answer

  paragraph_idx = []
  paragraph_context = []
  # add candidate paragraph types and positions
  # ct = 0
  # for _, candidate in data.candidates_iter(json_obj):
  #   if ct < max_position:
  #     ct += 1
  #     candidate["type_and_position"] = "[Paragraph=%d]" % ct
  #   else:
  #     break
  for idx, _ in data.candidates_iter(json_obj):
    # Get list of all byte positions of the candidate and its plaintext.
    # Unpack `TextSpan` tuple.
    res = data.get_candidate_text(json_obj, idx)
    context = {
        "id": idx,
        # "type": "[NoLongAnswer]" if idx == -1 else
        #     json_obj["passage_answer_candidates"][idx]["type_and_position"],
        "text_range": res[0],
        "text": res[1]
    }
    paragraph_idx.append(idx)
    paragraph_context.append(context)
    if len(paragraph_idx) >= max_passages:
      break
  # entry['has_correct_context'] = candidate_idx in paragraph_idx

  all_contexts_with_tokens = []
  # A byte offset relative to `contexts` (concatenated candidate passages
  # with special tokens added).
  offset = 0
  context_to_plaintext_offset = []
  for idx, context in zip(paragraph_idx, paragraph_context):
    special_token = "[ContextId={}]".format(idx)
    all_contexts_with_tokens.append(special_token)
    context_to_plaintext_offset.append([-1] * data.byte_len(special_token))
    # Account for the special token and its trailing space (due to the join
    # operation below).
    offset += data.byte_len(special_token) + 1

    if context["id"] == candidate_idx:
      answer["start"] += offset
      answer["end"] += offset
    if context["text"]:
      all_contexts_with_tokens.append(context["text"])
      # Account for the text and its trailing space (due to the join
      # operation below).
      offset += data.byte_len(context["text"]) + 1
      context_to_plaintext_offset.append(context["text_range"])

  # When we join the contexts together with spaces below, we'll add an extra
  # byte to each one, so we have to account for these by adding a -1 (no
  # assigned wordpiece) index at each *boundary*. It's easier to do this here
  # than above since we don't want to accidentally add extra indices after the
  # last context.
  context_to_plaintext_offset = functools.reduce(
      lambda a, b: a + [-1] + b, context_to_plaintext_offset)

  entry["contexts"] = " ".join(all_contexts_with_tokens)
  entry["context_to_plaintext_offset"] = context_to_plaintext_offset
  entry["plaintext"] = json_obj["document_plaintext"]

  return entry
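# A hedged usage sketch for the simplified `create_entry_from_json` above:
# read one JSONL line, build the entry, and inspect the concatenated
# `contexts` string plus its byte-level mapping back to the plaintext. The
# file name "dev_sample.jsonl" is a hypothetical placeholder; the real
# pipeline reads (possibly gzipped) JSONL shards through its own I/O helpers.
import json

if __name__ == "__main__":
  with open("dev_sample.jsonl", "r", encoding="utf-8") as f:
    json_obj = json.loads(f.readline())
  entry = create_entry_from_json(json_obj)
  print(entry["question"])
  print(entry["contexts"][:200])
  # If `text_range` holds one plaintext offset per byte of candidate text,
  # the mapping covers every byte of `contexts`; -1 marks special tokens
  # such as "[ContextId=0]" and the joining spaces.
  print(len(entry["context_to_plaintext_offset"]),
        len(entry["contexts"].encode("utf-8")))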