Example #1
  def test_byte_slice(self):
    # The character 작 occupies 3 UTF-8 bytes (bytes 4-6 of `s`).
    s = "[Q] 작가는 만화를 그리기 시작했나요?"
    q = data.byte_slice(s, 0, 3)
    self.assertEqual(q, "[Q]")

    one_char = data.byte_slice(s, 4, 7)
    self.assertEqual(one_char, "작")
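
The test above exercises `data.byte_slice`, which slices a string by UTF-8 byte offsets rather than character offsets. As a rough orientation only (not the project's actual implementation), a minimal sketch of the behavior the test assumes, including the `errors` argument used in the later examples, could look like this:

def byte_slice(text, start, end, errors="strict"):
    # Hypothetical sketch: slice `text` by UTF-8 byte offsets [start, end).
    return text.encode("utf-8")[start:end].decode("utf-8", errors=errors)

def byte_len(text):
    # Hypothetical sketch: length of `text` in UTF-8 bytes.
    return len(text.encode("utf-8"))
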
Example #2
def log_debug_info(filename, line_no, entry, debug_info, reverse_vocab_table):
    """Logs `debug_info` for debugging purposes."""

    # Enable when debugging experimental new things.
    extremely_verbose = False

    def sanitize_char(c):
        """Optionally normalize chars we don't want in log messages."""
        # Don't like having too many newlines in your debugging log output?
        # Change this.
        remove_newlines = False
        if c == "\r":
            if remove_newlines:
                return " "
            return "\r"
        if c == "\n":
            if remove_newlines:
                return " "
            return "\n"
        return c

    def sanitize(s):
        return "".join(sanitize_char(c) for c in s)

    doc = entry["plaintext"]

    if "json" in debug_info:
        json_elem = debug_info["json"]
    else:
        json_elem = None
        logging.info("No 'json' key in `debug_info`.")

    if "tydi_example" in debug_info:
        tydi_example = debug_info["tydi_example"]
    else:
        tydi_example = None
        logging.info("No 'tydi_example' key in `debug_info`.")

    offset_to_wp = None
    doc_wp = None
    logging.info("=== Logging example %s:%d ===", filename, line_no)

    window = 20
    for i in range(0, data.byte_len(entry["contexts"]), window):
        span_text = data.byte_slice(entry["contexts"],
                                    i,
                                    i + window,
                                    errors="replace")
        doc_offsets = entry["context_to_plaintext_offset"][i:i + window]
        # Now double-check that those doc offsets actually match the text we expect.
        recovered_doc = [
            data.byte_slice(doc, i, i + 1, errors="replace")
            for i in doc_offsets if i != -1
        ]
        if extremely_verbose:
            logging.info("context_to_doc: %d: %s (%s) %s", i,
                         sanitize(span_text),
                         " ".join(str(x) for x in doc_offsets),
                         sanitize(recovered_doc))

    for key, value in debug_info.items():
        if key == "offset_to_wp":
            offset_to_wp = value
            continue
        # Convert wordpiece vocab IDs back into readable text.
        if is_int_list(value) and "wp_ids" in key:
            value = [reverse_vocab_table[word_id] for word_id in value]
        # Convert Unicode escapes to readable text.
        if is_unicode_list(value):
            value = [word.encode("utf-8") for word in value]

        if key == "all_doc_wp_ids":
            doc_wp = value

        # Represent lists as plaintext.
        if isinstance(value, list):
            value = " ".join(str(item) for item in value)
        value = str(value)
        logging.info("%s: %s", key, value)

    if offset_to_wp is not None:
        for i in range(0, data.byte_len(entry["contexts"]), window):
            wp_slice = []
            for byte_offset in range(i, i + window):
                if byte_offset in offset_to_wp:
                    wp_offset = offset_to_wp[byte_offset]
                    wp_slice.append(doc_wp[wp_offset])
                else:
                    wp_slice.append("-1")
            context_slice = data.byte_slice(entry["contexts"],
                                            i,
                                            i + window,
                                            errors="replace")
            logging.info("context_to_wp: %d: %s (%s)", i,
                         sanitize(context_slice),
                         " ".join(str(x) for x in wp_slice))

    if "searched_offset_to_wp" in debug_info:
        logging.info(
            "searched_offset_to_wp: %s",
            " ".join(str(i) for i in debug_info["searched_offset_to_wp"]))

    if json_elem:
        logging.info(
            "json.annotations[0].minimal_answer.plaintext_start_byte: %d",
            json_elem["annotations"][0]["minimal_answer"]
            ["plaintext_start_byte"])
        logging.info(
            "json.annotations[0].minimal_answer.plaintext_end_byte: %d",
            json_elem["annotations"][0]["minimal_answer"]
            ["plaintext_end_byte"])
        min_ans_sp = json_elem["annotations"][0]["minimal_answer"]
        min_ans_text = data.byte_slice(json_elem["document_plaintext"],
                                       min_ans_sp["plaintext_start_byte"],
                                       min_ans_sp["plaintext_end_byte"],
                                       errors="replace")
        min_ans_text_in_context = data.byte_slice(
            json_elem["document_plaintext"],
            min_ans_sp["plaintext_start_byte"] - 100,
            min_ans_sp["plaintext_end_byte"] + 100,
            errors="replace")
    logging.info("minimal answer text (from json): %s", min_ans_text)
    logging.info("minimal answer text in context: %s", min_ans_text_in_context)

    logging.info("entry.answer.span_start: %d", entry["answer"]["span_start"])
    logging.info("entry.answer.span_end: %d", entry["answer"]["span_end"])
    logging.info("entry.answer.span_text: %s", entry["answer"]["span_text"])
    if tydi_example:
        # Non-train examples may not have offsets.
        if tydi_example.start_byte_offset:
            logging.info("tydi_example.start_byte_offset: %d",
                         tydi_example.start_byte_offset)
            logging.info("tydi_example.end_byte_offset: %d",
                         tydi_example.end_byte_offset)
            tydi_example_min_ans_text = data.byte_slice(
                entry["contexts"],
                tydi_example.start_byte_offset,
                tydi_example.end_byte_offset,
                errors="replace")
            logging.info(
                "minimal answer text (from TyDiExample byte offsets in `contexts`): %s",
                tydi_example_min_ans_text)
    logging.info("^^^ End example ^^^")
Example #3
def create_entry_from_json(json_dict, max_passages, max_position,
                           fail_on_invalid):
  """Creates an TyDi 'entry' from the raw JSON.

  The 'TyDiEntry' dict is an intermediate format that is later converted into
  the main `TyDiExample` format.

  This function looks up the chunks of text that are candidates for the passage
  answer task, inserts special context tokens such as "[ContextId=0]", and
  creates a byte index to byte index mapping between the document plaintext
  and the concatenation of the passage candidates (these could potentially
  exclude parts of the plaintext document and also include the special tokens).

  In the returned entry, `contexts` includes only the candidate passages and
  has special tokens such as [ContextId=0] added. `span_start` and `span_end`
  are byte-wise indices into `contexts` (not the original corpus plaintext).

  Args:
    json_dict: A single JSONL line, deserialized into a dict.
    max_passages: see FLAGS.max_passages.
    max_position: see FLAGS.max_position.
    fail_on_invalid: Immediately stop if an error is found?

  Returns:
    If a failure was encountered and `fail_on_invalid=False`, then returns
    an empty `dict`. Otherwise returns:
    'TyDiEntry' type: a dict-based format consumed by downstream functions:
    entry = {
        "name": str,
        "id": str,
        "language": str,
        "question": {"input_text": str},
        "answer": {
          "candidate_id": annotated_idx,
          "span_text": "",
          "span_start": -1,
          "span_end": -1,
          "input_text": "passage",
        },
        "has_correct_context": bool,
        # Candidate passages concatenated, with special tokens appended.
        "contexts": str,
        # Maps each byte offset in `contexts` to a byte offset in the document
        # plaintext, or -1 for bytes belonging to special tokens or boundaries.
        "context_to_plaintext_offset": List[int],
        # Equal to json_dict["document_plaintext"].
        "plaintext": str,
    }
  """

  add_candidate_types_and_positions(json_dict, max_position)
  for passage_answer in json_dict["passage_answer_candidates"]:
    if (passage_answer["plaintext_start_byte"] == -1 or
        passage_answer["plaintext_end_byte"] == -1):
      return {}

  # annotated_idx: index of the first annotated context, -1 if null.
  # annotated_min_ans: minimal answer start and end char offsets,
  #                    (-1, -1) if null.
  annotation, annotated_idx, annotated_min_ans = data.get_first_annotation(
      json_dict, max_passages)
  question = {"input_text": json_dict["question_text"]}
  answer = {
      "candidate_id": annotated_idx,
      "span_text": "",
      "span_start": -1,
      "span_end": -1,
      "input_text": "passage",
  }

  # Yes/no answers are added in the input text.
  if annotation is not None:
    assert annotation["yes_no_answer"] in ("YES", "NO", "NONE")
    if annotation["yes_no_answer"] in ("YES", "NO"):
      answer["input_text"] = annotation["yes_no_answer"].lower()

  # Add a minimal answer if one was found.
  if annotated_min_ans != (-1, -1):
    answer["input_text"] = "minimal"
    span_text = data.get_candidate_text(json_dict, annotated_idx).text

    try:
      answer["span_text"] = data.byte_slice(span_text, annotated_min_ans[0],
                                            annotated_min_ans[1])
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    # local (passage) byte offset
    answer["span_start"] = annotated_min_ans[0]
    answer["span_end"] = annotated_min_ans[1]
    try:
      expected_answer_text = data.get_text_span(
          json_dict, {
              "plaintext_start_byte":
                  annotation["minimal_answer"]["plaintext_start_byte"],
              "plaintext_end_byte":
                  annotation["minimal_answer"]["plaintext_end_byte"],
          }).text
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    if expected_answer_text != answer["span_text"]:
      error_message = ("Extracted answer did not match expected answer:"
                       "'{}' vs '{}'".format(expected_answer_text,
                                             answer["span_text"]))
      if fail_on_invalid:
        raise ValueError(error_message)
      else:
        logging.warning(error_message)
        return {}

  # Add a passage answer if one was found
  elif annotation and annotation["passage_answer"]["candidate_index"] >= 0:
    answer["input_text"] = "passage"
    answer["span_text"] = data.get_candidate_text(json_dict, annotated_idx).text
    answer["span_start"] = 0
    answer["span_end"] = data.byte_len(answer["span_text"])

  context_idxs = []
  context_list = []
  for idx, _ in data.candidates_iter(json_dict):
    context = {
        "id": idx,
        "type": get_candidate_type_and_position(json_dict, idx)
    }
    # Get list of all byte positions of the candidate and its plaintext.
    # Unpack `TextSpan` tuple.
    context["text_map"], context["text"] = data.get_candidate_text(
        json_dict, idx)
    if not context["text"]:
      logging.error("ERROR: Found example with empty context %d.", idx)
      if fail_on_invalid:
        raise ValueError(
            "ERROR: Found example with empty context {}.".format(idx))
      return {}
    context_idxs.append(idx)
    context_list.append(context)
    if len(context_list) >= max_passages:
      break

  # Assemble the entry to be returned.
  entry = {
      "name": json_dict["document_title"],
      "id": str(json_dict["example_id"]),
      "language": json_dict["language"],
      "question": question,
      "answer": answer,
      "has_correct_context": annotated_idx in context_idxs
  }
  all_contexts_with_tokens = []
  # `offset` is a byte offset relative to `contexts` (concatenated candidate
  # passages with special tokens added).
  offset = 0
  context_to_plaintext_offset = []
  for idx, context in zip(context_idxs, context_list):
    special_token = "[ContextId={}]".format(context["id"])
    all_contexts_with_tokens.append(special_token)
    context_to_plaintext_offset.append([-1] * data.byte_len(special_token))
    # Account for the special token and its trailing space (due to the join
    # operation below)
    offset += data.byte_len(special_token) + 1

    if context["id"] == annotated_idx:
      answer["span_start"] += offset
      answer["span_end"] += offset
    if context["text"]:
      all_contexts_with_tokens.append(context["text"])
      # Account for the text and its trailing space (due to the join
      # operation below)
      offset += data.byte_len(context["text"]) + 1
      context_to_plaintext_offset.append(context["text_map"])
    else:
      if fail_on_invalid:
        raise ValueError("Found example with empty context.")

  # When we join the contexts together with spaces below, we'll add an extra
  # byte to each one, so we have to account for these by adding a -1 (no
  # assigned wordpiece) index at each *boundary*. It's easier to do this here
  # than above since we don't want to accidentally add extra indices after the
  # last context.
  context_to_plaintext_offset = functools.reduce(lambda a, b: a + [-1] + b,
                                                 context_to_plaintext_offset)

  entry["contexts"] = " ".join(all_contexts_with_tokens)
  entry["context_to_plaintext_offset"] = context_to_plaintext_offset
  entry["plaintext"] = json_dict["document_plaintext"]

  if annotated_idx in context_idxs:
    try:
      expected = data.byte_slice(entry["contexts"], answer["span_start"],
                                 answer["span_end"])
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    # Sanity check: the computed start and end indices must reproduce the
    # reported span text. If this check fails, it is likely a bug in the data
    # preparation code above.
    if expected != answer["span_text"]:
      logging.warning("*** pruned example id: %d ***", json_dict["example_id"])
      logging.warning("*** %s, %s ***", expected, answer["span_text"])
      return {}
  return entry
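
To make the `contexts` / `context_to_plaintext_offset` assembly above concrete, here is a small self-contained sketch with two made-up candidate passages (the passage texts and byte maps are invented for illustration). It shows how each special token contributes a run of -1s and how the `functools.reduce` call inserts one more -1 at every space boundary:

import functools

# Two hypothetical candidate passages and their plaintext byte maps.
candidates = [
    {"id": 0, "text": "ab", "text_map": [10, 11]},  # bytes 10-11 of plaintext
    {"id": 1, "text": "cd", "text_map": [20, 21]},  # bytes 20-21 of plaintext
]

pieces = []
offset_lists = []
for cand in candidates:
    token = "[ContextId={}]".format(cand["id"])
    pieces.append(token)
    # Bytes of the special token have no plaintext counterpart.
    offset_lists.append([-1] * len(token.encode("utf-8")))
    pieces.append(cand["text"])
    offset_lists.append(cand["text_map"])

# One extra -1 per join boundary accounts for the separating space.
context_to_plaintext_offset = functools.reduce(
    lambda a, b: a + [-1] + b, offset_lists)
contexts = " ".join(pieces)

# Every byte of `contexts` now has exactly one entry in the offset list.
assert len(contexts.encode("utf-8")) == len(context_to_plaintext_offset)
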
Example #4
  def test_offset_wp_mapping(self):
    """Test the mapping from wordpiece to plaintext offsets."""
    testdata = os.path.join(
        FLAGS.test_srcdir, ".//"
        "small_gold_annotation.jsonl")
    vocab_file = self._get_vocab_file()
    examples = preproc.read_tydi_examples(
        testdata,
        is_training=False,
        max_passages=45,
        max_position=45,
        fail_on_invalid=False,
        open_fn=tf_io.gopen)
    tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
    for tydi_example in examples:
      wordpieces, start_offsets, end_offsets, offset_to_wp = (
          tokenizer.tokenize_with_offsets(tydi_example.contexts))

      # Check invariants.
      for i in start_offsets:
        if i > 0:
          self.assertLess(
              i, len(tydi_example.context_to_plaintext_offset),
              "Expected start offset {} to be in `context_to_plaintext_offset` "
              "byte_len(contexts)={} Context@{}='{}' Have={}".format(
                  i, data.byte_len(tydi_example.contexts), i,
                  data.byte_slice(
                      tydi_example.contexts, i, i + 100,
                      errors="ignore").encode("utf8"),
                  tydi_example.context_to_plaintext_offset))
      for i in end_offsets:
        if i > 0:
          self.assertLess(
              i, len(tydi_example.context_to_plaintext_offset),
              "Expected end offset {} to be in `context_to_plaintext_offset` "
              "byte_len(contexts)={} Have={}".format(
                  i, data.byte_len(tydi_example.contexts),
                  tydi_example.context_to_plaintext_offset))

      wp_start_offsets, wp_end_offsets = (
          preproc.create_mapping(start_offsets, end_offsets,
                                 tydi_example.context_to_plaintext_offset))
      wp_count = 0
      for wp_s, wp_e in zip(wp_start_offsets, wp_end_offsets):
        if (wp_s >= 0 or wp_e >= 0) and wp_count < 20:
          wp_txt = wordpieces[wp_count]
          if isinstance(wp_txt, str):
            if "##" not in wp_txt and wp_txt != "[UNK]":
              self.assertEqual(tydi_example.plaintext[wp_s:wp_e + 1], wp_txt)
        wp_count += 1

      for offset in offset_to_wp:
        self.assertLess(offset, data.byte_len(tydi_example.contexts))
        self.assertGreaterEqual(offset, 0)
        matching_wp = offset_to_wp[offset]
        if matching_wp == -1:
          continue
        if wp_end_offsets[matching_wp] == -1:
          continue
        if wp_start_offsets[matching_wp] == -1:
          continue
        self.assertGreaterEqual(wp_end_offsets[matching_wp],
                                wp_start_offsets[matching_wp])
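
The test feeds the tokenizer's wordpiece start/end byte offsets (which are relative to `contexts`), together with `context_to_plaintext_offset`, into `preproc.create_mapping`. The sketch below is an assumption about what such a mapping step does, not the project's actual implementation: it translates each context-relative byte offset into a plaintext byte offset, using -1 for offsets that land on special tokens or boundaries, or that fall out of range.

def create_mapping_sketch(start_offsets, end_offsets,
                          context_to_plaintext_offset):
    # Hypothetical stand-in for preproc.create_mapping.
    def to_plaintext(offset):
        if 0 <= offset < len(context_to_plaintext_offset):
            return context_to_plaintext_offset[offset]
        return -1
    wp_start_offsets = [to_plaintext(s) for s in start_offsets]
    wp_end_offsets = [to_plaintext(e) for e in end_offsets]
    return wp_start_offsets, wp_end_offsets
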
def create_entry_from_json(json_obj, max_passages=45, max_position=45):
    entry = {
        'document_title': json_obj['document_title'],
        'id': json_obj['example_id'],
        'language': json_obj['language'],
        'question': json_obj['question_text']
    }

    annotation, candidate_idx, annotated_start_end = data.get_first_annotation(
        json_obj, max_passages)
    answer = {
        'candidate_id': candidate_idx,
        'type': 'passage',
        'span': '',
        'start': -1,
        'end': -1
    }
    # if annotated
    if annotation is not None:
        # if Yes/no answers, added in type.
        if annotation['yes_no_answer'] != 'NONE':
            answer['type'] = annotation['yes_no_answer'].lower()
        # if has minimal answer span
        if annotated_start_end != (-1, -1):
            answer['type'] = 'minimal'
            start = annotated_start_end[0]
            end = annotated_start_end[1]
            text = data.get_candidate_text(json_obj, candidate_idx).text
            answer['span'] = data.byte_slice(text, start, end)
            answer['start'] = start
            answer['end'] = end
        # Otherwise, fall back to the selected passage answer (chained as
        # `elif`, as in Example #3, so a minimal answer span is not overwritten).
        elif annotation['passage_answer']['candidate_index'] >= 0:
            answer['span'] = data.get_candidate_text(json_obj,
                                                     candidate_idx).text
            answer['start'] = 0
            answer['end'] = data.byte_len(answer['span'])
    entry['answer'] = answer

    paragraph_idx = []
    paragraph_context = []

    # add candidate paragraph types and positions
    # ct = 0
    # for _, candidate in data.candidates_iter(json_obj):
    #     if ct < max_position:
    #         ct += 1
    #         candidate["type_and_position"] = "[Paragraph=%d]" % ct
    #     else: break

    for idx, _ in data.candidates_iter(json_obj):
        res = data.get_candidate_text(json_obj, idx)
        context = {
            "id": idx,
            # "type": "[NoLongAnswer]" if idx == -1 else json_obj["passage_answer_candidates"][idx]["type_and_position"],
            "text_range": res[0],
            "text": res[1]
        }
        # Get list of all byte positions of the candidate and its plaintext.
        # Unpack `TextSpan` tuple.
        paragraph_idx.append(idx)
        paragraph_context.append(context)
        if len(paragraph_idx) >= max_passages:
            break
    # entry['has_correct_context'] = candidate_idx in paragraph_idx

    all_contexts_with_tokens = []
    # `offset` is a byte offset relative to `contexts` (concatenated candidate
    # passages with special tokens added).
    offset = 0
    context_to_plaintext_offset = []
    for idx, context in zip(paragraph_idx, paragraph_context):
        special_token = "[ContextId={}]".format(idx)
        all_contexts_with_tokens.append(special_token)

        context_to_plaintext_offset.append([-1] * data.byte_len(special_token))
        # Account for the special token and its trailing space (due to the join
        # operation below)
        offset += data.byte_len(special_token) + 1

        if context["id"] == candidate_idx:
            answer["start"] += offset
            answer["end"] += offset
        if context["text"]:
            all_contexts_with_tokens.append(context["text"])
            # Account for the text and its trailing space (due to the join operation below)
            offset += data.byte_len(context["text"]) + 1
            context_to_plaintext_offset.append(context["text_range"])

    # When we join the contexts together with spaces below, we'll add an extra
    # byte to each one, so we have to account for these by adding a -1 (no
    # assigned wordpiece) index at each *boundary*. It's easier to do this here
    # than above since we don't want to accidentally add extra indices after the
    # last context.
    context_to_plaintext_offset = functools.reduce(
        lambda a, b: a + [-1] + b, context_to_plaintext_offset)

    entry["contexts"] = " ".join(all_contexts_with_tokens)
    entry["context_to_plaintext_offset"] = context_to_plaintext_offset
    entry["plaintext"] = json_obj["document_plaintext"]

    return entry
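
Finally, a minimal usage sketch for the simplified `create_entry_from_json` above, assuming a hypothetical `dev.jsonl` file containing one TyDi record per line:

import json

with open("dev.jsonl", "r", encoding="utf-8") as f:  # hypothetical input path
    for line in f:
        entry = create_entry_from_json(json.loads(line))
        print(entry["id"], entry["answer"]["type"], entry["answer"]["span"][:80])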