Example #1
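The examples in this listing assume the module-level imports of the surrounding file; judging from how the code uses them, these are likely `functools`, absl's `logging`, and the TyDi baseline's `data` helper module (which provides `candidates_iter`, `get_first_annotation`, `get_candidate_text`, `get_text_span`, `byte_slice`, and `byte_len`). A minimal sketch of those assumed imports:

import functools

from absl import logging

import data  # TyDi QA baseline data helpers; assumed to be importable.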
def add_candidate_types_and_positions(json_dict, max_position):
  """Adds type and position info to each candidate in the document."""
  count = 0
  for _, cand in data.candidates_iter(json_dict):
    if count < max_position:
      count += 1
    cand["type_and_position"] = "[Paragraph=%d]" % count
Example #2
def create_entry_from_json(json_dict, max_passages, max_position,
                           fail_on_invalid):
  """Creates an TyDi 'entry' from the raw JSON.

  The 'TyDiEntry' dict is an intermediate format that is later converted into
  the main `TyDiExample` format.

  This function looks up the chunks of text that are candidates for the passage
  answer task, inserts special context tokens such as "[ContextId=0]", and
  creates a byte index to byte index mapping between the document plaintext
  and the concatenation of the passage candidates (these could potentially
  exclude parts of the plaintext document and also include the special tokens).

  In the returned entry, `contexts` includes only the candidate passages and
  has special tokens such as [ContextId=0] added. `span_start` and `span_end`
  are byte-wise indices into `contexts` (not the original corpus plaintext).

  Args:
    json_dict: A single JSONL line, deserialized into a dict.
    max_passages: see FLAGS.max_passages.
    max_position: see FLAGS.max_position.
    fail_on_invalid: If True, raise immediately when an invalid example is
      encountered; otherwise, skip it by returning an empty dict.

  Returns:
    If a failure was encountered and `fail_on_invalid=False`, then returns
    an empty `dict`. Otherwise returns a 'TyDiEntry': a dict-based format
    consumed by downstream functions:
    entry = {
        "name": str,
        "id": str,
        "language": str,
        "question": {"input_text": str},
        "answer": {
          "candidate_id": annotated_idx,
          "span_text": "",
          "span_start": -1,
          "span_end": -1,
          "input_text": "passage",
        },
        "has_correct_context": bool,
        # Includes special tokens appended.
        "contexts": str,
        # Maps each byte in `contexts` to the corresponding byte offset in the
        # document plaintext (-1 for special tokens and joining spaces).
        "context_to_plaintext_offset": List[int],
        "plaintext": json_dict["document_plaintext"],
    }
  """

  add_candidate_types_and_positions(json_dict, max_position)
  for passage_answer in json_dict["passage_answer_candidates"]:
    if (passage_answer["plaintext_start_byte"] == -1 or
        passage_answer["plaintext_end_byte"] == -1):
      return {}

  # annotated_idx: index of the first annotated context, -1 if null.
  # annotated_min_ans: minimal answer start and end byte offsets,
  #                    (-1, -1) if null.
  annotation, annotated_idx, annotated_min_ans = data.get_first_annotation(
      json_dict, max_passages)
  question = {"input_text": json_dict["question_text"]}
  answer = {
      "candidate_id": annotated_idx,
      "span_text": "",
      "span_start": -1,
      "span_end": -1,
      "input_text": "passage",
  }

  # Yes/no answers are added in the input text.
  if annotation is not None:
    assert annotation["yes_no_answer"] in ("YES", "NO", "NONE")
    if annotation["yes_no_answer"] in ("YES", "NO"):
      answer["input_text"] = annotation["yes_no_answer"].lower()

  # Add a minimal answer if one was found.
  if annotated_min_ans != (-1, -1):
    answer["input_text"] = "minimal"
    span_text = data.get_candidate_text(json_dict, annotated_idx).text

    try:
      answer["span_text"] = data.byte_slice(span_text, annotated_min_ans[0],
                                            annotated_min_ans[1])
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    # local (passage) byte offset
    answer["span_start"] = annotated_min_ans[0]
    answer["span_end"] = annotated_min_ans[1]
    try:
      expected_answer_text = data.get_text_span(
          json_dict, {
              "plaintext_start_byte":
                  annotation["minimal_answer"]["plaintext_start_byte"],
              "plaintext_end_byte":
                  annotation["minimal_answer"]["plaintext_end_byte"],
          }).text
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    if expected_answer_text != answer["span_text"]:
      error_message = ("Extracted answer did not match expected answer:"
                       "'{}' vs '{}'".format(expected_answer_text,
                                             answer["span_text"]))
      if fail_on_invalid:
        raise ValueError(error_message)
      else:
        logging.warning(error_message)
        return {}

  # Add a passage answer if one was found
  elif annotation and annotation["passage_answer"]["candidate_index"] >= 0:
    answer["input_text"] = "passage"
    answer["span_text"] = data.get_candidate_text(json_dict, annotated_idx).text
    answer["span_start"] = 0
    answer["span_end"] = data.byte_len(answer["span_text"])

  context_idxs = []
  context_list = []
  for idx, _ in data.candidates_iter(json_dict):
    context = {
        "id": idx,
        "type": get_candidate_type_and_position(json_dict, idx)
    }
    # Get list of all byte positions of the candidate and its plaintext.
    # Unpack `TextSpan` tuple.
    context["text_map"], context["text"] = data.get_candidate_text(
        json_dict, idx)
    if not context["text"]:
      logging.error("ERROR: Found example with empty context %d.", idx)
      if fail_on_invalid:
        raise ValueError(
            "ERROR: Found example with empty context {}.".format(idx))
      return {}
    context_idxs.append(idx)
    context_list.append(context)
    if len(context_list) >= max_passages:
      break

  # Assemble the entry to be returned.
  entry = {
      "name": json_dict["document_title"],
      "id": str(json_dict["example_id"]),
      "language": json_dict["language"],
      "question": question,
      "answer": answer,
      "has_correct_context": annotated_idx in context_idxs
  }
  all_contexts_with_tokens = []
  # `offset` is a byte offset relative to `contexts` (concatenated candidate
  # passages with special tokens added).
  offset = 0
  context_to_plaintext_offset = []
  for idx, context in zip(context_idxs, context_list):
    special_token = "[ContextId={}]".format(context["id"])
    all_contexts_with_tokens.append(special_token)
    context_to_plaintext_offset.append([-1] * data.byte_len(special_token))
    # Account for the special token and its trailing space (due to the join
    # operation below)
    offset += data.byte_len(special_token) + 1

    if context["id"] == annotated_idx:
      answer["span_start"] += offset
      answer["span_end"] += offset
    if context["text"]:
      all_contexts_with_tokens.append(context["text"])
      # Account for the text and its trailing space (due to the join
      # operation below)
      offset += data.byte_len(context["text"]) + 1
      context_to_plaintext_offset.append(context["text_map"])
    else:
      if fail_on_invalid:
        raise ValueError("Found example with empty context.")

  # When we join the contexts together with spaces below, we'll add an extra
  # byte to each one, so we have to account for these by adding a -1 (no
  # corresponding plaintext byte) index at each *boundary*. It's easier to do
  # this here than above since we don't want to accidentally add extra indices
  # after the last context.
  context_to_plaintext_offset = functools.reduce(lambda a, b: a + [-1] + b,
                                                 context_to_plaintext_offset)
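  # Illustrative example of the reduce (offsets are made up): reducing the
  # per-context maps [[5, 6], [20, 21]] yields [5, 6, -1, 20, 21]; each -1
  # marks the joining space between contexts, which maps to no plaintext byte.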

  entry["contexts"] = " ".join(all_contexts_with_tokens)
  entry["context_to_plaintext_offset"] = context_to_plaintext_offset
  entry["plaintext"] = json_dict["document_plaintext"]

  if annotated_idx in context_idxs:
    try:
      expected = data.byte_slice(entry["contexts"], answer["span_start"],
                                 answer["span_end"])
    except UnicodeDecodeError:
      logging.error("UnicodeDecodeError for example: %s",
                    json_dict["example_id"])
      if fail_on_invalid:
        raise
      return {}
    # This is a sanity check to ensure that the calculated start and end
    # indices match the reported span text. If this check fails, it is likely
    # a bug in the data preparation code above.
    if expected != answer["span_text"]:
      logging.warn("*** pruned example id: %d ***", json_dict["example_id"])
      logging.warn("*** %s, %s ***", expected, answer["span_text"])
      return {}
  return entry
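
A hedged end-to-end usage sketch for the create_entry_from_json defined directly above; the file name is hypothetical, and the values 45 follow the defaults used by the simplified variant below:

import json

# Illustrative only: build one entry per valid line of a TyDi JSONL file.
with open("tydiqa-dev.jsonl") as f:  # hypothetical path
  for line in f:
    json_dict = json.loads(line)
    entry = create_entry_from_json(
        json_dict, max_passages=45, max_position=45, fail_on_invalid=False)
    if entry:  # An empty dict means the example was skipped as invalid.
      print(entry["id"], entry["answer"]["input_text"])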
def create_entry_from_json(json_obj, max_passages=45, max_position=45):
    """Creates a simplified TyDi 'entry' dict from one deserialized JSONL line.

    `max_position` is currently unused because the paragraph type/position
    tagging below is left commented out.
    """
    entry = {
        'document_title': json_obj['document_title'],
        'id': json_obj['example_id'],
        'language': json_obj['language'],
        'question': json_obj['question_text']
    }

    annotation, candidate_idx, annotated_start_end = data.get_first_annotation(
        json_obj, max_passages)
    answer = {
        'candidate_id': candidate_idx,
        'type': 'passage',
        'span': '',
        'start': -1,
        'end': -1
    }
    # if annotated
    if annotation is not None:
        # Yes/no answers are recorded in the answer type.
        if annotation['yes_no_answer'] != 'NONE':
            answer['type'] = annotation['yes_no_answer'].lower()
        # if has minimal answer span
        if annotated_start_end != (-1, -1):
            answer['type'] = 'minimal'
            start = annotated_start_end[0]
            end = annotated_start_end[1]
            text = data.get_candidate_text(json_obj, candidate_idx).text
            answer['span'] = data.byte_slice(text, start, end)
            answer['start'] = start
            answer['end'] = end
        # Otherwise, fall back to the passage answer if one was selected.
        elif annotation['passage_answer']['candidate_index'] >= 0:
            answer['span'] = data.get_candidate_text(json_obj,
                                                     candidate_idx).text
            answer['start'] = 0
            answer['end'] = data.byte_len(answer['span'])
    entry['answer'] = answer

    paragraph_idx = []
    paragraph_context = []

    # add candidate paragraph types and positions
    # ct = 0
    # for _, candidate in data.candidates_iter(json_obj):
    #     if ct < max_position:
    #         ct += 1
    #         candidate["type_and_position"] = "[Paragraph=%d]" % ct
    #     else: break

    for idx, _ in data.candidates_iter(json_obj):
        # Get the list of all byte positions of the candidate and its
        # plaintext, then unpack the `TextSpan` tuple.
        res = data.get_candidate_text(json_obj, idx)
        context = {
            "id": idx,
            # "type": "[NoLongAnswer]" if idx == -1 else json_obj["passage_answer_candidates"][idx]["type_and_position"],
            "text_range": res[0],
            "text": res[1]
        }
        paragraph_idx.append(idx)
        paragraph_context.append(context)
        if len(paragraph_idx) >= max_passages:
            break
    # entry['has_correct_context'] = candidate_idx in paragraph_idx

    all_contexts_with_tokens = []
    # `offset` is a byte offset relative to `contexts` (the concatenated
    # candidate passages with special tokens added).
    offset = 0
    context_to_plaintext_offset = []
    for idx, context in zip(paragraph_idx, paragraph_context):
        special_token = "[ContextId={}]".format(idx)
        all_contexts_with_tokens.append(special_token)

        context_to_plaintext_offset.append([-1] * data.byte_len(special_token))
        # Account for the special token and its trailing space (due to the join
        # operation below)
        offset += data.byte_len(special_token) + 1

        if context["id"] == candidate_idx:
            answer["start"] += offset
            answer["end"] += offset
        if context["text"]:
            all_contexts_with_tokens.append(context["text"])
            # Account for the text and its trailing space (due to the join operation below)
            offset += data.byte_len(context["text"]) + 1
            context_to_plaintext_offset.append(context["text_range"])

    # When we join the contexts together with spaces below, we'll add an extra
    # byte to each one, so we have to account for these by adding a -1 (no
    # corresponding plaintext byte) index at each *boundary*. It's easier to do
    # this here than above since we don't want to accidentally add extra
    # indices after the last context.
    context_to_plaintext_offset = functools.reduce(
        lambda a, b: a + [-1] + b, context_to_plaintext_offset)

    entry["contexts"] = " ".join(all_contexts_with_tokens)
    entry["context_to_plaintext_offset"] = context_to_plaintext_offset
    entry["plaintext"] = json_obj["document_plaintext"]

    return entry
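
A hedged sanity check for the simplified variant above, mirroring the verification the first create_entry_from_json performs before returning; `json_obj` is assumed to be one deserialized JSONL line, and the check can legitimately fail for pruned or truncated examples:

entry = create_entry_from_json(json_obj, max_passages=45, max_position=45)
if entry["answer"]["start"] != -1:
    # Recover the span from the concatenated `contexts` and compare it with
    # the span text recorded in the answer dict.
    recovered = data.byte_slice(entry["contexts"], entry["answer"]["start"],
                                entry["answer"]["end"])
    if recovered != entry["answer"]["span"]:
        logging.warning("Span mismatch for example %s", entry["id"])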